// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package record

import (
	"context"
	"encoding/binary"
	"io"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/crc"
	"github.com/prometheus/client_golang/prometheus"
)

var walSyncLabels = pprof.Labels("pebble", "wal-sync")
var errClosedWriter = errors.New("pebble/record: closed LogWriter")

type block struct {
	// buf[:written] has already been filled with fragments. Updated atomically.
	written atomic.Int32
	// buf[:flushed] has already been flushed to w.
	flushed int32
	buf     [blockSize]byte
}

type flusher interface {
	Flush() error
}

type syncer interface {
	Sync() error
}

const (
	syncConcurrencyBits = 12

	// SyncConcurrency is the maximum number of concurrent sync operations that
	// can be performed. Note that a sync operation is initiated either by a call
	// to SyncRecord or by a call to Close. Exported as this value also limits
	// the commit concurrency in commitPipeline.
	SyncConcurrency = 1 << syncConcurrencyBits
)

type syncSlot struct {
	wg  *sync.WaitGroup
	err *error
}

// syncQueue is a lock-free fixed-size single-producer, single-consumer
// queue. The single producer can push to the head, and the single consumer can
// pop multiple values from the tail. Popping calls Done() on each of the
// available *sync.WaitGroup elements.
type syncQueue struct {
	// headTail packs together a 32-bit head index and a 32-bit tail index. Both
	// are indexes into slots, masked with len(slots)-1 (i.e. modulo len(slots)).
	//
	// tail = index of oldest data in queue
	// head = index of next slot to fill
	//
	// Slots in the range [tail, head) are owned by consumers. A consumer
	// continues to own a slot outside this range until it nils the slot, at
	// which point ownership passes to the producer.
	//
	// The head index is stored in the most-significant bits so that we can
	// atomically add to it and the overflow is harmless.
	headTail atomic.Uint64

	// slots is a ring buffer of values stored in this queue. The size must be a
	// power of 2. A slot is in use until the tail index has moved beyond it.
	slots [SyncConcurrency]syncSlot

	// blocked is an atomic boolean which indicates whether syncing is currently
	// blocked or can proceed. It is used by the implementation of
	// min-sync-interval to block syncing until the min interval has passed.
	blocked atomic.Bool
}

const dequeueBits = 32

func (q *syncQueue) unpack(ptrs uint64) (head, tail uint32) {
	const mask = 1<<dequeueBits - 1
	head = uint32((ptrs >> dequeueBits) & mask)
	tail = uint32(ptrs & mask)
	return
}

func (q *syncQueue) push(wg *sync.WaitGroup, err *error) {
	ptrs := q.headTail.Load()
	head, tail := q.unpack(ptrs)
	if (tail+uint32(len(q.slots)))&(1<<dequeueBits-1) == head {
		panic("pebble: queue is full")
	}

	slot := &q.slots[head&uint32(len(q.slots)-1)]
	slot.wg = wg
	slot.err = err

	// Increment head. This passes ownership of slot to dequeue and acts as a
	// store barrier for writing the slot.
	q.headTail.Add(1 << dequeueBits)
}

func (q *syncQueue) setBlocked() {
	q.blocked.Store(true)
}

func (q *syncQueue) clearBlocked() {
	q.blocked.Store(false)
}

func (q *syncQueue) empty() bool {
	head, tail, _ := q.load()
	return head == tail
}
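
// For illustration, the packing scheme used by headTail (this simply mirrors
// unpack and push above; the concrete head/tail values are hypothetical):
//
//	ptrs := q.headTail.Load()           // e.g. head=5, tail=3 stored as 5<<32 | 3
//	head := uint32(ptrs >> dequeueBits) // 5: index of the next slot to fill
//	tail := uint32(ptrs)                // 3: index of the oldest entry
//	// head-tail = 2 entries are currently queued; the next push writes
//	// q.slots[head&uint32(len(q.slots)-1)] and then publishes it with
//	// q.headTail.Add(1 << dequeueBits), bumping only the head half.
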

// load returns the head, tail of the queue for what should be synced to the
// caller. It can return a head, tail of zero if syncing is blocked due to
// min-sync-interval. It additionally returns the real length of this queue,
// regardless of whether syncing is blocked.
func (q *syncQueue) load() (head, tail, realLength uint32) {
	ptrs := q.headTail.Load()
	head, tail = q.unpack(ptrs)
	realLength = head - tail
	if q.blocked.Load() {
		return 0, 0, realLength
	}
	return head, tail, realLength
}

// REQUIRES: queueSemChan is non-nil.
func (q *syncQueue) pop(head, tail uint32, err error, queueSemChan chan struct{}) error {
	if tail == head {
		// Queue is empty.
		return nil
	}

	for ; tail != head; tail++ {
		slot := &q.slots[tail&uint32(len(q.slots)-1)]
		wg := slot.wg
		if wg == nil {
			return errors.Errorf("nil waiter at %d", errors.Safe(tail&uint32(len(q.slots)-1)))
		}
		*slot.err = err
		slot.wg = nil
		slot.err = nil
		// We need to bump the tail count before signalling the wait group as
		// signalling the wait group can trigger the release of a blocked goroutine
		// which will try to enqueue before we've "freed" space in the queue.
		q.headTail.Add(1)
		wg.Done()
		// queueSemChan is always non-nil in production.
		if queueSemChan != nil {
			<-queueSemChan
		}
	}

	return nil
}

// flusherCond is a specialized condition variable that allows its condition to
// change and readiness to be signalled without holding its associated mutex.
// In particular, when a waiter is added to syncQueue atomically, this condition
// variable can be signalled without holding flusher.Mutex.
type flusherCond struct {
	mu   *sync.Mutex
	q    *syncQueue
	cond sync.Cond
}

func (c *flusherCond) init(mu *sync.Mutex, q *syncQueue) {
	c.mu = mu
	c.q = q
	// Yes, this is a bit circular, but that is intentional. flusherCond.cond.L
	// points to flusherCond so that when cond.L.Unlock is called,
	// flusherCond.Unlock will be called and we can check the !syncQueue.empty()
	// condition.
	c.cond.L = c
}

func (c *flusherCond) Signal() {
	// Pass-through to the cond var.
	c.cond.Signal()
}

func (c *flusherCond) Wait() {
	// Pass-through to the cond var. Note that internally the cond var implements
	// Wait as:
	//
	//   t := notifyListAdd()
	//   L.Unlock()
	//   notifyListWait(t)
	//   L.Lock()
	//
	// We've configured the cond var to call flusherCond.Unlock() which allows
	// us to check the !syncQueue.empty() condition without a danger of missing a
	// notification. Any call to flusherCond.Signal() after notifyListAdd() is
	// called will cause the subsequent notifyListWait() to return immediately.
	c.cond.Wait()
}

func (c *flusherCond) Lock() {
	c.mu.Lock()
}

func (c *flusherCond) Unlock() {
	c.mu.Unlock()

	if !c.q.empty() {
		// If the current goroutine is about to block on sync.Cond.Wait, this call
		// to Signal will prevent that. The comment in Wait above explains a bit
		// about what is going on here, but it is worth reiterating:
		//
		//   flusherCond.Wait()
		//     sync.Cond.Wait()
		//       t := notifyListAdd()
		//       flusherCond.Unlock()    <-- we are here
		//       notifyListWait(t)
		//       flusherCond.Lock()
		//
		// The call to Signal here results in:
		//
		//     sync.Cond.Signal()
		//       notifyListNotifyOne()
		//
		// The call to notifyListNotifyOne() will prevent the call to
		// notifyListWait(t) from blocking.
		c.cond.Signal()
	}
}

type durationFunc func() time.Duration

// syncTimer is an interface for timers, modeled on the closure callback mode
// of time.Timer. See time.AfterFunc and LogWriter.afterFunc. syncTimer is used
// by tests to mock out the timer functionality used to implement
// min-sync-interval.
type syncTimer interface {
	Reset(time.Duration) bool
	Stop() bool
}
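
// As a hedged illustration of how the afterFunc/syncTimer seam can be used
// from this package's tests (the names below are hypothetical, not part of
// the package API), a manual timer can be injected so a test controls exactly
// when min-sync-interval "expires":
//
//	type manualTimer struct{ f func() }
//
//	func (t *manualTimer) Reset(time.Duration) bool { return true }
//	func (t *manualTimer) Stop() bool               { return true }
//	func (t *manualTimer) fire()                    { t.f() }
//
//	// Installed in a test after constructing the LogWriter:
//	//   w.afterFunc = func(d time.Duration, f func()) syncTimer {
//	//       return &manualTimer{f: f}
//	//   }
//
// Calling fire() then stands in for the real timer clearing syncQ.blocked.
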

// LogWriter writes records to an underlying io.Writer. In order to support WAL
// file reuse, a LogWriter's records are tagged with the WAL's file
// number. When reading a log file, a record from a previous incarnation of the
// file will return the error ErrInvalidLogNum.
type LogWriter struct {
	// w is the underlying writer.
	w io.Writer
	// c is w as a closer.
	c io.Closer
	// s is w as a syncer.
	s syncer
	// logNum is the low 32-bits of the log's file number.
	logNum uint32
	// blockNum is the zero based block number for the current block.
	blockNum int64
	// err is any accumulated error. TODO(peter): This needs to be protected in
	// some fashion. Perhaps using atomic.Value.
	err error
	// block is the current block being written. Protected by flusher.Mutex.
	block *block
	free  struct {
		sync.Mutex
		blocks []*block
	}

	flusher struct {
		sync.Mutex
		// ready is a condition variable that is signalled when there are blocks
		// to flush, syncing has been requested, or the LogWriter has been closed.
		// For signalling of a sync, it is safe to call without holding
		// flusher.Mutex.
		ready flusherCond
		// Set to true when the flush loop should be closed.
		close bool
		// Closed when the flush loop has terminated.
		closed chan struct{}
		// Accumulated flush error.
		err error
		// minSyncInterval is the minimum duration between syncs.
		minSyncInterval durationFunc
		fsyncLatency    prometheus.Histogram
		pending         []*block
		syncQ           syncQueue
		metrics         *LogWriterMetrics
	}

	// afterFunc is a hook to allow tests to mock out the timer functionality
	// used for min-sync-interval. In normal operation this points to
	// time.AfterFunc.
	afterFunc func(d time.Duration, f func()) syncTimer

	// See the comment for LogWriterConfig.QueueSemChan.
	queueSemChan chan struct{}
}

// LogWriterConfig is a struct used for configuring new LogWriters.
type LogWriterConfig struct {
	WALMinSyncInterval durationFunc
	WALFsyncLatency    prometheus.Histogram
	// QueueSemChan is an optional channel to pop from when popping from
	// LogWriter.flusher.syncQueue. It functions as a semaphore that prevents
	// the syncQueue from overflowing (which will cause a panic). All production
	// code ensures this is non-nil.
	QueueSemChan chan struct{}
}

// initialAllocatedBlocksCap is the initial capacity of the various slices
// intended to hold LogWriter blocks. The LogWriter may allocate more blocks
// than this threshold allows.
const initialAllocatedBlocksCap = 32

// blockPool pools *blocks to avoid allocations. Blocks are only added to the
// Pool when a LogWriter is closed. Before that, free blocks are maintained
// within a LogWriter's own internal free list `w.free.blocks`.
var blockPool = sync.Pool{
	New: func() any { return &block{} },
}
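
// A minimal usage sketch (not taken from production code; the file, histogram
// and file-number values are placeholders). The QueueSemChan capacity must not
// exceed SyncConcurrency, and the caller is expected to send on the channel
// before each SyncRecord call that passes a WaitGroup, mirroring how the
// commit pipeline keeps the syncQueue from overflowing; the LogWriter receives
// from the channel once the corresponding sync has completed:
//
//	f, _ := os.Create("000001.log") // *os.File implements io.Writer, io.Closer and Sync
//	sem := make(chan struct{}, SyncConcurrency)
//	w := NewLogWriter(f, diskFileNum /* a base.DiskFileNum obtained elsewhere */, LogWriterConfig{
//		WALFsyncLatency: fsyncHistogram, // e.g. a prometheus.Histogram
//		QueueSemChan:    sem,
//	})
//
//	var wg sync.WaitGroup
//	var syncErr error
//	wg.Add(1)
//	sem <- struct{}{} // reserve a syncQueue slot
//	_, _ = w.SyncRecord([]byte("payload"), &wg, &syncErr)
//	wg.Wait() // returns once the record is flushed and synced; syncErr holds any sync error
//	_ = w.Close()
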

// NewLogWriter returns a new LogWriter.
func NewLogWriter(
	w io.Writer, logNum base.DiskFileNum, logWriterConfig LogWriterConfig,
) *LogWriter {
	c, _ := w.(io.Closer)
	s, _ := w.(syncer)
	r := &LogWriter{
		w: w,
		c: c,
		s: s,
		// NB: we truncate the 64-bit log number to 32-bits. This is ok because a)
		// we are very unlikely to reach a file number of 4 billion and b) the log
		// number is used as a validation check and using only the low 32-bits is
		// sufficient for that purpose.
		logNum: uint32(logNum),
		afterFunc: func(d time.Duration, f func()) syncTimer {
			return time.AfterFunc(d, f)
		},
		queueSemChan: logWriterConfig.QueueSemChan,
	}
	r.free.blocks = make([]*block, 0, initialAllocatedBlocksCap)
	r.block = blockPool.Get().(*block)
	r.flusher.ready.init(&r.flusher.Mutex, &r.flusher.syncQ)
	r.flusher.closed = make(chan struct{})
	r.flusher.pending = make([]*block, 0, cap(r.free.blocks))
	r.flusher.metrics = &LogWriterMetrics{}

	f := &r.flusher
	f.minSyncInterval = logWriterConfig.WALMinSyncInterval
	f.fsyncLatency = logWriterConfig.WALFsyncLatency

	go func() {
		pprof.Do(context.Background(), walSyncLabels, r.flushLoop)
	}()
	return r
}

func (w *LogWriter) flushLoop(context.Context) {
	f := &w.flusher
	f.Lock()

	// Initialize idleStartTime to when the loop starts.
	idleStartTime := time.Now()
	var syncTimer syncTimer
	defer func() {
		// Capture the idle duration between the last piece of work and when the
		// loop terminated.
		f.metrics.WriteThroughput.IdleDuration += time.Since(idleStartTime)
		if syncTimer != nil {
			syncTimer.Stop()
		}
		close(f.closed)
		f.Unlock()
	}()

	// The flush loop performs flushing of full and partial data blocks to the
	// underlying writer (LogWriter.w), syncing of the writer, and notification
	// to sync requests that they have completed.
	//
	// - flusher.ready is a condition variable that is signalled when there is
	//   work to do. Full blocks are contained in flusher.pending. The current
	//   partial block is in LogWriter.block. And sync operations are held in
	//   flusher.syncQ.
	//
	// - The decision to sync is determined by whether there are any sync
	//   requests present in flusher.syncQ and whether enough time has elapsed
	//   since the last sync. If not enough time has elapsed since the last sync,
	//   flusher.syncQ.blocked will be set to true. If syncing is blocked,
	//   syncQueue.empty() will return true and syncQueue.load() will return 0,0
	//   (i.e. an empty list).
	//
	// - flusher.syncQ.blocked is cleared by a timer that is initialized when
	//   blocked is set to true. While blocked is true, no syncing will take
	//   place, but flushing will continue to be performed. The on/off toggle for
	//   syncing does not need to be carefully synchronized with the rest of
	//   processing -- all we need to ensure is that after any transition to
	//   blocked=true there is eventually a transition to blocked=false.
	//   syncTimer performs this transition. Note that any change to
	//   min-sync-interval will not take effect until the previous timer elapses.
	//
	// - Picking up the syncing work to perform requires coordination with
	//   picking up the flushing work. Specifically, flushing work is queued
	//   before syncing work. The guarantee of this code is that when a sync is
	//   requested, any previously queued flush work will be synced. This
	//   motivates reading the syncing work (f.syncQ.load()) before picking up
	//   the flush work (w.block.written.Load()).

	// The list of full blocks that need to be written. This is copied from
	// f.pending on every loop iteration, though the number of elements is
	// usually small (most frequently 1). In the case of the WAL LogWriter, the
	// number of blocks is bounded by the size of the WAL's corresponding
	// memtable (MemtableSize/BlockSize). With the default 64 MiB memtables,
	// this works out to at most 2048 elements if the entirety of the memtable's
	// contents are queued.
	pending := make([]*block, 0, cap(f.pending))
	for {
		for {
			// Grab the portion of the current block that requires flushing. Note
			// that the current block can be added to the pending blocks list after
			// we release the flusher lock, but it won't be part of pending.
			written := w.block.written.Load()
			if len(f.pending) > 0 || written > w.block.flushed || !f.syncQ.empty() {
				break
			}
			if f.close {
				// If the writer is closed, pretend the sync timer fired immediately so
				// that we can process any queued sync requests.
				f.syncQ.clearBlocked()
				if !f.syncQ.empty() {
					break
				}
				return
			}
			f.ready.Wait()
			continue
		}
		// Found work to do, so no longer idle.
		workStartTime := time.Now()
		idleDuration := workStartTime.Sub(idleStartTime)
		pending = append(pending[:0], f.pending...)
		f.pending = f.pending[:0]
		f.metrics.PendingBufferLen.AddSample(int64(len(pending)))

		// Grab the list of sync waiters. Note that syncQueue.load() will return
		// 0,0 while we're waiting for the min-sync-interval to expire. This
		// allows flushing to proceed even if we're not ready to sync.
		head, tail, realSyncQLen := f.syncQ.load()
		f.metrics.SyncQueueLen.AddSample(int64(realSyncQLen))

		// Grab the portion of the current block that requires flushing. Note that
		// the current block can be added to the pending blocks list after we
		// release the flusher lock, but it won't be part of pending. This has to
		// be ordered after we get the list of sync waiters from syncQ in order to
		// prevent a race where a waiter adds itself to syncQ, but this thread
		// picks up the entry in syncQ and not the buffered data.
		written := w.block.written.Load()
		data := w.block.buf[w.block.flushed:written]
		w.block.flushed = written

		// If the flusher has an error, we propagate it to waiters. Note that in
		// spite of the error we consume the pending list above to free blocks for
		// writers.
		if f.err != nil {
			f.syncQ.pop(head, tail, f.err, w.queueSemChan)
			// Update the idleStartTime if work could not be done, so that we don't
			// include the duration we tried to do work as idle. We don't bother
			// with the rest of the accounting, which means we will undercount.
			idleStartTime = time.Now()
			continue
		}
		f.Unlock()
		synced, syncLatency, bytesWritten, err := w.flushPending(data, pending, head, tail)
		f.Lock()
		if synced && f.fsyncLatency != nil {
			f.fsyncLatency.Observe(float64(syncLatency))
		}
		f.err = err
		if f.err != nil {
			f.syncQ.clearBlocked()
			// Update the idleStartTime if work could not be done, so that we don't
			// include the duration we tried to do work as idle. We don't bother
			// with the rest of the accounting, which means we will undercount.
			idleStartTime = time.Now()
			continue
		}

		if synced && f.minSyncInterval != nil {
			// A sync was performed. Make sure we've waited for the min sync
			// interval before syncing again.
			if min := f.minSyncInterval(); min > 0 {
				f.syncQ.setBlocked()
				if syncTimer == nil {
					syncTimer = w.afterFunc(min, func() {
						f.syncQ.clearBlocked()
						f.ready.Signal()
					})
				} else {
					syncTimer.Reset(min)
				}
			}
		}
		// Finished work, and started idling.
		idleStartTime = time.Now()
		workDuration := idleStartTime.Sub(workStartTime)
		f.metrics.WriteThroughput.Bytes += bytesWritten
		f.metrics.WriteThroughput.WorkDuration += workDuration
		f.metrics.WriteThroughput.IdleDuration += idleDuration
	}
}
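
// A hedged note on configuring min-sync-interval: WALMinSyncInterval is a
// closure so the value may change dynamically (e.g. sourced from live
// options). The duration below is purely illustrative:
//
//	cfg := LogWriterConfig{
//		WALMinSyncInterval: func() time.Duration { return 100 * time.Microsecond },
//	}
//
// While the interval has not elapsed since the previous sync, flushLoop keeps
// flushing blocks but defers fsyncs until the syncTimer clears syncQ.blocked.
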

func (w *LogWriter) flushPending(
	data []byte, pending []*block, head, tail uint32,
) (synced bool, syncLatency time.Duration, bytesWritten int64, err error) {
	defer func() {
		// Translate panics into errors. The errors will cause flushLoop to shut
		// down, but this allows us to do so in a controlled way and avoids losing
		// the stack that created the panic if panicking itself hits a panic
		// (e.g. unlock of an unlocked mutex).
		if r := recover(); r != nil {
			err = errors.Newf("%v", r)
		}
	}()

	for _, b := range pending {
		bytesWritten += blockSize - int64(b.flushed)
		if err = w.flushBlock(b); err != nil {
			break
		}
	}
	if n := len(data); err == nil && n > 0 {
		bytesWritten += int64(n)
		_, err = w.w.Write(data)
	}

	synced = head != tail
	if synced {
		if err == nil && w.s != nil {
			syncLatency, err = w.syncWithLatency()
		}
		f := &w.flusher
		if popErr := f.syncQ.pop(head, tail, err, w.queueSemChan); popErr != nil {
			return synced, syncLatency, bytesWritten, popErr
		}
	}

	return synced, syncLatency, bytesWritten, err
}

func (w *LogWriter) syncWithLatency() (time.Duration, error) {
	start := time.Now()
	err := w.s.Sync()
	syncLatency := time.Since(start)
	return syncLatency, err
}

func (w *LogWriter) flushBlock(b *block) error {
	if _, err := w.w.Write(b.buf[b.flushed:]); err != nil {
		return err
	}
	b.written.Store(0)
	b.flushed = 0
	w.free.Lock()
	w.free.blocks = append(w.free.blocks, b)
	w.free.Unlock()
	return nil
}

// queueBlock queues the current block for writing to the underlying writer,
// allocates a new block and reserves space for the next header.
func (w *LogWriter) queueBlock() {
	// Allocate a new block, blocking until one is available. We do this first
	// because w.block is protected by w.flusher.Mutex.
	w.free.Lock()
	if len(w.free.blocks) == 0 {
		w.free.blocks = append(w.free.blocks, blockPool.Get().(*block))
	}
	nextBlock := w.free.blocks[len(w.free.blocks)-1]
	w.free.blocks = w.free.blocks[:len(w.free.blocks)-1]
	w.free.Unlock()

	f := &w.flusher
	f.Lock()
	f.pending = append(f.pending, w.block)
	w.block = nextBlock
	f.ready.Signal()
	w.err = w.flusher.err
	f.Unlock()

	w.blockNum++
}

// Close flushes and syncs any unwritten data and closes the writer.
// Where required, external synchronisation is provided by commitPipeline.mu.
func (w *LogWriter) Close() error {
	f := &w.flusher

	// Emit an EOF trailer signifying the end of this log. This helps readers
	// differentiate a corrupted entry in the middle of a log from garbage at the
	// tail of a recycled log file.
	w.emitEOFTrailer()

	// Signal the flush loop to close.
	f.Lock()
	f.close = true
	f.ready.Signal()
	f.Unlock()

	// Wait for the flush loop to close. The flush loop will not close until all
	// pending data has been written or an error occurs.
	<-f.closed

	// Sync any flushed data to disk. NB: flushLoop will sync after flushing the
	// last buffered data only if it was requested via syncQ, so we need to sync
	// here to ensure that all the data is synced.
	err := w.flusher.err
	var syncLatency time.Duration
	if err == nil && w.s != nil {
		syncLatency, err = w.syncWithLatency()
	}
	f.Lock()
	if f.fsyncLatency != nil {
		f.fsyncLatency.Observe(float64(syncLatency))
	}
	free := w.free.blocks
	f.Unlock()

	if w.c != nil {
		cerr := w.c.Close()
		w.c = nil
		if cerr != nil {
			return cerr
		}
	}

	for _, b := range free {
		b.flushed = 0
		b.written.Store(0)
		blockPool.Put(b)
	}

	w.err = errClosedWriter
	return err
}

// WriteRecord writes a complete record. Returns the offset just past the end
// of the record.
// External synchronisation provided by commitPipeline.mu.
func (w *LogWriter) WriteRecord(p []byte) (int64, error) {
	logSize, err := w.SyncRecord(p, nil, nil)
	return logSize, err
}

// SyncRecord writes a complete record. If wg != nil the record will be
// asynchronously persisted to the underlying writer and Done will be called on
// the wait group upon completion. Returns the offset just past the end of the
// record.
// External synchronisation provided by commitPipeline.mu.
func (w *LogWriter) SyncRecord(
	p []byte, wg *sync.WaitGroup, err *error,
) (logSize int64, err2 error) {
	if w.err != nil {
		return -1, w.err
	}

	// The `i == 0` condition ensures we handle empty records. Such records can
	// possibly be generated for VersionEdits stored in the MANIFEST. While the
	// MANIFEST is currently written using Writer, it is good to support the same
	// semantics with LogWriter.
	for i := 0; i == 0 || len(p) > 0; i++ {
		p = w.emitFragment(i, p)
	}
	if wg != nil {
		// If we've been asked to persist the record, add the WaitGroup to the sync
		// queue and signal the flushLoop. Note that flushLoop will write partial
		// blocks to the file if syncing has been requested. The contract is that
		// any record written to the LogWriter up to this point will be flushed to
		// the OS and synced to disk.
		f := &w.flusher
		f.syncQ.push(wg, err)
		f.ready.Signal()
	}

	offset := w.blockNum*blockSize + int64(w.block.written.Load())
	// Note that we don't return w.err here as a concurrent call to Close would
	// race with our read. That's ok because the only error we could be seeing is
	// one from syncing, which the caller can be notified of by passing in a
	// non-nil err argument.
	return offset, nil
}

// Size returns the current size of the file.
// External synchronisation provided by commitPipeline.mu.
func (w *LogWriter) Size() int64 {
	return w.blockNum*blockSize + int64(w.block.written.Load())
}
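
// For reference when reading emitEOFTrailer and emitFragment below, the
// recyclable chunk header they write has the following layout (this is simply
// a restatement of the PutUint32/PutUint16 calls in those functions; offsets
// are relative to the start of the chunk):
//
//	+----------+-----------+-----------+-----------------+----------------+
//	| CRC (4B) | size (2B) | type (1B) | log number (4B) | payload (size) |
//	+----------+-----------+-----------+-----------------+----------------+
//	  0..3       4..5        6           7..10             11..
//
// The CRC covers the type byte, the log number and the payload
// (b.buf[i+6:j] in emitFragment).
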

func (w *LogWriter) emitEOFTrailer() {
	// Write a recyclable chunk header with a different log number. Readers
	// will treat the header as EOF when the log number does not match.
	b := w.block
	i := b.written.Load()
	binary.LittleEndian.PutUint32(b.buf[i+0:i+4], 0) // CRC
	binary.LittleEndian.PutUint16(b.buf[i+4:i+6], 0) // Size
	b.buf[i+6] = recyclableFullChunkType
	binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum+1) // Log number
	b.written.Store(i + int32(recyclableHeaderSize))
}

func (w *LogWriter) emitFragment(n int, p []byte) (remainingP []byte) {
	b := w.block
	i := b.written.Load()
	first := n == 0
	last := blockSize-i-recyclableHeaderSize >= int32(len(p))

	if last {
		if first {
			b.buf[i+6] = recyclableFullChunkType
		} else {
			b.buf[i+6] = recyclableLastChunkType
		}
	} else {
		if first {
			b.buf[i+6] = recyclableFirstChunkType
		} else {
			b.buf[i+6] = recyclableMiddleChunkType
		}
	}

	binary.LittleEndian.PutUint32(b.buf[i+7:i+11], w.logNum)

	r := copy(b.buf[i+recyclableHeaderSize:], p)
	j := i + int32(recyclableHeaderSize+r)
	binary.LittleEndian.PutUint32(b.buf[i+0:i+4], crc.New(b.buf[i+6:j]).Value())
	binary.LittleEndian.PutUint16(b.buf[i+4:i+6], uint16(r))
	b.written.Store(j)

	if blockSize-b.written.Load() < recyclableHeaderSize {
		// There is no room for another fragment in the block, so fill the
		// remaining bytes with zeros and queue the block for flushing.
		for i := b.written.Load(); i < blockSize; i++ {
			b.buf[i] = 0
		}
		w.queueBlock()
	}
	return p[r:]
}
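
// A worked example of fragmentation (assuming the 32 KiB block size and the
// 11-byte recyclable header used by this package): a 40,000-byte record
// emitted at the start of an empty block is split as
//
//	fragment 0: recyclableFirstChunkType, payload 32768-11    = 32,757 bytes
//	fragment 1: recyclableLastChunkType,  payload 40000-32757 =  7,243 bytes
//
// with the second fragment written at the start of the next block after the
// first block is queued for flushing.
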

// Metrics must be called after Close. The LogWriter will no longer modify the
// returned LogWriterMetrics.
func (w *LogWriter) Metrics() *LogWriterMetrics {
	return w.flusher.metrics
}

// LogWriterMetrics contains misc metrics for the log writer.
type LogWriterMetrics struct {
	WriteThroughput  base.ThroughputMetric
	PendingBufferLen base.GaugeSampleMetric
	SyncQueueLen     base.GaugeSampleMetric
}

// Merge merges metrics from x. Requires that x is non-nil.
func (m *LogWriterMetrics) Merge(x *LogWriterMetrics) error {
	m.WriteThroughput.Merge(x.WriteThroughput)
	m.PendingBufferLen.Merge(x.PendingBufferLen)
	m.SyncQueueLen.Merge(x.SyncQueueLen)
	return nil
}