Add metrics for rocksdb query latency (#1709) (#1722)

* Add metrics for rocksdb query latency

* Add metrics for rocksdb write stalling

* Add metrics for rocksdb write stall histogram

* Fix tests

(cherry picked from commit 75c86a772b)

Co-authored-by: Evgeniy Scherbina <evgeniy.shcherbina.es@gmail.com>
This commit is contained in:
mergify[bot] 2023-09-15 16:56:46 -04:00 committed by Robert Pirtle
parent a442d20692
commit a6f771e49c
3 changed files with 158 additions and 1 deletions

View File

@@ -35,6 +35,29 @@ type Metrics struct {
BlockCacheHit metrics.Gauge BlockCacheHit metrics.Gauge
BlockCacheAdd metrics.Gauge BlockCacheAdd metrics.Gauge
BlockCacheAddFailures metrics.Gauge BlockCacheAddFailures metrics.Gauge
// Latency
DBGetMicrosP50 metrics.Gauge
DBGetMicrosP95 metrics.Gauge
DBGetMicrosP99 metrics.Gauge
DBGetMicrosP100 metrics.Gauge
DBGetMicrosCount metrics.Gauge
DBWriteMicrosP50 metrics.Gauge
DBWriteMicrosP95 metrics.Gauge
DBWriteMicrosP99 metrics.Gauge
DBWriteMicrosP100 metrics.Gauge
DBWriteMicrosCount metrics.Gauge
// Write Stall
StallMicros metrics.Gauge
DBWriteStallP50 metrics.Gauge
DBWriteStallP95 metrics.Gauge
DBWriteStallP99 metrics.Gauge
DBWriteStallP100 metrics.Gauge
DBWriteStallCount metrics.Gauge
DBWriteStallSum metrics.Gauge
} }
// registerMetrics registers metrics in prometheus and initializes rocksdbMetrics variable // registerMetrics registers metrics in prometheus and initializes rocksdbMetrics variable
@@ -137,6 +160,114 @@ func registerMetrics() {
Name: "block_cache_add_failures", Name: "block_cache_add_failures",
Help: "number of failures when adding blocks to block cache", Help: "number of failures when adding blocks to block cache",
}, labels), }, labels),
// Latency
DBGetMicrosP50: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_get_micros_p50",
Help: "",
}, labels),
DBGetMicrosP95: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_get_micros_p95",
Help: "",
}, labels),
DBGetMicrosP99: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_get_micros_p99",
Help: "",
}, labels),
DBGetMicrosP100: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_get_micros_p100",
Help: "",
}, labels),
DBGetMicrosCount: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_get_micros_count",
Help: "",
}, labels),
DBWriteMicrosP50: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_write_micros_p50",
Help: "",
}, labels),
DBWriteMicrosP95: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_write_micros_p95",
Help: "",
}, labels),
DBWriteMicrosP99: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_write_micros_p99",
Help: "",
}, labels),
DBWriteMicrosP100: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_write_micros_p100",
Help: "",
}, labels),
DBWriteMicrosCount: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "latency",
Name: "db_write_micros_count",
Help: "",
}, labels),
// Write Stall
StallMicros: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "stall_micros",
Help: "",
}, labels),
DBWriteStallP50: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "db_write_stall_p50",
Help: "",
}, labels),
DBWriteStallP95: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "db_write_stall_p95",
Help: "",
}, labels),
DBWriteStallP99: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "db_write_stall_p99",
Help: "",
}, labels),
DBWriteStallP100: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "db_write_stall_p100",
Help: "",
}, labels),
DBWriteStallCount: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "db_write_stall_count",
Help: "",
}, labels),
DBWriteStallSum: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{
Namespace: "rocksdb",
Subsystem: "stall",
Name: "db_write_stall_sum",
Help: "",
}, labels),
} }
} }
@@ -163,4 +294,27 @@ func (m *Metrics) report(props *properties, stats *stats) {
m.BlockCacheHit.Set(float64(stats.BlockCacheHit)) m.BlockCacheHit.Set(float64(stats.BlockCacheHit))
m.BlockCacheAdd.Set(float64(stats.BlockCacheAdd)) m.BlockCacheAdd.Set(float64(stats.BlockCacheAdd))
m.BlockCacheAddFailures.Set(float64(stats.BlockCacheAddFailures)) m.BlockCacheAddFailures.Set(float64(stats.BlockCacheAddFailures))
// Latency
m.DBGetMicrosP50.Set(stats.DBGetMicros.P50)
m.DBGetMicrosP95.Set(stats.DBGetMicros.P95)
m.DBGetMicrosP99.Set(stats.DBGetMicros.P99)
m.DBGetMicrosP100.Set(stats.DBGetMicros.P100)
m.DBGetMicrosCount.Set(stats.DBGetMicros.Count)
m.DBWriteMicrosP50.Set(stats.DBWriteMicros.P50)
m.DBWriteMicrosP95.Set(stats.DBWriteMicros.P95)
m.DBWriteMicrosP99.Set(stats.DBWriteMicros.P99)
m.DBWriteMicrosP100.Set(stats.DBWriteMicros.P100)
m.DBWriteMicrosCount.Set(stats.DBWriteMicros.Count)
// Write Stall
m.StallMicros.Set(float64(stats.StallMicros))
m.DBWriteStallP50.Set(stats.DBWriteStallHistogram.P50)
m.DBWriteStallP95.Set(stats.DBWriteStallHistogram.P95)
m.DBWriteStallP99.Set(stats.DBWriteStallHistogram.P99)
m.DBWriteStallP100.Set(stats.DBWriteStallHistogram.P100)
m.DBWriteStallCount.Set(stats.DBWriteStallHistogram.Count)
m.DBWriteStallSum.Set(stats.DBWriteStallHistogram.Sum)
} }

View File

@@ -115,6 +115,7 @@ type stats struct {
// Writer has to wait for compaction or flush to finish. // Writer has to wait for compaction or flush to finish.
StallMicros int64 StallMicros int64
DBWriteStallHistogram *float64Histogram
// Last level and non-last level read statistics // Last level and non-last level read statistics
LastLevelReadBytes int64 LastLevelReadBytes int64
@@ -180,6 +181,7 @@ func (l *statLoader) load() (*stats, error) {
BytesWritten: l.getInt64StatValue("rocksdb.bytes.written", count), BytesWritten: l.getInt64StatValue("rocksdb.bytes.written", count),
BytesRead: l.getInt64StatValue("rocksdb.bytes.read", count), BytesRead: l.getInt64StatValue("rocksdb.bytes.read", count),
StallMicros: l.getInt64StatValue("rocksdb.stall.micros", count), StallMicros: l.getInt64StatValue("rocksdb.stall.micros", count),
DBWriteStallHistogram: l.getFloat64HistogramStatValue("rocksdb.db.write.stall"),
LastLevelReadBytes: l.getInt64StatValue("rocksdb.last.level.read.bytes", count), LastLevelReadBytes: l.getInt64StatValue("rocksdb.last.level.read.bytes", count),
LastLevelReadCount: l.getInt64StatValue("rocksdb.last.level.read.count", count), LastLevelReadCount: l.getInt64StatValue("rocksdb.last.level.read.count", count),
NonLastLevelReadBytes: l.getInt64StatValue("rocksdb.non.last.level.read.bytes", count), NonLastLevelReadBytes: l.getInt64StatValue("rocksdb.non.last.level.read.bytes", count),

View File

@@ -53,6 +53,7 @@ func TestStatsLoader(t *testing.T) {
"rocksdb.bytes.written": &defaultStat, "rocksdb.bytes.written": &defaultStat,
"rocksdb.bytes.read": &defaultStat, "rocksdb.bytes.read": &defaultStat,
"rocksdb.stall.micros": &defaultStat, "rocksdb.stall.micros": &defaultStat,
"rocksdb.db.write.stall": &defaultHistogramStat,
"rocksdb.last.level.read.bytes": &defaultStat, "rocksdb.last.level.read.bytes": &defaultStat,
"rocksdb.last.level.read.count": &defaultStat, "rocksdb.last.level.read.count": &defaultStat,
"rocksdb.non.last.level.read.bytes": &defaultStat, "rocksdb.non.last.level.read.bytes": &defaultStat,