From a6f771e49c14130d3d9710390fabe319741ff19e Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Fri, 15 Sep 2023 16:56:46 -0400 Subject: [PATCH] Add metrics for rocksdb query latency (#1709) (#1722) * Add metrics for rocksdb query latency * Add metrics for rocksdb write stalling * Add metrics for rocksdb write stall histogram * Fix tests (cherry picked from commit 75c86a772b7a28f9dd44a911adacef93355aa46b) Co-authored-by: Evgeniy Scherbina --- cmd/kava/opendb/metrics.go | 154 +++++++++++++++++++++++++++ cmd/kava/opendb/stats_loader.go | 4 +- cmd/kava/opendb/stats_loader_test.go | 1 + 3 files changed, 158 insertions(+), 1 deletion(-) diff --git a/cmd/kava/opendb/metrics.go b/cmd/kava/opendb/metrics.go index a326fceb..91e05d05 100644 --- a/cmd/kava/opendb/metrics.go +++ b/cmd/kava/opendb/metrics.go @@ -35,6 +35,29 @@ type Metrics struct { BlockCacheHit metrics.Gauge BlockCacheAdd metrics.Gauge BlockCacheAddFailures metrics.Gauge + + // Latency + DBGetMicrosP50 metrics.Gauge + DBGetMicrosP95 metrics.Gauge + DBGetMicrosP99 metrics.Gauge + DBGetMicrosP100 metrics.Gauge + DBGetMicrosCount metrics.Gauge + + DBWriteMicrosP50 metrics.Gauge + DBWriteMicrosP95 metrics.Gauge + DBWriteMicrosP99 metrics.Gauge + DBWriteMicrosP100 metrics.Gauge + DBWriteMicrosCount metrics.Gauge + + // Write Stall + StallMicros metrics.Gauge + + DBWriteStallP50 metrics.Gauge + DBWriteStallP95 metrics.Gauge + DBWriteStallP99 metrics.Gauge + DBWriteStallP100 metrics.Gauge + DBWriteStallCount metrics.Gauge + DBWriteStallSum metrics.Gauge } // registerMetrics registers metrics in prometheus and initializes rocksdbMetrics variable @@ -137,6 +160,114 @@ func registerMetrics() { Name: "block_cache_add_failures", Help: "number of failures when adding blocks to block cache", }, labels), + + // Latency + DBGetMicrosP50: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_get_micros_p50", + Help: "", + }, labels), + DBGetMicrosP95: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_get_micros_p95", + Help: "", + }, labels), + DBGetMicrosP99: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_get_micros_p99", + Help: "", + }, labels), + DBGetMicrosP100: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_get_micros_p100", + Help: "", + }, labels), + DBGetMicrosCount: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_get_micros_count", + Help: "", + }, labels), + + DBWriteMicrosP50: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_write_micros_p50", + Help: "", + }, labels), + DBWriteMicrosP95: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_write_micros_p95", + Help: "", + }, labels), + DBWriteMicrosP99: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_write_micros_p99", + Help: "", + }, labels), + DBWriteMicrosP100: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_write_micros_p100", + Help: "", + }, labels), + DBWriteMicrosCount: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "latency", + Name: "db_write_micros_count", + Help: "", + }, labels), + + // Write Stall + StallMicros: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "stall_micros", + Help: "", + }, labels), + + DBWriteStallP50: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "db_write_stall_p50", + Help: "", + }, labels), + DBWriteStallP95: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "db_write_stall_p95", + Help: "", + }, labels), + DBWriteStallP99: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "db_write_stall_p99", + Help: "", + }, labels), + DBWriteStallP100: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "db_write_stall_p100", + Help: "", + }, labels), + DBWriteStallCount: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "db_write_stall_count", + Help: "", + }, labels), + DBWriteStallSum: prometheus.NewGaugeFrom(stdprometheus.GaugeOpts{ + Namespace: "rocksdb", + Subsystem: "stall", + Name: "db_write_stall_sum", + Help: "", + }, labels), } } @@ -163,4 +294,27 @@ func (m *Metrics) report(props *properties, stats *stats) { m.BlockCacheHit.Set(float64(stats.BlockCacheHit)) m.BlockCacheAdd.Set(float64(stats.BlockCacheAdd)) m.BlockCacheAddFailures.Set(float64(stats.BlockCacheAddFailures)) + + // Latency + m.DBGetMicrosP50.Set(stats.DBGetMicros.P50) + m.DBGetMicrosP95.Set(stats.DBGetMicros.P95) + m.DBGetMicrosP99.Set(stats.DBGetMicros.P99) + m.DBGetMicrosP100.Set(stats.DBGetMicros.P100) + m.DBGetMicrosCount.Set(stats.DBGetMicros.Count) + + m.DBWriteMicrosP50.Set(stats.DBWriteMicros.P50) + m.DBWriteMicrosP95.Set(stats.DBWriteMicros.P95) + m.DBWriteMicrosP99.Set(stats.DBWriteMicros.P99) + m.DBWriteMicrosP100.Set(stats.DBWriteMicros.P100) + m.DBWriteMicrosCount.Set(stats.DBWriteMicros.Count) + + // Write Stall + m.StallMicros.Set(float64(stats.StallMicros)) + + m.DBWriteStallP50.Set(stats.DBWriteStallHistogram.P50) + m.DBWriteStallP95.Set(stats.DBWriteStallHistogram.P95) + m.DBWriteStallP99.Set(stats.DBWriteStallHistogram.P99) + m.DBWriteStallP100.Set(stats.DBWriteStallHistogram.P100) + m.DBWriteStallCount.Set(stats.DBWriteStallHistogram.Count) + m.DBWriteStallSum.Set(stats.DBWriteStallHistogram.Sum) } diff --git a/cmd/kava/opendb/stats_loader.go b/cmd/kava/opendb/stats_loader.go index 93e5684c..b852299d 100644 --- a/cmd/kava/opendb/stats_loader.go +++ b/cmd/kava/opendb/stats_loader.go @@ -114,7 +114,8 @@ type stats struct { BytesRead int64 // Writer has to wait for compaction or flush to finish. - StallMicros int64 + StallMicros int64 + DBWriteStallHistogram *float64Histogram // Last level and non-last level read statistics LastLevelReadBytes int64 @@ -180,6 +181,7 @@ func (l *statLoader) load() (*stats, error) { BytesWritten: l.getInt64StatValue("rocksdb.bytes.written", count), BytesRead: l.getInt64StatValue("rocksdb.bytes.read", count), StallMicros: l.getInt64StatValue("rocksdb.stall.micros", count), + DBWriteStallHistogram: l.getFloat64HistogramStatValue("rocksdb.db.write.stall"), LastLevelReadBytes: l.getInt64StatValue("rocksdb.last.level.read.bytes", count), LastLevelReadCount: l.getInt64StatValue("rocksdb.last.level.read.count", count), NonLastLevelReadBytes: l.getInt64StatValue("rocksdb.non.last.level.read.bytes", count), diff --git a/cmd/kava/opendb/stats_loader_test.go b/cmd/kava/opendb/stats_loader_test.go index a6652dad..5cc1e22d 100644 --- a/cmd/kava/opendb/stats_loader_test.go +++ b/cmd/kava/opendb/stats_loader_test.go @@ -53,6 +53,7 @@ func TestStatsLoader(t *testing.T) { "rocksdb.bytes.written": &defaultStat, "rocksdb.bytes.read": &defaultStat, "rocksdb.stall.micros": &defaultStat, + "rocksdb.db.write.stall": &defaultHistogramStat, "rocksdb.last.level.read.bytes": &defaultStat, "rocksdb.last.level.read.count": &defaultStat, "rocksdb.non.last.level.read.bytes": &defaultStat,