mirror of
https://github.com/0glabs/0g-storage-node.git
synced 2025-01-23 13:36:08 +00:00
Add more metrics for file sync (#164)
Some checks are pending
abi-consistent-check / build-and-compare (push) Waiting to run
code-coverage / unittest-cov (push) Waiting to run
rust / check (push) Waiting to run
rust / test (push) Waiting to run
rust / lints (push) Waiting to run
functional-test / test (push) Waiting to run
Some checks are pending
abi-consistent-check / build-and-compare (push) Waiting to run
code-coverage / unittest-cov (push) Waiting to run
rust / check (push) Waiting to run
rust / test (push) Waiting to run
rust / lints (push) Waiting to run
functional-test / test (push) Waiting to run
* add metrics in log sync package * udpate auto sync metrics * Add metrics for completed file sync * add more metrics for serial file sync * adjust default timeout value for auto sync * fix metrics rpc for Timer type * add metrics for channel * refactor channel metrics * add timeout metrics for segment sync * refactor channel receiver
This commit is contained in:
parent
c1f465e009
commit
82fef674ed
3
Cargo.lock
generated
3
Cargo.lock
generated
@ -844,6 +844,7 @@ dependencies = [
|
||||
name = "channel"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"metrics",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@ -4538,6 +4539,8 @@ dependencies = [
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"jsonrpsee",
|
||||
"lazy_static",
|
||||
"metrics",
|
||||
"serde_json",
|
||||
"shared_types",
|
||||
"storage",
|
||||
|
@ -5,3 +5,4 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1.19.2", features = ["sync", "time"] }
|
||||
metrics = { workspace = true }
|
||||
|
@ -1,7 +1,9 @@
|
||||
use crate::error::Error;
|
||||
use crate::metrics::unbounded_channel;
|
||||
use metrics::{Counter, CounterUsize};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use tokio::sync::{mpsc, oneshot};
|
||||
use tokio::sync::oneshot;
|
||||
use tokio::time::timeout;
|
||||
|
||||
const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3);
|
||||
@ -19,20 +21,30 @@ pub struct Channel<N, Req, Res> {
|
||||
}
|
||||
|
||||
impl<N, Req, Res> Channel<N, Req, Res> {
|
||||
pub fn unbounded() -> (Sender<N, Req, Res>, Receiver<N, Req, Res>) {
|
||||
let (sender, receiver) = mpsc::unbounded_channel();
|
||||
(Sender { chan: sender }, Receiver { chan: receiver })
|
||||
pub fn unbounded(name: &str) -> (Sender<N, Req, Res>, Receiver<N, Req, Res>) {
|
||||
let metrics_group = format!("common_channel_{}", name);
|
||||
let (sender, receiver) = unbounded_channel(metrics_group.as_str());
|
||||
let metrics_timeout = CounterUsize::register_with_group(metrics_group.as_str(), "timeout");
|
||||
(
|
||||
Sender {
|
||||
chan: sender,
|
||||
metrics_timeout,
|
||||
},
|
||||
receiver,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Sender<N, Req, Res> {
|
||||
chan: mpsc::UnboundedSender<Message<N, Req, Res>>,
|
||||
chan: crate::metrics::Sender<Message<N, Req, Res>>,
|
||||
metrics_timeout: Arc<dyn Counter<usize>>,
|
||||
}
|
||||
|
||||
impl<N, Req, Res> Clone for Sender<N, Req, Res> {
|
||||
fn clone(&self) -> Self {
|
||||
Sender {
|
||||
chan: self.chan.clone(),
|
||||
metrics_timeout: self.metrics_timeout.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -53,24 +65,15 @@ impl<N, Req, Res> Sender<N, Req, Res> {
|
||||
|
||||
timeout(DEFAULT_REQUEST_TIMEOUT, receiver)
|
||||
.await
|
||||
.map_err(|_| Error::TimeoutError)?
|
||||
.map_err(|_| {
|
||||
self.metrics_timeout.inc(1);
|
||||
Error::TimeoutError
|
||||
})?
|
||||
.map_err(|e| Error::RecvError(e))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Receiver<N, Req, Res> {
|
||||
chan: mpsc::UnboundedReceiver<Message<N, Req, Res>>,
|
||||
}
|
||||
|
||||
impl<N, Req, Res> Receiver<N, Req, Res> {
|
||||
pub async fn recv(&mut self) -> Option<Message<N, Req, Res>> {
|
||||
self.chan.recv().await
|
||||
}
|
||||
|
||||
pub fn try_recv(&mut self) -> Result<Message<N, Req, Res>, TryRecvError> {
|
||||
self.chan.try_recv()
|
||||
}
|
||||
}
|
||||
pub type Receiver<N, Req, Res> = crate::metrics::Receiver<Message<N, Req, Res>>;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@ -91,7 +94,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn request_response() {
|
||||
let (tx, mut rx) = Channel::<Notification, Request, Response>::unbounded();
|
||||
let (tx, mut rx) = Channel::<Notification, Request, Response>::unbounded("test");
|
||||
|
||||
let task1 = async move {
|
||||
match rx.recv().await.expect("not dropped") {
|
||||
|
@ -1,5 +1,6 @@
|
||||
mod channel;
|
||||
pub mod error;
|
||||
pub mod metrics;
|
||||
pub mod test_util;
|
||||
|
||||
pub use crate::channel::{Channel, Message, Receiver, ResponseSender, Sender};
|
||||
|
112
common/channel/src/metrics.rs
Normal file
112
common/channel/src/metrics.rs
Normal file
@ -0,0 +1,112 @@
|
||||
use std::{fmt::Debug, sync::Arc, time::Instant};
|
||||
|
||||
use metrics::{register_meter_with_group, Counter, CounterUsize, Histogram, Meter, Sample};
|
||||
use tokio::sync::mpsc::{
|
||||
error::{SendError, TryRecvError},
|
||||
unbounded_channel as new_unbounded_channel, UnboundedReceiver, UnboundedSender,
|
||||
};
|
||||
|
||||
pub fn unbounded_channel<T>(metric_name: &str) -> (Sender<T>, Receiver<T>) {
|
||||
let (sender, receiver) = new_unbounded_channel();
|
||||
let metrics_queued = CounterUsize::register_with_group(metric_name, "size");
|
||||
(
|
||||
Sender::new(sender, metric_name, metrics_queued.clone()),
|
||||
Receiver::new(receiver, metric_name, metrics_queued),
|
||||
)
|
||||
}
|
||||
|
||||
pub struct Sender<T> {
|
||||
sender: UnboundedSender<(Instant, T)>,
|
||||
metrics_send_qps: Arc<dyn Meter>,
|
||||
metrics_queued: Arc<dyn Counter<usize>>,
|
||||
}
|
||||
|
||||
impl<T> Clone for Sender<T> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
sender: self.sender.clone(),
|
||||
metrics_send_qps: self.metrics_send_qps.clone(),
|
||||
metrics_queued: self.metrics_queued.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Debug for Sender<T> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self.sender)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Sender<T> {
|
||||
pub(crate) fn new(
|
||||
sender: UnboundedSender<(Instant, T)>,
|
||||
metrics_group: &str,
|
||||
metrics_queued: Arc<dyn Counter<usize>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
sender,
|
||||
metrics_send_qps: register_meter_with_group(metrics_group, "send"),
|
||||
metrics_queued,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn send(&self, value: T) -> Result<(), SendError<T>> {
|
||||
match self.sender.send((Instant::now(), value)) {
|
||||
Ok(()) => {
|
||||
self.metrics_send_qps.mark(1);
|
||||
self.metrics_queued.inc(1);
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => Err(SendError(e.0 .1)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Receiver<T> {
|
||||
receiver: UnboundedReceiver<(Instant, T)>,
|
||||
metrics_recv_qps: Arc<dyn Meter>,
|
||||
metrics_queued: Arc<dyn Counter<usize>>,
|
||||
metrics_queue_latency: Arc<dyn Histogram>,
|
||||
}
|
||||
|
||||
impl<T> Debug for Receiver<T> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self.receiver)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Receiver<T> {
|
||||
pub(crate) fn new(
|
||||
receiver: UnboundedReceiver<(Instant, T)>,
|
||||
metrics_group: &str,
|
||||
metrics_queued: Arc<dyn Counter<usize>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
receiver,
|
||||
metrics_recv_qps: register_meter_with_group(metrics_group, "recv"),
|
||||
metrics_queued,
|
||||
metrics_queue_latency: Sample::ExpDecay(0.015).register_with_group(
|
||||
metrics_group,
|
||||
"latency",
|
||||
1024,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn on_recv(&self, value: (Instant, T)) -> T {
|
||||
self.metrics_recv_qps.mark(1);
|
||||
self.metrics_queued.dec(1);
|
||||
self.metrics_queue_latency.update_since(value.0);
|
||||
value.1
|
||||
}
|
||||
|
||||
pub async fn recv(&mut self) -> Option<T> {
|
||||
let value = self.receiver.recv().await?;
|
||||
Some(self.on_recv(value))
|
||||
}
|
||||
|
||||
pub fn try_recv(&mut self) -> Result<T, TryRecvError> {
|
||||
let value = self.receiver.try_recv()?;
|
||||
Ok(self.on_recv(value))
|
||||
}
|
||||
}
|
@ -21,4 +21,6 @@ storage = { path = "../storage" }
|
||||
contract-interface = { path = "../../common/contract-interface" }
|
||||
futures-core = "0.3.28"
|
||||
futures-util = "0.3.28"
|
||||
thiserror = "1.0.44"
|
||||
thiserror = "1.0.44"
|
||||
lazy_static = "1.4.0"
|
||||
metrics = { workspace = true }
|
||||
|
7
node/log_entry_sync/src/sync_manager/metrics.rs
Normal file
7
node/log_entry_sync/src/sync_manager/metrics.rs
Normal file
@ -0,0 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use metrics::{register_timer, Timer};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
pub static ref STORE_PUT_TX: Arc<dyn Timer> = register_timer("log_entry_sync_store_put_tx");
|
||||
}
|
@ -11,7 +11,7 @@ use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
use storage::log_store::{tx_store::BlockHashAndSubmissionIndex, Store};
|
||||
use task_executor::{ShutdownReason, TaskExecutor};
|
||||
use tokio::sync::broadcast;
|
||||
@ -358,7 +358,11 @@ impl LogSyncManager {
|
||||
}
|
||||
|
||||
async fn put_tx_inner(&mut self, tx: Transaction) -> bool {
|
||||
if let Err(e) = self.store.put_tx(tx.clone()) {
|
||||
let start_time = Instant::now();
|
||||
let result = self.store.put_tx(tx.clone());
|
||||
metrics::STORE_PUT_TX.update_since(start_time);
|
||||
|
||||
if let Err(e) = result {
|
||||
error!("put_tx error: e={:?}", e);
|
||||
false
|
||||
} else {
|
||||
@ -458,3 +462,4 @@ pub(crate) mod config;
|
||||
mod data_cache;
|
||||
mod log_entry_fetcher;
|
||||
mod log_query;
|
||||
mod metrics;
|
||||
|
@ -815,7 +815,7 @@ mod tests {
|
||||
let runtime = TestRuntime::default();
|
||||
let (network_globals, keypair) = Context::new_network_globals();
|
||||
let (network_send, network_recv) = mpsc::unbounded_channel();
|
||||
let (sync_send, sync_recv) = channel::Channel::unbounded();
|
||||
let (sync_send, sync_recv) = channel::Channel::unbounded("test");
|
||||
let (chunk_pool_send, _chunk_pool_recv) = mpsc::unbounded_channel();
|
||||
let store = LogManager::memorydb(LogConfig::default()).unwrap();
|
||||
Self {
|
||||
|
@ -4,7 +4,7 @@ use crate::{error, Context};
|
||||
use futures::prelude::*;
|
||||
use jsonrpsee::core::async_trait;
|
||||
use jsonrpsee::core::RpcResult;
|
||||
use metrics::DEFAULT_REGISTRY;
|
||||
use metrics::{DEFAULT_GROUPING_REGISTRY, DEFAULT_REGISTRY};
|
||||
use network::{multiaddr::Protocol, Multiaddr};
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::net::IpAddr;
|
||||
@ -266,6 +266,21 @@ impl RpcServer for RpcServerImpl {
|
||||
}
|
||||
}
|
||||
|
||||
for (group_name, metrics) in DEFAULT_GROUPING_REGISTRY.read().get_all() {
|
||||
for (metric_name, metric) in metrics.iter() {
|
||||
let name = format!("{}.{}", group_name, metric_name);
|
||||
match &maybe_prefix {
|
||||
Some(prefix) if !name.starts_with(prefix) => {}
|
||||
_ => {
|
||||
result.insert(
|
||||
name,
|
||||
format!("{} {}", metric.get_type(), metric.get_value()),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use crate::{controllers::SyncState, Config, SyncRequest, SyncResponse, SyncSender};
|
||||
use crate::{controllers::SyncState, SyncRequest, SyncResponse, SyncSender};
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{collections::HashSet, fmt::Debug, sync::Arc};
|
||||
use std::{collections::HashSet, fmt::Debug, sync::Arc, time::Duration};
|
||||
use storage_async::Store;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
@ -15,18 +15,23 @@ pub enum SyncResult {
|
||||
/// Supports to sync files concurrently.
|
||||
#[derive(Clone)]
|
||||
pub struct Batcher {
|
||||
pub(crate) config: Config,
|
||||
capacity: usize,
|
||||
find_peer_timeout: Duration,
|
||||
tasks: Arc<RwLock<HashSet<u64>>>, // files to sync
|
||||
store: Store,
|
||||
sync_send: SyncSender,
|
||||
}
|
||||
|
||||
impl Batcher {
|
||||
pub fn new(config: Config, capacity: usize, store: Store, sync_send: SyncSender) -> Self {
|
||||
pub fn new(
|
||||
capacity: usize,
|
||||
find_peer_timeout: Duration,
|
||||
store: Store,
|
||||
sync_send: SyncSender,
|
||||
) -> Self {
|
||||
Self {
|
||||
config,
|
||||
capacity,
|
||||
find_peer_timeout,
|
||||
tasks: Default::default(),
|
||||
store,
|
||||
sync_send,
|
||||
@ -128,7 +133,7 @@ impl Batcher {
|
||||
|
||||
// finding peers timeout
|
||||
Some(SyncState::FindingPeers { origin, .. })
|
||||
if origin.elapsed() > self.config.find_peer_timeout =>
|
||||
if origin.elapsed() > self.find_peer_timeout =>
|
||||
{
|
||||
debug!(%tx_seq, "Terminate file sync due to finding peers timeout");
|
||||
self.terminate_file_sync(tx_seq, false).await;
|
||||
@ -137,7 +142,7 @@ impl Batcher {
|
||||
|
||||
// connecting peers timeout
|
||||
Some(SyncState::ConnectingPeers { origin, .. })
|
||||
if origin.elapsed() > self.config.find_peer_timeout =>
|
||||
if origin.elapsed() > self.find_peer_timeout =>
|
||||
{
|
||||
debug!(%tx_seq, "Terminate file sync due to connecting peers timeout");
|
||||
self.terminate_file_sync(tx_seq, false).await;
|
||||
|
@ -22,6 +22,7 @@ pub struct RandomBatcherState {
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RandomBatcher {
|
||||
config: Config,
|
||||
batcher: Batcher,
|
||||
sync_store: Arc<SyncStore>,
|
||||
}
|
||||
@ -34,7 +35,13 @@ impl RandomBatcher {
|
||||
sync_store: Arc<SyncStore>,
|
||||
) -> Self {
|
||||
Self {
|
||||
batcher: Batcher::new(config, config.max_random_workers, store, sync_send),
|
||||
config,
|
||||
batcher: Batcher::new(
|
||||
config.max_random_workers,
|
||||
config.random_find_peer_timeout,
|
||||
store,
|
||||
sync_send,
|
||||
),
|
||||
sync_store,
|
||||
}
|
||||
}
|
||||
@ -56,7 +63,7 @@ impl RandomBatcher {
|
||||
// disable file sync until catched up
|
||||
if !catched_up.load(Ordering::Relaxed) {
|
||||
trace!("Cannot sync file in catch-up phase");
|
||||
sleep(self.batcher.config.auto_sync_idle_interval).await;
|
||||
sleep(self.config.auto_sync_idle_interval).await;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -73,11 +80,11 @@ impl RandomBatcher {
|
||||
"File sync still in progress or idle, state = {:?}",
|
||||
self.get_state().await
|
||||
);
|
||||
sleep(self.batcher.config.auto_sync_idle_interval).await;
|
||||
sleep(self.config.auto_sync_idle_interval).await;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(%err, "Failed to sync file once, state = {:?}", self.get_state().await);
|
||||
sleep(self.batcher.config.auto_sync_error_interval).await;
|
||||
sleep(self.config.auto_sync_error_interval).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -96,7 +103,7 @@ impl RandomBatcher {
|
||||
|
||||
debug!(%tx_seq, ?sync_result, "Completed to sync file, state = {:?}", self.get_state().await);
|
||||
match sync_result {
|
||||
SyncResult::Completed => metrics::RANDOM_SYNC_RESULT_COMPLETED.inc(1),
|
||||
SyncResult::Completed => metrics::RANDOM_SYNC_RESULT_COMPLETED.mark(1),
|
||||
SyncResult::Failed => metrics::RANDOM_SYNC_RESULT_FAILED.inc(1),
|
||||
SyncResult::Timeout => metrics::RANDOM_SYNC_RESULT_TIMEOUT.inc(1),
|
||||
}
|
||||
|
@ -23,6 +23,7 @@ use tokio::{
|
||||
/// Supports to sync files in sequence concurrently.
|
||||
#[derive(Clone)]
|
||||
pub struct SerialBatcher {
|
||||
config: Config,
|
||||
batcher: Batcher,
|
||||
|
||||
/// Next tx seq to sync.
|
||||
@ -80,13 +81,17 @@ impl SerialBatcher {
|
||||
sync_send: SyncSender,
|
||||
sync_store: Arc<SyncStore>,
|
||||
) -> Result<Self> {
|
||||
let capacity = config.max_sequential_workers;
|
||||
|
||||
// continue file sync from break point in db
|
||||
let (next_tx_seq, max_tx_seq) = sync_store.get_tx_seq_range().await?;
|
||||
|
||||
Ok(Self {
|
||||
batcher: Batcher::new(config, capacity, store, sync_send),
|
||||
config,
|
||||
batcher: Batcher::new(
|
||||
config.max_sequential_workers,
|
||||
config.sequential_find_peer_timeout,
|
||||
store,
|
||||
sync_send,
|
||||
),
|
||||
next_tx_seq: Arc::new(AtomicU64::new(next_tx_seq.unwrap_or(0))),
|
||||
max_tx_seq: Arc::new(AtomicU64::new(max_tx_seq.unwrap_or(u64::MAX))),
|
||||
pending_completed_txs: Default::default(),
|
||||
@ -136,7 +141,7 @@ impl SerialBatcher {
|
||||
// disable file sync until catched up
|
||||
if !catched_up.load(Ordering::Relaxed) {
|
||||
trace!("Cannot sync file in catch-up phase");
|
||||
sleep(self.batcher.config.auto_sync_idle_interval).await;
|
||||
sleep(self.config.auto_sync_idle_interval).await;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -157,11 +162,11 @@ impl SerialBatcher {
|
||||
"File sync still in progress or idle, state = {:?}",
|
||||
self.get_state().await
|
||||
);
|
||||
sleep(self.batcher.config.auto_sync_idle_interval).await;
|
||||
sleep(self.config.auto_sync_idle_interval).await;
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(%err, "Failed to sync file once, state = {:?}", self.get_state().await);
|
||||
sleep(self.batcher.config.auto_sync_error_interval).await;
|
||||
sleep(self.config.auto_sync_error_interval).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -257,7 +262,7 @@ impl SerialBatcher {
|
||||
|
||||
info!(%tx_seq, ?sync_result, "Completed to sync file, state = {:?}", self.get_state().await);
|
||||
match sync_result {
|
||||
SyncResult::Completed => metrics::SEQUENTIAL_SYNC_RESULT_COMPLETED.inc(1),
|
||||
SyncResult::Completed => metrics::SEQUENTIAL_SYNC_RESULT_COMPLETED.mark(1),
|
||||
SyncResult::Failed => metrics::SEQUENTIAL_SYNC_RESULT_FAILED.inc(1),
|
||||
SyncResult::Timeout => metrics::SEQUENTIAL_SYNC_RESULT_TIMEOUT.inc(1),
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use metrics::{Counter, CounterUsize, Gauge, GaugeUsize, Histogram, Sample};
|
||||
use metrics::{register_meter, Counter, CounterUsize, Gauge, GaugeUsize, Histogram, Meter, Sample};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
// sequential auto sync
|
||||
@ -9,8 +9,7 @@ lazy_static::lazy_static! {
|
||||
pub static ref SEQUENTIAL_STATE_TXS_PENDING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_sequential_state_txs_pending", 1024);
|
||||
pub static ref SEQUENTIAL_STATE_GAP_NEXT_DB: Arc<dyn Gauge<usize>> = GaugeUsize::register("sync_auto_sequential_state_gap_next_db");
|
||||
|
||||
pub static ref SEQUENTIAL_SYNC_RESULT_TOTAL: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_total");
|
||||
pub static ref SEQUENTIAL_SYNC_RESULT_COMPLETED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_completed");
|
||||
pub static ref SEQUENTIAL_SYNC_RESULT_COMPLETED: Arc<dyn Meter> = register_meter("sync_auto_sequential_sync_result_completed");
|
||||
pub static ref SEQUENTIAL_SYNC_RESULT_FAILED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_failed");
|
||||
pub static ref SEQUENTIAL_SYNC_RESULT_TIMEOUT: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_timeout");
|
||||
|
||||
@ -19,8 +18,7 @@ lazy_static::lazy_static! {
|
||||
pub static ref RANDOM_STATE_TXS_READY: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_ready", 1024);
|
||||
pub static ref RANDOM_STATE_TXS_PENDING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_pending", 1024);
|
||||
|
||||
pub static ref RANDOM_SYNC_RESULT_TOTAL: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_total");
|
||||
pub static ref RANDOM_SYNC_RESULT_COMPLETED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_completed");
|
||||
pub static ref RANDOM_SYNC_RESULT_COMPLETED: Arc<dyn Meter> = register_meter("sync_auto_random_sync_result_completed");
|
||||
pub static ref RANDOM_SYNC_RESULT_FAILED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_failed");
|
||||
pub static ref RANDOM_SYNC_RESULT_TIMEOUT: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_timeout");
|
||||
}
|
||||
|
11
node/sync/src/controllers/metrics.rs
Normal file
11
node/sync/src/controllers/metrics.rs
Normal file
@ -0,0 +1,11 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use metrics::{register_timer, Counter, CounterUsize, Histogram, Sample, Timer};
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
pub static ref SERIAL_SYNC_FILE_COMPLETED: Arc<dyn Timer> = register_timer("sync_controllers_serial_sync_file_completed");
|
||||
pub static ref SERIAL_SYNC_CHUNKS_COMPLETED: Arc<dyn Timer> = register_timer("sync_controllers_serial_sync_chunks_completed");
|
||||
pub static ref SERIAL_SYNC_SEGMENT_LATENCY: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_controllers_serial_sync_segment_latency", 1024);
|
||||
pub static ref SERIAL_SYNC_SEGMENT_TIMEOUT: Arc<dyn Counter<usize>> = CounterUsize::register("sync_controllers_serial_sync_segment_timeout");
|
||||
pub static ref SERIAL_SYNC_UNEXPECTED_ERRORS: Arc<dyn Counter<usize>> = CounterUsize::register("sync_controllers_serial_sync_unexpected_errors");
|
||||
}
|
@ -1,3 +1,4 @@
|
||||
mod metrics;
|
||||
mod peers;
|
||||
mod serial;
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
use crate::context::SyncNetworkContext;
|
||||
use crate::controllers::peers::{PeerState, SyncPeers};
|
||||
use crate::controllers::{FileSyncGoal, FileSyncInfo};
|
||||
use crate::controllers::{metrics, FileSyncGoal, FileSyncInfo};
|
||||
use crate::{Config, InstantWrapper};
|
||||
use file_location_cache::FileLocationCache;
|
||||
use libp2p::swarm::DialError;
|
||||
@ -311,15 +311,15 @@ impl SerialSyncController {
|
||||
.peers
|
||||
.add_new_peer_with_config(peer_id, addr.clone(), shard_config)
|
||||
{
|
||||
info!(%self.tx_seq, %peer_id, %addr, "Found new peer");
|
||||
debug!(%self.tx_seq, %peer_id, %addr, "Found new peer");
|
||||
true
|
||||
} else {
|
||||
// e.g. multiple `AnnounceFile` messages propagated
|
||||
debug!(%self.tx_seq, %peer_id, %addr, "Found an existing peer");
|
||||
trace!(%self.tx_seq, %peer_id, %addr, "Found an existing peer");
|
||||
false
|
||||
}
|
||||
} else {
|
||||
debug!(%self.tx_seq, %peer_id, %addr, "Found peer without shard config");
|
||||
info!(%self.tx_seq, %peer_id, %addr, "Found peer without shard config");
|
||||
false
|
||||
}
|
||||
}
|
||||
@ -406,7 +406,6 @@ impl SerialSyncController {
|
||||
}
|
||||
|
||||
pub async fn on_response(&mut self, from_peer_id: PeerId, response: ChunkArrayWithProof) {
|
||||
debug!(%self.tx_seq, %from_peer_id, "Received RPC response");
|
||||
if self.handle_on_response_mismatch(from_peer_id) {
|
||||
return;
|
||||
}
|
||||
@ -429,6 +428,7 @@ impl SerialSyncController {
|
||||
let data_len = response.chunks.data.len();
|
||||
if data_len == 0 || data_len % CHUNK_SIZE > 0 {
|
||||
warn!(%from_peer_id, %self.tx_seq, %data_len, "Invalid chunk response data length");
|
||||
metrics::SERIAL_SYNC_UNEXPECTED_ERRORS.inc(1);
|
||||
self.ban_peer(from_peer_id, "Invalid chunk response data length");
|
||||
self.state = SyncState::Idle;
|
||||
return;
|
||||
@ -466,6 +466,7 @@ impl SerialSyncController {
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(%err, %self.tx_seq, "Failed to validate chunks response");
|
||||
metrics::SERIAL_SYNC_UNEXPECTED_ERRORS.inc(1);
|
||||
self.ban_peer(from_peer_id, "Chunk array validation failed");
|
||||
self.state = SyncState::Idle;
|
||||
return;
|
||||
@ -474,6 +475,8 @@ impl SerialSyncController {
|
||||
|
||||
self.failures = 0;
|
||||
|
||||
metrics::SERIAL_SYNC_SEGMENT_LATENCY.update_since(since.0);
|
||||
|
||||
let shard_config = self.store.get_store().flow().get_shard_config();
|
||||
let next_chunk = shard_config.next_segment_index(
|
||||
(from_chunk / PORA_CHUNK_SIZE as u64) as usize,
|
||||
@ -488,6 +491,7 @@ impl SerialSyncController {
|
||||
Ok(true) => self.next_chunk = next_chunk as u64,
|
||||
Ok(false) => {
|
||||
warn!(%self.tx_seq, ?self.tx_id, "Transaction reverted while storing chunks");
|
||||
metrics::SERIAL_SYNC_UNEXPECTED_ERRORS.inc(1);
|
||||
self.state = SyncState::Failed {
|
||||
reason: FailureReason::TxReverted(self.tx_id),
|
||||
};
|
||||
@ -495,6 +499,7 @@ impl SerialSyncController {
|
||||
}
|
||||
Err(err) => {
|
||||
error!(%err, %self.tx_seq, "Unexpected DB error while storing chunks");
|
||||
metrics::SERIAL_SYNC_UNEXPECTED_ERRORS.inc(1);
|
||||
self.state = SyncState::Failed {
|
||||
reason: FailureReason::DBError(err.to_string()),
|
||||
};
|
||||
@ -511,6 +516,7 @@ impl SerialSyncController {
|
||||
// completed to download chunks
|
||||
if !self.goal.is_all_chunks() {
|
||||
self.state = SyncState::Completed;
|
||||
metrics::SERIAL_SYNC_CHUNKS_COMPLETED.update_since(self.since.0);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -523,15 +529,18 @@ impl SerialSyncController {
|
||||
Ok(true) => {
|
||||
info!(%self.tx_seq, "Succeeded to finalize file");
|
||||
self.state = SyncState::Completed;
|
||||
metrics::SERIAL_SYNC_FILE_COMPLETED.update_since(self.since.0);
|
||||
}
|
||||
Ok(false) => {
|
||||
warn!(?self.tx_id, %self.tx_seq, "Transaction reverted during finalize_tx");
|
||||
metrics::SERIAL_SYNC_UNEXPECTED_ERRORS.inc(1);
|
||||
self.state = SyncState::Failed {
|
||||
reason: FailureReason::TxReverted(self.tx_id),
|
||||
};
|
||||
}
|
||||
Err(err) => {
|
||||
error!(%err, %self.tx_seq, "Unexpected error during finalize_tx");
|
||||
metrics::SERIAL_SYNC_UNEXPECTED_ERRORS.inc(1);
|
||||
self.state = SyncState::Failed {
|
||||
reason: FailureReason::DBError(err.to_string()),
|
||||
};
|
||||
@ -675,6 +684,7 @@ impl SerialSyncController {
|
||||
debug!(%self.tx_seq, "No peer to continue downloading and try to find other peers to download");
|
||||
self.state = SyncState::Idle;
|
||||
} else if since.elapsed() >= self.config.peer_chunks_download_timeout {
|
||||
metrics::SERIAL_SYNC_SEGMENT_TIMEOUT.inc(1);
|
||||
self.handle_response_failure(peer_id, "RPC timeout");
|
||||
} else {
|
||||
completed = true;
|
||||
|
@ -52,7 +52,9 @@ pub struct Config {
|
||||
pub max_sequential_workers: usize,
|
||||
pub max_random_workers: usize,
|
||||
#[serde(deserialize_with = "deserialize_duration")]
|
||||
pub find_peer_timeout: Duration,
|
||||
pub sequential_find_peer_timeout: Duration,
|
||||
#[serde(deserialize_with = "deserialize_duration")]
|
||||
pub random_find_peer_timeout: Duration,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
@ -61,26 +63,27 @@ impl Default for Config {
|
||||
// sync service config
|
||||
heartbeat_interval: Duration::from_secs(5),
|
||||
auto_sync_enabled: false,
|
||||
max_sync_files: 16,
|
||||
max_sync_files: 32,
|
||||
sync_file_by_rpc_enabled: true,
|
||||
sync_file_on_announcement_enabled: false,
|
||||
|
||||
// serial sync config
|
||||
max_chunks_to_request: 2 * 1024,
|
||||
max_request_failures: 5,
|
||||
peer_connect_timeout: Duration::from_secs(5),
|
||||
peer_disconnect_timeout: Duration::from_secs(5),
|
||||
peer_find_timeout: Duration::from_secs(5),
|
||||
peer_chunks_download_timeout: Duration::from_secs(5),
|
||||
peer_connect_timeout: Duration::from_secs(15),
|
||||
peer_disconnect_timeout: Duration::from_secs(15),
|
||||
peer_find_timeout: Duration::from_secs(30),
|
||||
peer_chunks_download_timeout: Duration::from_secs(15),
|
||||
peer_wait_outgoing_connection_timeout: Duration::from_secs(10),
|
||||
peer_next_chunks_request_wait_timeout: Duration::from_secs(3),
|
||||
|
||||
// auto sync config
|
||||
auto_sync_idle_interval: Duration::from_secs(3),
|
||||
auto_sync_error_interval: Duration::from_secs(10),
|
||||
max_sequential_workers: 8,
|
||||
max_random_workers: 4,
|
||||
find_peer_timeout: Duration::from_secs(10),
|
||||
max_sequential_workers: 24,
|
||||
max_random_workers: 8,
|
||||
sequential_find_peer_timeout: Duration::from_secs(60),
|
||||
random_find_peer_timeout: Duration::from_secs(500),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -157,7 +157,7 @@ impl SyncService {
|
||||
event_recv: broadcast::Receiver<LogSyncEvent>,
|
||||
catch_up_end_recv: oneshot::Receiver<()>,
|
||||
) -> Result<SyncSender> {
|
||||
let (sync_send, sync_recv) = channel::Channel::unbounded();
|
||||
let (sync_send, sync_recv) = channel::Channel::unbounded("sync");
|
||||
let store = Store::new(store, executor.clone());
|
||||
|
||||
// init auto sync
|
||||
@ -912,7 +912,7 @@ mod tests {
|
||||
create_file_location_cache(init_peer_id, vec![txs[0].id()]);
|
||||
|
||||
let (network_send, mut network_recv) = mpsc::unbounded_channel::<NetworkMessage>();
|
||||
let (_, sync_recv) = channel::Channel::unbounded();
|
||||
let (_, sync_recv) = channel::Channel::unbounded("test");
|
||||
|
||||
let mut sync = SyncService {
|
||||
config: Config::default(),
|
||||
@ -941,7 +941,7 @@ mod tests {
|
||||
create_file_location_cache(init_peer_id, vec![txs[0].id()]);
|
||||
|
||||
let (network_send, mut network_recv) = mpsc::unbounded_channel::<NetworkMessage>();
|
||||
let (_, sync_recv) = channel::Channel::unbounded();
|
||||
let (_, sync_recv) = channel::Channel::unbounded("test");
|
||||
|
||||
let mut sync = SyncService {
|
||||
config: Config::default(),
|
||||
|
@ -228,11 +228,7 @@ reward_contract_address = "0x0496D0817BD8519e0de4894Dc379D35c35275609"
|
||||
auto_sync_enabled = true
|
||||
|
||||
# Maximum number of files in sync from other peers simultaneously.
|
||||
max_sync_files = 32
|
||||
|
||||
# Timeout to terminate a file sync when automatically sync from other peers.
|
||||
# If timeout, terminated file sync will be triggered later.
|
||||
# find_peer_timeout = "10s"
|
||||
# max_sync_files = 32
|
||||
|
||||
# Enable to start a file sync via RPC (e.g. `admin_startSyncFile`).
|
||||
# sync_file_by_rpc_enabled = true
|
||||
@ -241,10 +237,10 @@ max_sync_files = 32
|
||||
# sync_file_on_announcement_enabled = false
|
||||
|
||||
# Maximum threads to sync files in sequence.
|
||||
max_sequential_workers = 24
|
||||
# max_sequential_workers = 24
|
||||
|
||||
# Maximum threads to sync files randomly.
|
||||
# max_random_workers = 4
|
||||
# max_random_workers = 8
|
||||
|
||||
#######################################################################
|
||||
### File Location Cache Options ###
|
||||
@ -265,4 +261,4 @@ max_sequential_workers = 24
|
||||
|
||||
# Validity period of location information.
|
||||
# If the timestamp in the storage location information exceeds this duration from the current time, it will be removed from the cache.
|
||||
# entry_expiration_time_secs = 3600
|
||||
# entry_expiration_time_secs = 3600
|
||||
|
@ -228,11 +228,7 @@ reward_contract_address = "0x51998C4d486F406a788B766d93510980ae1f9360"
|
||||
auto_sync_enabled = true
|
||||
|
||||
# Maximum number of files in sync from other peers simultaneously.
|
||||
max_sync_files = 32
|
||||
|
||||
# Timeout to terminate a file sync when automatically sync from other peers.
|
||||
# If timeout, terminated file sync will be triggered later.
|
||||
# find_peer_timeout = "10s"
|
||||
# max_sync_files = 32
|
||||
|
||||
# Enable to start a file sync via RPC (e.g. `admin_startSyncFile`).
|
||||
# sync_file_by_rpc_enabled = true
|
||||
@ -241,10 +237,10 @@ max_sync_files = 32
|
||||
# sync_file_on_announcement_enabled = false
|
||||
|
||||
# Maximum threads to sync files in sequence.
|
||||
max_sequential_workers = 24
|
||||
# max_sequential_workers = 24
|
||||
|
||||
# Maximum threads to sync files randomly.
|
||||
# max_random_workers = 4
|
||||
# max_random_workers = 8
|
||||
|
||||
#######################################################################
|
||||
### File Location Cache Options ###
|
||||
@ -265,4 +261,4 @@ max_sequential_workers = 24
|
||||
|
||||
# Validity period of location information.
|
||||
# If the timestamp in the storage location information exceeds this duration from the current time, it will be removed from the cache.
|
||||
# entry_expiration_time_secs = 3600
|
||||
# entry_expiration_time_secs = 3600
|
||||
|
@ -227,11 +227,7 @@
|
||||
# auto_sync_enabled = false
|
||||
|
||||
# Maximum number of files in sync from other peers simultaneously.
|
||||
# max_sync_files = 16
|
||||
|
||||
# Timeout to terminate a file sync when automatically sync from other peers.
|
||||
# If timeout, terminated file sync will be triggered later.
|
||||
# find_peer_timeout = "10s"
|
||||
# max_sync_files = 32
|
||||
|
||||
# Enable to start a file sync via RPC (e.g. `admin_startSyncFile`).
|
||||
# sync_file_by_rpc_enabled = true
|
||||
@ -240,10 +236,10 @@
|
||||
# sync_file_on_announcement_enabled = false
|
||||
|
||||
# Maximum threads to sync files in sequence.
|
||||
# max_sequential_workers = 8
|
||||
# max_sequential_workers = 24
|
||||
|
||||
# Maximum threads to sync files randomly.
|
||||
# max_random_workers = 4
|
||||
# max_random_workers = 8
|
||||
|
||||
#######################################################################
|
||||
### File Location Cache Options ###
|
||||
@ -264,4 +260,4 @@
|
||||
|
||||
# Validity period of location information.
|
||||
# If the timestamp in the storage location information exceeds this duration from the current time, it will be removed from the cache.
|
||||
# entry_expiration_time_secs = 3600
|
||||
# entry_expiration_time_secs = 3600
|
||||
|
Loading…
Reference in New Issue
Block a user