Add metrics for router and auto sync (#161)

* Add metrics configurations

* Add metrics in router package

* Add catch up info in sync service state

* Add metrics for auto sync

* Update Cargo.lock
Bo QIU 2024-08-19 09:54:52 +08:00 committed by GitHub
parent 22ed8f5f91
commit f9120b1e4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 530 additions and 29 deletions

Cargo.lock generated
View File

@ -181,6 +181,12 @@ dependencies = [
"tracing",
]
[[package]]
name = "arc-swap"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
[[package]]
name = "arrayref"
version = "0.3.7"
@ -1431,6 +1437,17 @@ dependencies = [
"powerfmt",
]
[[package]]
name = "derivative"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "derive_builder"
version = "0.9.0"
@ -1467,6 +1484,12 @@ dependencies = [
"syn 2.0.68",
]
[[package]]
name = "destructure_traitobject"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c877555693c14d2f84191cfd3ad8582790fc52b5e2274b40b59cf5f5cea25c7"
[[package]]
name = "digest"
version = "0.9.0"
@ -1636,7 +1659,7 @@ dependencies = [
"rust_decimal",
"serde",
"thiserror",
"time",
"time 0.3.36",
]
[[package]]
@ -2406,6 +2429,21 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
@ -3012,6 +3050,12 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "0.14.29"
@ -3080,6 +3124,19 @@ dependencies = [
"tracing",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "iana-time-zone"
version = "0.1.60"
@ -3263,6 +3320,19 @@ dependencies = [
"hashbrown 0.14.5",
]
[[package]]
name = "influx_db_client"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2ef03268010ccf98c178eed83aa7377b7f6531f8ec8d43a256902c24cadac60"
dependencies = [
"bytes",
"futures",
"reqwest",
"serde",
"serde_json",
]
[[package]]
name = "inout"
version = "0.1.3"
@ -4415,9 +4485,45 @@ version = "0.4.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
dependencies = [
"serde",
"value-bag",
]
[[package]]
name = "log-mdc"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a94d21414c1f4a51209ad204c1776a3d0765002c76c6abcb602a6f09f1e881c7"
[[package]]
name = "log4rs"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0816135ae15bd0391cf284eab37e6e3ee0a6ee63d2ceeb659862bd8d0a984ca6"
dependencies = [
"anyhow",
"arc-swap",
"chrono",
"derivative",
"flate2",
"fnv",
"humantime",
"libc",
"log",
"log-mdc",
"once_cell",
"parking_lot 0.12.3",
"rand 0.8.5",
"serde",
"serde-value",
"serde_json",
"serde_yaml",
"thiserror",
"thread-id",
"typemap-ors",
"winapi",
]
[[package]]
name = "log_entry_sync"
version = "0.1.0"
@ -4511,6 +4617,25 @@ dependencies = [
"tiny-keccak",
]
[[package]]
name = "metrics"
version = "0.1.0"
source = "git+https://github.com/Conflux-Chain/conflux-rust.git?rev=3ee498ce659e11fd0030bd4a264b7442705ade2b#3ee498ce659e11fd0030bd4a264b7442705ade2b"
dependencies = [
"chrono",
"futures",
"influx_db_client",
"lazy_static",
"log",
"log4rs",
"parking_lot 0.11.2",
"rand 0.7.3",
"serde",
"time 0.1.45",
"timer",
"tokio",
]
[[package]]
name = "mime"
version = "0.3.17"
@ -4692,6 +4817,23 @@ dependencies = [
"unsigned-varint",
]
[[package]]
name = "native-tls"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466"
dependencies = [
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "netlink-packet-core"
version = "0.4.2"
@ -4989,18 +5131,65 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "openssl"
version = "0.10.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1"
dependencies = [
"bitflags 2.6.0",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.68",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "option-ext"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "ordered-float"
version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
dependencies = [
"num-traits",
]
[[package]]
name = "ordered-multimap"
version = "0.4.3"
@ -6141,10 +6330,12 @@ dependencies = [
"http-body",
"hyper",
"hyper-rustls 0.24.2",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite 0.2.14",
@ -6156,6 +6347,7 @@ dependencies = [
"sync_wrapper",
"system-configuration",
"tokio",
"tokio-native-tls",
"tokio-rustls 0.24.1",
"tower-service",
"url",
@ -6291,6 +6483,7 @@ dependencies = [
"file_location_cache",
"futures",
"lazy_static",
"metrics",
"miner",
"network",
"pruner",
@ -6670,6 +6863,16 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde-value"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c"
dependencies = [
"ordered-float",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.203"
@ -6713,6 +6916,19 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_yaml"
version = "0.9.34+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47"
dependencies = [
"indexmap 2.2.6",
"itoa",
"ryu",
"serde",
"unsafe-libyaml",
]
[[package]]
name = "sha-1"
version = "0.9.8"
@ -6844,7 +7060,7 @@ dependencies = [
"num-bigint",
"num-traits",
"thiserror",
"time",
"time 0.3.36",
]
[[package]]
@ -7164,9 +7380,11 @@ dependencies = [
"duration-str",
"eth2_ssz",
"file_location_cache",
"lazy_static",
"libp2p",
"log_entry_sync",
"merkle_light",
"metrics",
"network",
"rand 0.8.5",
"serde",
@ -7309,6 +7527,16 @@ dependencies = [
"syn 2.0.68",
]
[[package]]
name = "thread-id"
version = "4.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe8f25bbdd100db7e1d34acf7fd2dc59c4bf8f7483f505eaa7d4f12f76cc0ea"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "thread_local"
version = "1.1.8"
@ -7329,6 +7557,17 @@ dependencies = [
"libc",
]
[[package]]
name = "time"
version = "0.1.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
dependencies = [
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"winapi",
]
[[package]]
name = "time"
version = "0.3.36"
@ -7360,6 +7599,15 @@ dependencies = [
"time-core",
]
[[package]]
name = "timer"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31d42176308937165701f50638db1c31586f183f1aab416268216577aec7306b"
dependencies = [
"chrono",
]
[[package]]
name = "tiny-keccak"
version = "2.0.2"
@ -7434,6 +7682,16 @@ dependencies = [
"syn 2.0.68",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.23.4"
@ -7592,7 +7850,7 @@ checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf"
dependencies = [
"crossbeam-channel",
"thiserror",
"time",
"time 0.3.36",
"tracing-subscriber",
]
@ -7705,7 +7963,7 @@ dependencies = [
"radix_trie",
"rand 0.8.5",
"thiserror",
"time",
"time 0.3.36",
"tokio",
"trust-dns-proto 0.20.4",
]
@ -7806,6 +8064,15 @@ dependencies = [
"utf-8",
]
[[package]]
name = "typemap-ors"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a68c24b707f02dd18f1e4ccceb9d49f2058c2fb86384ef9972592904d7a28867"
dependencies = [
"unsafe-any-ors",
]
[[package]]
name = "typenum"
version = "1.17.0"
@ -7898,6 +8165,21 @@ dependencies = [
"subtle",
]
[[package]]
name = "unsafe-any-ors"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0a303d30665362d9680d7d91d78b23f5f899504d4f08b3c4cf08d055d87c0ad"
dependencies = [
"destructure_traitobject",
]
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
[[package]]
name = "unsigned-varint"
version = "0.7.1"
@ -8015,6 +8297,12 @@ version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
@ -8559,6 +8847,7 @@ dependencies = [
"itertools 0.10.5",
"libp2p",
"log_entry_sync",
"metrics",
"miner",
"network",
"pruner",
@ -8619,7 +8908,7 @@ dependencies = [
"hmac 0.12.1",
"pbkdf2 0.11.0",
"sha1",
"time",
"time 0.3.36",
"zstd",
]

View File

@ -27,6 +27,9 @@ members = [
]
resolver = "2"
[workspace.dependencies]
metrics = { git = "https://github.com/Conflux-Chain/conflux-rust.git", rev = "3ee498ce659e11fd0030bd4a264b7442705ade2b" }
[patch.crates-io]
discv5 = { path = "version-meld/discv5" }
eth2_ssz = { path = "version-meld/eth2_ssz" }

View File

@ -38,6 +38,7 @@ duration-str = "0.5.1"
config = "0.13.1"
public-ip = "0.2"
ethers = "2.0.14"
metrics = { workspace = true }
[dependencies.libp2p]
version = "0.45.1"

View File

@ -24,6 +24,7 @@ rand = "0.8.5"
serde = { version = "1.0.137", features = ["derive"] }
duration-str = "0.5.1"
public-ip = "0.2"
metrics = { workspace = true }
[dev-dependencies]
channel = { path = "../../common/channel" }

View File

@ -2,6 +2,7 @@
extern crate tracing;
mod libp2p_event_handler;
mod metrics;
mod peer_manager;
mod service;

View File

@ -22,6 +22,7 @@ use sync::{SyncMessage, SyncSender};
use tokio::sync::mpsc::UnboundedSender;
use tokio::sync::{mpsc, RwLock};
use crate::metrics;
use crate::peer_manager::PeerManager;
use crate::Config;
@ -32,12 +33,18 @@ lazy_static::lazy_static! {
pub static ref TOLERABLE_DRIFT: chrono::Duration = chrono::Duration::seconds(10);
}
#[allow(deprecated)]
fn duration_since(timestamp: u32) -> chrono::Duration {
fn duration_since(timestamp: u32, metric: Arc<dyn ::metrics::Histogram>) -> chrono::Duration {
let timestamp = i64::from(timestamp);
let timestamp = chrono::NaiveDateTime::from_timestamp_opt(timestamp, 0).expect("should fit");
let now = chrono::Utc::now().naive_utc();
now.signed_duration_since(timestamp)
let timestamp = chrono::DateTime::from_timestamp(timestamp, 0).expect("should fit");
let now = chrono::Utc::now();
let duration = now.signed_duration_since(timestamp);
let num_secs = duration.num_seconds();
if num_secs > 0 {
metric.update(num_secs as u64);
}
duration
}
fn peer_id_to_public_key(peer_id: &PeerId) -> Result<PublicKey, String> {
@ -141,7 +148,7 @@ impl Libp2pEventHandler {
pub fn send_status(&self, peer_id: PeerId) {
let status_message = StatusMessage {
data: self.network_globals.network_id(),
}; // dummy status message
};
debug!(%peer_id, ?status_message, "Sending Status request");
self.send_to_network(NetworkMessage::SendRequest {
@ -149,6 +156,8 @@ impl Libp2pEventHandler {
request_id: RequestId::Router,
request: Request::Status(status_message),
});
metrics::LIBP2P_SEND_STATUS.mark(1);
}
pub async fn on_peer_connected(&self, peer_id: PeerId, outgoing: bool) {
@ -157,12 +166,16 @@ impl Libp2pEventHandler {
if outgoing {
self.send_status(peer_id);
self.send_to_sync(SyncMessage::PeerConnected { peer_id });
metrics::LIBP2P_HANDLE_PEER_CONNECTED_OUTGOING.mark(1);
} else {
metrics::LIBP2P_HANDLE_PEER_CONNECTED_INCOMING.mark(1);
}
}
pub async fn on_peer_disconnected(&self, peer_id: PeerId) {
self.peers.write().await.remove(&peer_id);
self.send_to_sync(SyncMessage::PeerDisconnected { peer_id });
metrics::LIBP2P_HANDLE_PEER_DISCONNECTED.mark(1);
}
pub async fn on_rpc_request(
@ -176,6 +189,7 @@ impl Libp2pEventHandler {
match request {
Request::Status(status) => {
self.on_status_request(peer_id, request_id, status);
metrics::LIBP2P_HANDLE_REQUEST_STATUS.mark(1);
}
Request::GetChunks(request) => {
self.send_to_sync(SyncMessage::RequestChunks {
@ -183,6 +197,7 @@ impl Libp2pEventHandler {
request_id,
request,
});
metrics::LIBP2P_HANDLE_REQUEST_GET_CHUNKS.mark(1);
}
Request::DataByHash(_) => {
// ignore
@ -196,7 +211,7 @@ impl Libp2pEventHandler {
let network_id = self.network_globals.network_id();
let status_message = StatusMessage {
data: network_id.clone(),
}; // dummy status message
};
debug!(%peer_id, ?status_message, "Sending Status response");
self.send_to_network(NetworkMessage::SendResponse {
@ -224,6 +239,7 @@ impl Libp2pEventHandler {
Response::Status(status_message) => {
debug!(%peer_id, ?status_message, "Received Status response");
self.on_status_response(peer_id, status_message);
metrics::LIBP2P_HANDLE_RESPONSE_STATUS.mark(1);
}
Response::Chunks(response) => {
let request_id = match request_id {
@ -236,6 +252,8 @@ impl Libp2pEventHandler {
request_id,
response,
});
metrics::LIBP2P_HANDLE_RESPONSE_GET_CHUNKS.mark(1);
}
Response::DataByHash(_) => {
// ignore
@ -253,6 +271,8 @@ impl Libp2pEventHandler {
request_id,
});
}
metrics::LIBP2P_HANDLE_RESPONSE_ERROR.mark(1);
}
pub async fn on_pubsub_message(
@ -266,11 +286,24 @@ impl Libp2pEventHandler {
match message {
PubsubMessage::ExampleMessage(_) => MessageAcceptance::Ignore,
PubsubMessage::FindFile(msg) => self.on_find_file(msg).await,
PubsubMessage::FindChunks(msg) => self.on_find_chunks(msg).await,
PubsubMessage::AnnounceFile(msg) => self.on_announce_file(propagation_source, msg),
PubsubMessage::AnnounceChunks(msg) => self.on_announce_chunks(propagation_source, msg),
PubsubMessage::FindFile(msg) => {
metrics::LIBP2P_HANDLE_PUBSUB_FIND_FILE.mark(1);
self.on_find_file(msg).await
}
PubsubMessage::FindChunks(msg) => {
metrics::LIBP2P_HANDLE_PUBSUB_FIND_CHUNKS.mark(1);
self.on_find_chunks(msg).await
}
PubsubMessage::AnnounceFile(msg) => {
metrics::LIBP2P_HANDLE_PUBSUB_ANNOUNCE_FILE.mark(1);
self.on_announce_file(propagation_source, msg)
}
PubsubMessage::AnnounceChunks(msg) => {
metrics::LIBP2P_HANDLE_PUBSUB_ANNOUNCE_CHUNKS.mark(1);
self.on_announce_chunks(propagation_source, msg)
}
PubsubMessage::AnnounceShardConfig(msg) => {
metrics::LIBP2P_HANDLE_PUBSUB_ANNOUNCE_SHARD.mark(1);
self.on_announce_shard_config(propagation_source, msg)
}
}
@ -401,7 +434,10 @@ impl Libp2pEventHandler {
let FindFile { tx_id, timestamp } = msg;
// verify timestamp
let d = duration_since(timestamp);
let d = duration_since(
timestamp,
metrics::LIBP2P_HANDLE_PUBSUB_LATENCY_FIND_FILE.clone(),
);
if d < TOLERABLE_DRIFT.neg() || d > *FIND_FILE_TIMEOUT {
debug!(%timestamp, ?d, "Invalid timestamp, ignoring FindFile message");
return MessageAcceptance::Ignore;
@ -479,7 +515,10 @@ impl Libp2pEventHandler {
}
// verify timestamp
let d = duration_since(msg.timestamp);
let d = duration_since(
msg.timestamp,
metrics::LIBP2P_HANDLE_PUBSUB_LATENCY_FIND_CHUNKS.clone(),
);
if d < TOLERABLE_DRIFT.neg() || d > *FIND_FILE_TIMEOUT {
debug!(%msg.timestamp, ?d, "Invalid timestamp, ignoring FindFile message");
return MessageAcceptance::Ignore;
@ -547,11 +586,14 @@ impl Libp2pEventHandler {
None => return false,
};
metrics::LIBP2P_VERIFY_ANNOUNCED_IP.mark(1);
let seen_ips: Vec<IpAddr> = match self.network_globals.peers.read().peer_info(peer_id) {
Some(v) => v.seen_ip_addresses().collect(),
None => {
// ignore file announcement from un-seen peers
trace!(%announced_ip, "Failed to verify announced IP address, no peer info found");
metrics::LIBP2P_VERIFY_ANNOUNCED_IP_UNSEEN.mark(1);
return false;
}
};
@ -561,6 +603,7 @@ impl Libp2pEventHandler {
} else {
// ignore file announcement if announced IP and seen IP mismatch
trace!(%announced_ip, ?seen_ips, "Failed to verify announced IP address, mismatch with seen ips");
metrics::LIBP2P_VERIFY_ANNOUNCED_IP_MISMATCH.mark(1);
false
}
}
@ -587,7 +630,10 @@ impl Libp2pEventHandler {
}
// propagate gossip to peers
let d = duration_since(msg.resend_timestamp);
let d = duration_since(
msg.resend_timestamp,
metrics::LIBP2P_HANDLE_PUBSUB_LATENCY_ANNOUNCE_FILE.clone(),
);
if d < TOLERABLE_DRIFT.neg() || d > *ANNOUNCE_FILE_TIMEOUT {
debug!(%msg.resend_timestamp, ?d, "Invalid resend timestamp, ignoring AnnounceFile message");
return MessageAcceptance::Ignore;
@ -628,7 +674,10 @@ impl Libp2pEventHandler {
}
// propagate gossip to peers
let d = duration_since(msg.resend_timestamp);
let d = duration_since(
msg.resend_timestamp,
metrics::LIBP2P_HANDLE_PUBSUB_LATENCY_ANNOUNCE_SHARD.clone(),
);
if d < TOLERABLE_DRIFT.neg() || d > *ANNOUNCE_SHARD_CONFIG_TIMEOUT {
debug!(%msg.resend_timestamp, ?d, "Invalid resend timestamp, ignoring AnnounceShardConfig message");
return MessageAcceptance::Ignore;
@ -674,7 +723,10 @@ impl Libp2pEventHandler {
}
// propagate gossip to peers
let d = duration_since(msg.resend_timestamp);
let d = duration_since(
msg.resend_timestamp,
metrics::LIBP2P_HANDLE_PUBSUB_LATENCY_ANNOUNCE_CHUNKS.clone(),
);
if d < TOLERABLE_DRIFT.neg() || d > *ANNOUNCE_FILE_TIMEOUT {
debug!(%msg.resend_timestamp, ?d, "Invalid resend timestamp, ignoring AnnounceChunks message");
return MessageAcceptance::Ignore;
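
A minimal, self-contained sketch of the latency-measuring pattern introduced above, assuming only the `Histogram::update` method from the conflux-rust metrics crate; the function names below are illustrative, not part of this change:

```rust
use std::sync::Arc;

use metrics::Histogram;

// Mirrors the updated `duration_since`: compute how old a gossip message is
// and record positive ages (in seconds) into the given latency histogram.
fn message_age(timestamp: u32, latency: Arc<dyn Histogram>) -> chrono::Duration {
    let ts = chrono::DateTime::from_timestamp(timestamp as i64, 0).expect("should fit");
    let age = chrono::Utc::now().signed_duration_since(ts);
    if age.num_seconds() > 0 {
        latency.update(age.num_seconds() as u64);
    }
    age
}

// A message is then accepted only if its age lies within [-drift, timeout],
// as the pubsub handlers above check against TOLERABLE_DRIFT.
fn timestamp_acceptable(age: chrono::Duration, timeout: chrono::Duration) -> bool {
    age >= -chrono::Duration::seconds(10) && age <= timeout
}
```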

View File

@ -0,0 +1,51 @@
use std::sync::Arc;
use metrics::{register_meter, Histogram, Meter, Sample};
lazy_static::lazy_static! {
// service
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE: Arc<dyn Meter> = register_meter("router_service_route_network_message");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_SEND_REQUEST: Arc<dyn Meter> = register_meter("router_service_route_network_message_send_request");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_SEND_RESPONSE: Arc<dyn Meter> = register_meter("router_service_route_network_message_send_response");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_SEND_ERROR_RESPONSE: Arc<dyn Meter> = register_meter("router_service_route_network_message_send_error_response");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_PUBLISH: Arc<dyn Meter> = register_meter("router_service_route_network_message_publish");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_REPORT_PEER: Arc<dyn Meter> = register_meter("router_service_route_network_message_report_peer");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_GOODBYE_PEER: Arc<dyn Meter> = register_meter("router_service_route_network_message_goodbye_peer");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER: Arc<dyn Meter> = register_meter("router_service_route_network_message_dail_peer");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER_ALREADY: Arc<dyn Meter> = register_meter("router_service_route_network_message_dail_peer_already");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER_NEW_OK: Arc<dyn Meter> = register_meter("router_service_route_network_message_dail_peer_new_ok");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER_NEW_FAIL: Arc<dyn Meter> = register_meter("router_service_route_network_message_dail_peer_new_fail");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_ANNOUNCE_LOCAL_FILE: Arc<dyn Meter> = register_meter("router_service_route_network_message_announce_local_file");
pub static ref SERVICE_ROUTE_NETWORK_MESSAGE_UPNP: Arc<dyn Meter> = register_meter("router_service_route_network_message_upnp");
pub static ref SERVICE_EXPIRED_PEERS: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_service_expired_peers", 1024);
pub static ref SERVICE_EXPIRED_PEERS_DISCONNECT_OK: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_service_expired_peers_disconnect_ok", 1024);
pub static ref SERVICE_EXPIRED_PEERS_DISCONNECT_FAIL: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_service_expired_peers_disconnect_fail", 1024);
// libp2p_event_handler
pub static ref LIBP2P_SEND_STATUS: Arc<dyn Meter> = register_meter("router_libp2p_send_status");
pub static ref LIBP2P_HANDLE_PEER_CONNECTED_OUTGOING: Arc<dyn Meter> = register_meter("router_libp2p_handle_peer_connected_outgoing");
pub static ref LIBP2P_HANDLE_PEER_CONNECTED_INCOMING: Arc<dyn Meter> = register_meter("router_libp2p_handle_peer_connected_incoming");
pub static ref LIBP2P_HANDLE_PEER_DISCONNECTED: Arc<dyn Meter> = register_meter("router_libp2p_handle_peer_disconnected");
pub static ref LIBP2P_HANDLE_REQUEST_STATUS: Arc<dyn Meter> = register_meter("router_libp2p_handle_request_status");
pub static ref LIBP2P_HANDLE_REQUEST_GET_CHUNKS: Arc<dyn Meter> = register_meter("router_libp2p_handle_request_get_chunks");
pub static ref LIBP2P_HANDLE_RESPONSE_STATUS: Arc<dyn Meter> = register_meter("router_libp2p_handle_response_status");
pub static ref LIBP2P_HANDLE_RESPONSE_GET_CHUNKS: Arc<dyn Meter> = register_meter("router_libp2p_handle_response_get_chunks");
pub static ref LIBP2P_HANDLE_RESPONSE_ERROR: Arc<dyn Meter> = register_meter("router_libp2p_handle_response_error");
pub static ref LIBP2P_HANDLE_PUBSUB_FIND_FILE: Arc<dyn Meter> = register_meter("router_libp2p_handle_pubsub_find_file");
pub static ref LIBP2P_HANDLE_PUBSUB_FIND_CHUNKS: Arc<dyn Meter> = register_meter("router_libp2p_handle_pubsub_find_chunks");
pub static ref LIBP2P_HANDLE_PUBSUB_ANNOUNCE_FILE: Arc<dyn Meter> = register_meter("router_libp2p_handle_pubsub_announce_file");
pub static ref LIBP2P_HANDLE_PUBSUB_ANNOUNCE_CHUNKS: Arc<dyn Meter> = register_meter("router_libp2p_handle_pubsub_announce_chunks");
pub static ref LIBP2P_HANDLE_PUBSUB_ANNOUNCE_SHARD: Arc<dyn Meter> = register_meter("router_libp2p_handle_pubsub_announce_shard");
pub static ref LIBP2P_HANDLE_PUBSUB_LATENCY_FIND_FILE: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_libp2p_handle_pubsub_latency_find_file", 1024);
pub static ref LIBP2P_HANDLE_PUBSUB_LATENCY_FIND_CHUNKS: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_libp2p_handle_pubsub_latency_find_chunks", 1024);
pub static ref LIBP2P_HANDLE_PUBSUB_LATENCY_ANNOUNCE_FILE: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_libp2p_handle_pubsub_latency_announce_file", 1024);
pub static ref LIBP2P_HANDLE_PUBSUB_LATENCY_ANNOUNCE_CHUNKS: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_libp2p_handle_pubsub_latency_announce_chunks", 1024);
pub static ref LIBP2P_HANDLE_PUBSUB_LATENCY_ANNOUNCE_SHARD: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("router_libp2p_handle_pubsub_latency_announce_shard", 1024);
pub static ref LIBP2P_VERIFY_ANNOUNCED_IP: Arc<dyn Meter> = register_meter("router_libp2p_verify_announced_ip");
pub static ref LIBP2P_VERIFY_ANNOUNCED_IP_UNSEEN: Arc<dyn Meter> = register_meter("router_libp2p_verify_announced_ip_unseen");
pub static ref LIBP2P_VERIFY_ANNOUNCED_IP_MISMATCH: Arc<dyn Meter> = register_meter("router_libp2p_verify_announced_ip_mismatch");
}
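
For context, a minimal usage sketch of the conflux-rust `metrics` crate API exercised by the meters and histograms above; the names below are illustrative and not part of this commit:

```rust
use std::sync::Arc;

use metrics::{register_meter, Histogram, Meter, Sample};

lazy_static::lazy_static! {
    // Same registration pattern as the router metrics above.
    static ref EXAMPLE_EVENTS: Arc<dyn Meter> = register_meter("example_events");
    static ref EXAMPLE_LATENCY: Arc<dyn Histogram> =
        Sample::ExpDecay(0.015).register("example_latency_secs", 1024);
}

fn on_example_event(latency_secs: u64) {
    EXAMPLE_EVENTS.mark(1);               // count one occurrence
    EXAMPLE_LATENCY.update(latency_secs); // sample into the exp-decay histogram
}
```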

View File

@ -1,3 +1,4 @@
use crate::metrics;
use crate::Config;
use crate::{libp2p_event_handler::Libp2pEventHandler, peer_manager::PeerManager};
use chunk_pool::ChunkPoolMessage;
@ -224,6 +225,8 @@ impl RouterService {
) {
trace!(?msg, "Received new message");
metrics::SERVICE_ROUTE_NETWORK_MESSAGE.mark(1);
match msg {
NetworkMessage::SendRequest {
peer_id,
@ -231,6 +234,7 @@ impl RouterService {
request_id,
} => {
self.libp2p.send_request(peer_id, request_id, request);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_SEND_REQUEST.mark(1);
}
NetworkMessage::SendResponse {
peer_id,
@ -238,6 +242,7 @@ impl RouterService {
id,
} => {
self.libp2p.send_response(peer_id, id, response);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_SEND_RESPONSE.mark(1);
}
NetworkMessage::SendErrorResponse {
peer_id,
@ -246,6 +251,7 @@ impl RouterService {
reason,
} => {
self.libp2p.respond_with_error(peer_id, id, error, reason);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_SEND_ERROR_RESPONSE.mark(1);
}
NetworkMessage::Publish { messages } => {
if self.libp2p.swarm.connected_peers().next().is_none() {
@ -275,29 +281,44 @@ impl RouterService {
"Sending pubsub messages",
);
self.libp2p.swarm.behaviour_mut().publish(messages);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_PUBLISH.mark(1);
}
NetworkMessage::ReportPeer {
peer_id,
action,
source,
msg,
} => self.libp2p.report_peer(&peer_id, action, source, msg),
} => {
self.libp2p.report_peer(&peer_id, action, source, msg);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_REPORT_PEER.mark(1);
}
NetworkMessage::GoodbyePeer {
peer_id,
reason,
source,
} => self.libp2p.goodbye_peer(&peer_id, reason, source),
} => {
self.libp2p.goodbye_peer(&peer_id, reason, source);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_GOODBYE_PEER.mark(1);
}
NetworkMessage::DialPeer { address, peer_id } => {
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER.mark(1);
if self.libp2p.swarm.is_connected(&peer_id) {
self.libp2p_event_handler
.send_to_sync(SyncMessage::PeerConnected { peer_id });
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER_ALREADY.mark(1);
} else {
match Swarm::dial(&mut self.libp2p.swarm, address.clone()) {
Ok(()) => debug!(%address, "Dialing libp2p peer"),
Ok(()) => {
debug!(%address, "Dialing libp2p peer");
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER_NEW_OK.mark(1);
}
Err(err) => {
info!(%address, error = ?err, "Failed to dial peer");
self.libp2p_event_handler
.send_to_sync(SyncMessage::DailFailed { peer_id, err });
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_DAIL_PEER_NEW_FAIL.mark(1);
}
};
}
@ -309,12 +330,14 @@ impl RouterService {
.await
{
self.libp2p_event_handler.publish(msg);
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_ANNOUNCE_LOCAL_FILE.mark(1);
}
}
NetworkMessage::UPnPMappingEstablished {
tcp_socket,
udp_socket,
} => {
metrics::SERVICE_ROUTE_NETWORK_MESSAGE_UPNP.mark(1);
self.upnp_mappings = (tcp_socket.map(|s| s.port()), udp_socket.map(|s| s.port()));
// If there is an external TCP port update, modify our local ENR.
if let Some(tcp_socket) = tcp_socket {
@ -362,16 +385,27 @@ impl RouterService {
async fn on_heartbeat(&mut self) {
let expired_peers = self.peers.write().await.expired_peers();
metrics::SERVICE_EXPIRED_PEERS.update(expired_peers.len() as u64);
trace!("heartbeat, expired peers = {:?}", expired_peers.len());
let mut num_succeeded = 0;
let mut num_failed = 0;
for peer_id in expired_peers {
// async operation, once peer disconnected, swarm event `PeerDisconnected`
// will be polled to handle in advance.
match self.libp2p.swarm.disconnect_peer_id(peer_id) {
Ok(_) => debug!(%peer_id, "Peer expired and disconnect it"),
Err(_) => error!(%peer_id, "Peer expired but failed to disconnect"),
Ok(_) => {
debug!(%peer_id, "Peer expired and disconnect it");
num_succeeded += 1;
}
Err(_) => {
debug!(%peer_id, "Peer expired but failed to disconnect");
num_failed += 1;
}
}
}
metrics::SERVICE_EXPIRED_PEERS_DISCONNECT_OK.update(num_succeeded);
metrics::SERVICE_EXPIRED_PEERS_DISCONNECT_FAIL.update(num_failed);
}
}

View File

@ -96,6 +96,9 @@ pub struct ZgsConfig {
// file location cache config, configured by [file_location_cache] section by `config` crate.
pub file_location_cache: file_location_cache::Config,
// metrics config, configured by [metrics] section by `config` crate.
pub metrics: metrics::MetricsConfiguration,
}
impl Deref for ZgsConfig {

View File

@ -60,6 +60,7 @@ fn main() -> Result<(), Box<dyn Error>> {
// CLI, config, and logs
let matches = cli::cli_app().get_matches();
let config = ZgsConfig::parse(&matches)?;
metrics::initialize(config.metrics.clone());
log::configure(
&config.log_config_file,
&config.log_directory,

View File

@ -20,6 +20,8 @@ tracing = "0.1.35"
eth2_ssz = "0.4.0"
serde = { version = "1.0.137", features = ["derive"] }
duration-str = "0.5.1"
lazy_static = "1.4.0"
metrics = { workspace = true }
[dev-dependencies]
merkle_light = { path = "../../common/merkle_light" }

View File

@ -1,5 +1,8 @@
use super::{batcher::Batcher, sync_store::SyncStore};
use crate::{auto_sync::batcher::SyncResult, Config, SyncSender};
use crate::{
auto_sync::{batcher::SyncResult, metrics},
Config, SyncSender,
};
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::sync::{
@ -57,6 +60,12 @@ impl RandomBatcher {
continue;
}
if let Ok(state) = self.get_state().await {
metrics::RANDOM_STATE_TXS_SYNCING.update(state.tasks.len() as u64);
metrics::RANDOM_STATE_TXS_READY.update(state.ready_txs as u64);
metrics::RANDOM_STATE_TXS_PENDING.update(state.pending_txs as u64);
}
match self.sync_once().await {
Ok(true) => {}
Ok(false) => {
@ -86,6 +95,11 @@ impl RandomBatcher {
};
debug!(%tx_seq, ?sync_result, "Completed to sync file, state = {:?}", self.get_state().await);
match sync_result {
SyncResult::Completed => metrics::RANDOM_SYNC_RESULT_COMPLETED.inc(1),
SyncResult::Failed => metrics::RANDOM_SYNC_RESULT_FAILED.inc(1),
SyncResult::Timeout => metrics::RANDOM_SYNC_RESULT_TIMEOUT.inc(1),
}
match sync_result {
SyncResult::Completed => self.sync_store.remove_tx(tx_seq).await?,

View File

@ -2,7 +2,7 @@ use super::{
batcher::{Batcher, SyncResult},
sync_store::SyncStore,
};
use crate::{Config, SyncSender};
use crate::{auto_sync::metrics, Config, SyncSender};
use anyhow::Result;
use log_entry_sync::LogSyncEvent;
use serde::{Deserialize, Serialize};
@ -140,6 +140,15 @@ impl SerialBatcher {
continue;
}
// update metrics
let state = self.get_state().await;
metrics::SEQUENTIAL_STATE_TXS_SYNCING.update(state.tasks.len() as u64);
if state.max != u64::MAX {
metrics::SEQUENTIAL_STATE_GAP_NEXT_MAX.update((state.max - state.next) as usize);
}
metrics::SEQUENTIAL_STATE_TXS_PENDING.update(state.pendings.len() as u64);
metrics::SEQUENTIAL_STATE_GAP_NEXT_DB.update((state.next - state.next_in_db) as usize);
// sync files
match self.sync_once().await {
Ok(true) => {}
@ -247,6 +256,12 @@ impl SerialBatcher {
};
info!(%tx_seq, ?sync_result, "Completed to sync file, state = {:?}", self.get_state().await);
match sync_result {
SyncResult::Completed => metrics::SEQUENTIAL_SYNC_RESULT_COMPLETED.inc(1),
SyncResult::Failed => metrics::SEQUENTIAL_SYNC_RESULT_FAILED.inc(1),
SyncResult::Timeout => metrics::SEQUENTIAL_SYNC_RESULT_TIMEOUT.inc(1),
}
self.pending_completed_txs
.write()
.await

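The two gauges record gaps between sequence-number cursors tracked by the serial batcher; a small illustrative sketch of those relationships (the struct below is only for illustration, field names mirror the state used above):

```rust
// Illustrative only: the quantities fed into the two gauges above.
struct SerialCursors {
    next: u64,       // next tx sequence to sync
    max: u64,        // highest announced tx sequence (u64::MAX until known)
    next_in_db: u64, // next tx sequence according to the persisted store
}

fn gaps(c: &SerialCursors) -> (Option<u64>, u64) {
    // Backlog to the highest announced tx, only meaningful once `max` is known.
    let gap_next_max = (c.max != u64::MAX).then(|| c.max - c.next);
    // How far the in-memory cursor has advanced past the persisted one.
    let gap_next_db = c.next - c.next_in_db;
    (gap_next_max, gap_next_db)
}
```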
View File

@ -21,6 +21,7 @@ pub struct AutoSyncManager {
pub serial: SerialBatcher,
pub random: RandomBatcher,
pub file_announcement_send: UnboundedSender<u64>,
pub catched_up: Arc<AtomicBool>,
}
impl AutoSyncManager {
@ -52,11 +53,12 @@ impl AutoSyncManager {
executor.spawn(random.clone().start(catched_up.clone()), "auto_sync_random");
// handle on catched up notification
let catched_up_cloned = catched_up.clone();
executor.spawn(
async move {
if catch_up_end_recv.await.is_ok() {
info!("log entry catched up");
catched_up.store(true, Ordering::Relaxed);
catched_up_cloned.store(true, Ordering::Relaxed);
}
},
"auto_sync_wait_for_catchup",
@ -66,6 +68,7 @@ impl AutoSyncManager {
serial,
random,
file_announcement_send: send,
catched_up,
})
}
}
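
A minimal sketch of the catch-up flag wiring above, assuming a Tokio runtime and a oneshot receiver for the catch-up notification (the actual code uses the node's task executor):

```rust
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

use tokio::sync::oneshot;

// The waiter task flips the shared flag once log-entry sync reports catch-up;
// readers (e.g. the sync service state RPC) only need a relaxed load.
fn spawn_catchup_waiter(catched_up: Arc<AtomicBool>, catch_up_end_recv: oneshot::Receiver<()>) {
    tokio::spawn(async move {
        if catch_up_end_recv.await.is_ok() {
            catched_up.store(true, Ordering::Relaxed);
        }
    });
}
```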

View File

@ -0,0 +1,26 @@
use std::sync::Arc;
use metrics::{Counter, CounterUsize, Gauge, GaugeUsize, Histogram, Sample};
lazy_static::lazy_static! {
// sequential auto sync
pub static ref SEQUENTIAL_STATE_TXS_SYNCING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_sequential_state_txs_syncing", 1024);
pub static ref SEQUENTIAL_STATE_GAP_NEXT_MAX: Arc<dyn Gauge<usize>> = GaugeUsize::register("sync_auto_sequential_state_gap_next_max");
pub static ref SEQUENTIAL_STATE_TXS_PENDING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_sequential_state_txs_pending", 1024);
pub static ref SEQUENTIAL_STATE_GAP_NEXT_DB: Arc<dyn Gauge<usize>> = GaugeUsize::register("sync_auto_sequential_state_gap_next_db");
pub static ref SEQUENTIAL_SYNC_RESULT_TOTAL: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_total");
pub static ref SEQUENTIAL_SYNC_RESULT_COMPLETED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_completed");
pub static ref SEQUENTIAL_SYNC_RESULT_FAILED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_failed");
pub static ref SEQUENTIAL_SYNC_RESULT_TIMEOUT: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_timeout");
// random auto sync
pub static ref RANDOM_STATE_TXS_SYNCING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_syncing", 1024);
pub static ref RANDOM_STATE_TXS_READY: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_ready", 1024);
pub static ref RANDOM_STATE_TXS_PENDING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_pending", 1024);
pub static ref RANDOM_SYNC_RESULT_TOTAL: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_total");
pub static ref RANDOM_SYNC_RESULT_COMPLETED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_completed");
pub static ref RANDOM_SYNC_RESULT_FAILED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_failed");
pub static ref RANDOM_SYNC_RESULT_TIMEOUT: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_timeout");
}
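
As with the router metrics, a short usage sketch of the counter and gauge types registered here (illustrative names, not part of this commit):

```rust
use std::sync::Arc;

use metrics::{Counter, CounterUsize, Gauge, GaugeUsize};

lazy_static::lazy_static! {
    // Same registration pattern as the auto-sync counters and gauges above.
    static ref EXAMPLE_COMPLETED: Arc<dyn Counter<usize>> = CounterUsize::register("example_completed");
    static ref EXAMPLE_GAP: Arc<dyn Gauge<usize>> = GaugeUsize::register("example_gap");
}

fn on_sync_result(gap: usize) {
    EXAMPLE_COMPLETED.inc(1); // monotonically increasing counter
    EXAMPLE_GAP.update(gap);  // gauge keeps only the latest observed value
}
```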

View File

@ -2,5 +2,6 @@ mod batcher;
pub mod batcher_random;
pub mod batcher_serial;
pub mod manager;
mod metrics;
pub mod sync_store;
mod tx_store;

View File

@ -110,6 +110,7 @@ impl InstantWrapper {
#[serde(rename_all = "camelCase")]
pub struct SyncServiceState {
pub num_syncing: usize,
pub catched_up: Option<bool>,
pub auto_sync_serial: Option<SerialBatcherState>,
pub auto_sync_random: Option<RandomBatcherState>,
}
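
Since `SyncServiceState` is serialized with `rename_all = "camelCase"`, the new field surfaces to RPC clients as `catchedUp`; a self-contained sketch of the resulting JSON shape, trimmed to the relevant fields:

```rust
use serde::Serialize;

#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct SyncServiceStateLite {
    num_syncing: usize,
    catched_up: Option<bool>,
}

fn main() {
    let state = SyncServiceStateLite { num_syncing: 3, catched_up: Some(true) };
    // Prints: {"numSyncing":3,"catchedUp":true}
    println!("{}", serde_json::to_string(&state).unwrap());
}
```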

View File

@ -15,6 +15,7 @@ use network::{
PeerRequestId, SyncId as RequestId,
};
use shared_types::{bytes_to_chunks, timestamp_now, ChunkArrayWithProof, TxID};
use std::sync::atomic::Ordering;
use std::{
collections::{hash_map::Entry, HashMap},
sync::Arc,
@ -275,11 +276,13 @@ impl SyncService {
let state = match &self.auto_sync_manager {
Some(manager) => SyncServiceState {
num_syncing: self.controllers.len(),
catched_up: Some(manager.catched_up.load(Ordering::Relaxed)),
auto_sync_serial: Some(manager.serial.get_state().await),
auto_sync_random: manager.random.get_state().await.ok(),
},
None => SyncServiceState {
num_syncing: self.controllers.len(),
catched_up: None,
auto_sync_serial: None,
auto_sync_random: None,
},