mirror of
https://github.com/0glabs/0g-storage-node.git
synced 2025-01-23 13:36:08 +00:00
Store shard config for peers and choose sync peers accordingly. (#77)
* Implement Pruner. * Put pruner in a crate. * Fix clippy. * Add rpc zgs_getShardConfig. * Fix. * Increase wait time. * Add pruner_test and use max_num_chunks instead of size_limit. * Store shard config for peers and choose sync peers accordingly. * Add test and fix sync. * Fix clippy and test. * Fix some ut. * Add AnnounceShardConfig gossip and fix tests. * Add sharded tx finalize check in LogManager. * Try, * Rename. * Longer timeout for mine_test. * Save test logs.
This commit is contained in:
parent
ef82f64393
commit
c2c6e2d5fb
9
.github/workflows/tests.yml
vendored
9
.github/workflows/tests.yml
vendored
@ -54,4 +54,11 @@ jobs:
|
||||
run: |
|
||||
cd tests
|
||||
uname -a
|
||||
python test_all.py
|
||||
python test_all.py
|
||||
|
||||
- name: Save logs for failures
|
||||
if: failure()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: test_logs
|
||||
path: /tmp/zgs_test_*
|
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -2290,6 +2290,7 @@ dependencies = [
|
||||
"priority-queue",
|
||||
"rand 0.8.5",
|
||||
"shared_types",
|
||||
"storage",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
@ -6168,6 +6169,7 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"miner",
|
||||
"network",
|
||||
"pruner",
|
||||
"rand 0.8.5",
|
||||
"shared_types",
|
||||
"storage",
|
||||
|
@ -6,6 +6,7 @@ edition = "2021"
|
||||
[dependencies]
|
||||
hashlink = "0.8.0"
|
||||
network = { path = "../network" }
|
||||
storage = { path = "../storage" }
|
||||
parking_lot = "0.12.1"
|
||||
rand = "0.8.5"
|
||||
tracing = "0.1.35"
|
||||
|
@ -7,6 +7,7 @@ use rand::seq::IteratorRandom;
|
||||
use shared_types::{timestamp_now, TxID};
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::HashMap;
|
||||
use storage::config::ShardConfig;
|
||||
|
||||
/// Caches limited announcements of specified file from different peers.
|
||||
struct AnnouncementCache {
|
||||
@ -231,21 +232,45 @@ impl FileCache {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct PeerShardConfigCache {
|
||||
peers: HashMap<PeerId, ShardConfig>,
|
||||
}
|
||||
|
||||
impl PeerShardConfigCache {
|
||||
pub fn insert(&mut self, peer: PeerId, config: ShardConfig) -> Option<ShardConfig> {
|
||||
self.peers.insert(peer, config)
|
||||
}
|
||||
|
||||
pub fn get(&self, peer: &PeerId) -> Option<ShardConfig> {
|
||||
self.peers.get(peer).cloned()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FileLocationCache {
|
||||
cache: Mutex<FileCache>,
|
||||
peer_cache: Mutex<PeerShardConfigCache>,
|
||||
}
|
||||
|
||||
impl Default for FileLocationCache {
|
||||
fn default() -> Self {
|
||||
FileLocationCache {
|
||||
cache: Mutex::new(FileCache::new(Default::default())),
|
||||
peer_cache: Mutex::new(Default::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileLocationCache {
|
||||
pub fn insert(&self, announcement: SignedAnnounceFile) {
|
||||
let peer_id = *announcement.peer_id;
|
||||
// FIXME: Check validity.
|
||||
let shard_config = ShardConfig {
|
||||
shard_id: announcement.shard_id,
|
||||
num_shard: announcement.num_shard,
|
||||
};
|
||||
self.cache.lock().insert(announcement);
|
||||
self.insert_peer_config(peer_id, shard_config);
|
||||
}
|
||||
|
||||
pub fn get_one(&self, tx_id: TxID) -> Option<SignedAnnounceFile> {
|
||||
@ -255,6 +280,19 @@ impl FileLocationCache {
|
||||
pub fn get_all(&self, tx_id: TxID) -> Vec<SignedAnnounceFile> {
|
||||
self.cache.lock().all(tx_id).unwrap_or_default()
|
||||
}
|
||||
|
||||
/// TODO: Trigger chunk_pool/sync to reconstruct if it changes?
|
||||
pub fn insert_peer_config(
|
||||
&self,
|
||||
peer: PeerId,
|
||||
shard_config: ShardConfig,
|
||||
) -> Option<ShardConfig> {
|
||||
self.peer_cache.lock().insert(peer, shard_config)
|
||||
}
|
||||
|
||||
pub fn get_peer_config(&self, peer: &PeerId) -> Option<ShardConfig> {
|
||||
self.peer_cache.lock().get(peer)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -36,6 +36,8 @@ impl AnnounceFileBuilder {
|
||||
|
||||
let msg = AnnounceFile {
|
||||
tx_id,
|
||||
num_shard: 1,
|
||||
shard_id: 0,
|
||||
peer_id: peer_id.into(),
|
||||
at: at.into(),
|
||||
timestamp,
|
||||
|
@ -28,6 +28,8 @@ pub struct GossipCache {
|
||||
announce_file: Option<Duration>,
|
||||
/// Timeout for AnnounceChunks.
|
||||
announce_chunks: Option<Duration>,
|
||||
/// Timeout for AnnounceShardConfig.
|
||||
announce_shard_config: Option<Duration>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
@ -43,6 +45,8 @@ pub struct GossipCacheBuilder {
|
||||
announce_file: Option<Duration>,
|
||||
/// Timeout for AnnounceChunks messages.
|
||||
announce_chunks: Option<Duration>,
|
||||
/// Timeout for AnnounceShardConfig messages.
|
||||
announce_shard_config: Option<Duration>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
@ -84,6 +88,12 @@ impl GossipCacheBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Timeout for AnnounceShardConfig messages.
|
||||
pub fn announce_shard_config_timeout(mut self, timeout: Duration) -> Self {
|
||||
self.announce_shard_config = Some(timeout);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> GossipCache {
|
||||
let GossipCacheBuilder {
|
||||
default_timeout,
|
||||
@ -92,6 +102,7 @@ impl GossipCacheBuilder {
|
||||
find_chunks,
|
||||
announce_file,
|
||||
announce_chunks,
|
||||
announce_shard_config,
|
||||
} = self;
|
||||
|
||||
GossipCache {
|
||||
@ -102,6 +113,7 @@ impl GossipCacheBuilder {
|
||||
find_chunks: find_chunks.or(default_timeout),
|
||||
announce_file: announce_file.or(default_timeout),
|
||||
announce_chunks: announce_chunks.or(default_timeout),
|
||||
announce_shard_config: announce_shard_config.or(default_timeout),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -121,6 +133,7 @@ impl GossipCache {
|
||||
GossipKind::FindChunks => self.find_chunks,
|
||||
GossipKind::AnnounceFile => self.announce_file,
|
||||
GossipKind::AnnounceChunks => self.announce_chunks,
|
||||
GossipKind::AnnounceShardConfig => self.announce_shard_config,
|
||||
};
|
||||
|
||||
let expire_timeout = match expire_timeout {
|
||||
|
@ -7,7 +7,8 @@ pub type Enr = discv5::enr::Enr<discv5::enr::CombinedKey>;
|
||||
|
||||
pub use globals::NetworkGlobals;
|
||||
pub use pubsub::{
|
||||
AnnounceChunks, AnnounceFile, FindChunks, FindFile, HasSignature, PubsubMessage,
|
||||
SignedAnnounceChunks, SignedAnnounceFile, SignedMessage, SnappyTransform,
|
||||
AnnounceChunks, AnnounceFile, AnnounceShardConfig, FindChunks, FindFile, HasSignature,
|
||||
PubsubMessage, SignedAnnounceChunks, SignedAnnounceFile, SignedAnnounceShardConfig,
|
||||
SignedMessage, SnappyTransform,
|
||||
};
|
||||
pub use topics::{GossipEncoding, GossipKind, GossipTopic, CORE_TOPICS};
|
||||
|
@ -131,6 +131,8 @@ pub struct FindChunks {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash, Encode, Decode)]
|
||||
pub struct AnnounceFile {
|
||||
pub tx_id: TxID,
|
||||
pub num_shard: usize,
|
||||
pub shard_id: usize,
|
||||
pub peer_id: WrappedPeerId,
|
||||
pub at: WrappedMultiaddr,
|
||||
pub timestamp: u32,
|
||||
@ -195,6 +197,7 @@ impl<T: Encode + Decode> HasSignature for SignedMessage<T> {
|
||||
}
|
||||
|
||||
pub type SignedAnnounceFile = SignedMessage<AnnounceFile>;
|
||||
pub type SignedAnnounceShardConfig = SignedMessage<AnnounceShardConfig>;
|
||||
pub type SignedAnnounceChunks = SignedMessage<AnnounceChunks>;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@ -203,6 +206,7 @@ pub enum PubsubMessage {
|
||||
FindFile(FindFile),
|
||||
FindChunks(FindChunks),
|
||||
AnnounceFile(SignedAnnounceFile),
|
||||
AnnounceShardConfig(SignedAnnounceShardConfig),
|
||||
AnnounceChunks(SignedAnnounceChunks),
|
||||
}
|
||||
|
||||
@ -281,6 +285,7 @@ impl PubsubMessage {
|
||||
PubsubMessage::FindChunks(_) => GossipKind::FindChunks,
|
||||
PubsubMessage::AnnounceFile(_) => GossipKind::AnnounceFile,
|
||||
PubsubMessage::AnnounceChunks(_) => GossipKind::AnnounceChunks,
|
||||
PubsubMessage::AnnounceShardConfig(_) => GossipKind::AnnounceShardConfig,
|
||||
}
|
||||
}
|
||||
|
||||
@ -315,6 +320,10 @@ impl PubsubMessage {
|
||||
SignedAnnounceChunks::from_ssz_bytes(data)
|
||||
.map_err(|e| format!("{:?}", e))?,
|
||||
)),
|
||||
GossipKind::AnnounceShardConfig => Ok(PubsubMessage::AnnounceShardConfig(
|
||||
SignedAnnounceShardConfig::from_ssz_bytes(data)
|
||||
.map_err(|e| format!("{:?}", e))?,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -333,6 +342,7 @@ impl PubsubMessage {
|
||||
PubsubMessage::FindChunks(data) => data.as_ssz_bytes(),
|
||||
PubsubMessage::AnnounceFile(data) => data.as_ssz_bytes(),
|
||||
PubsubMessage::AnnounceChunks(data) => data.as_ssz_bytes(),
|
||||
PubsubMessage::AnnounceShardConfig(data) => data.as_ssz_bytes(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -355,6 +365,9 @@ impl std::fmt::Display for PubsubMessage {
|
||||
PubsubMessage::AnnounceChunks(msg) => {
|
||||
write!(f, "AnnounceChunks message: {:?}", msg)
|
||||
}
|
||||
PubsubMessage::AnnounceShardConfig(msg) => {
|
||||
write!(f, "AnnounceShardConfig message: {:?}", msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,7 @@ pub const FIND_FILE_TOPIC: &str = "find_file";
|
||||
pub const FIND_CHUNKS_TOPIC: &str = "find_chunks";
|
||||
pub const ANNOUNCE_FILE_TOPIC: &str = "announce_file";
|
||||
pub const ANNOUNCE_CHUNKS_TOPIC: &str = "announce_chunks";
|
||||
pub const ANNOUNCE_SHARD_CONFIG_TOPIC: &str = "announce_shard_config";
|
||||
|
||||
pub const CORE_TOPICS: [GossipKind; 4] = [
|
||||
GossipKind::FindFile,
|
||||
@ -39,6 +40,7 @@ pub enum GossipKind {
|
||||
FindFile,
|
||||
FindChunks,
|
||||
AnnounceFile,
|
||||
AnnounceShardConfig,
|
||||
AnnounceChunks,
|
||||
}
|
||||
|
||||
@ -79,6 +81,7 @@ impl GossipTopic {
|
||||
FIND_CHUNKS_TOPIC => GossipKind::FindChunks,
|
||||
ANNOUNCE_FILE_TOPIC => GossipKind::AnnounceFile,
|
||||
ANNOUNCE_CHUNKS_TOPIC => GossipKind::AnnounceChunks,
|
||||
ANNOUNCE_SHARD_CONFIG_TOPIC => GossipKind::AnnounceShardConfig,
|
||||
_ => return Err(format!("Unknown topic: {}", topic)),
|
||||
};
|
||||
|
||||
@ -107,6 +110,7 @@ impl From<GossipTopic> for String {
|
||||
GossipKind::FindChunks => FIND_CHUNKS_TOPIC,
|
||||
GossipKind::AnnounceFile => ANNOUNCE_FILE_TOPIC,
|
||||
GossipKind::AnnounceChunks => ANNOUNCE_CHUNKS_TOPIC,
|
||||
GossipKind::AnnounceShardConfig => ANNOUNCE_SHARD_CONFIG_TOPIC,
|
||||
};
|
||||
|
||||
format!("/{}/{}/{}", TOPIC_PREFIX, kind, encoding)
|
||||
@ -125,6 +129,7 @@ impl std::fmt::Display for GossipTopic {
|
||||
GossipKind::FindChunks => FIND_CHUNKS_TOPIC,
|
||||
GossipKind::AnnounceFile => ANNOUNCE_FILE_TOPIC,
|
||||
GossipKind::AnnounceChunks => ANNOUNCE_CHUNKS_TOPIC,
|
||||
GossipKind::AnnounceShardConfig => ANNOUNCE_SHARD_CONFIG_TOPIC,
|
||||
};
|
||||
|
||||
write!(f, "/{}/{}/{}", TOPIC_PREFIX, kind, encoding)
|
||||
|
@ -8,7 +8,7 @@ use storage::config::{ShardConfig, SHARD_CONFIG_KEY};
|
||||
use storage::log_store::config::ConfigurableExt;
|
||||
use storage::log_store::Store;
|
||||
use task_executor::TaskExecutor;
|
||||
use tokio::sync::{broadcast, RwLock};
|
||||
use tokio::sync::{broadcast, mpsc, RwLock};
|
||||
use tracing::debug;
|
||||
|
||||
// Start pruning when the db directory size exceeds 0.9 * limit.
|
||||
@ -34,6 +34,7 @@ pub struct Pruner {
|
||||
config: PrunerConfig,
|
||||
store: Arc<RwLock<dyn Store>>,
|
||||
|
||||
sender: mpsc::UnboundedSender<PrunerMessage>,
|
||||
miner_sender: Option<broadcast::Sender<MinerMessage>>,
|
||||
}
|
||||
|
||||
@ -43,13 +44,15 @@ impl Pruner {
|
||||
mut config: PrunerConfig,
|
||||
store: Arc<RwLock<dyn Store>>,
|
||||
miner_sender: Option<broadcast::Sender<MinerMessage>>,
|
||||
) -> Result<()> {
|
||||
) -> Result<mpsc::UnboundedReceiver<PrunerMessage>> {
|
||||
if let Some(shard_config) = get_shard_config(&store).await? {
|
||||
config.shard_config = shard_config;
|
||||
}
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let pruner = Pruner {
|
||||
config,
|
||||
store,
|
||||
sender: tx,
|
||||
miner_sender,
|
||||
};
|
||||
pruner.put_shard_config().await?;
|
||||
@ -59,7 +62,7 @@ impl Pruner {
|
||||
},
|
||||
"pruner",
|
||||
);
|
||||
Ok(())
|
||||
Ok(rx)
|
||||
}
|
||||
|
||||
pub async fn start(mut self) -> Result<()> {
|
||||
@ -119,6 +122,8 @@ impl Pruner {
|
||||
if let Some(sender) = &self.miner_sender {
|
||||
sender.send(MinerMessage::SetShardConfig(self.config.shard_config))?;
|
||||
}
|
||||
self.sender
|
||||
.send(PrunerMessage::ChangeShardConfig(self.config.shard_config))?;
|
||||
let mut store = self.store.write().await;
|
||||
store
|
||||
.flow_mut()
|
||||
@ -130,3 +135,8 @@ impl Pruner {
|
||||
async fn get_shard_config(store: &RwLock<dyn Store>) -> Result<Option<ShardConfig>> {
|
||||
store.read().await.get_config_decoded(&SHARD_CONFIG_KEY)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum PrunerMessage {
|
||||
ChangeShardConfig(ShardConfig),
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ storage = { path = "../storage" }
|
||||
storage-async = { path = "../storage-async" }
|
||||
sync = { path = "../sync" }
|
||||
task_executor = { path = "../../common/task_executor" }
|
||||
pruner = { path = "../pruner" }
|
||||
tokio = { version = "1.19.2", features = ["full"] }
|
||||
tracing = "0.1.35"
|
||||
rand = "0.8.5"
|
||||
|
@ -1,6 +1,7 @@
|
||||
use std::{ops::Neg, sync::Arc};
|
||||
|
||||
use file_location_cache::FileLocationCache;
|
||||
use network::types::{AnnounceShardConfig, SignedAnnounceShardConfig};
|
||||
use network::{
|
||||
rpc::StatusMessage,
|
||||
types::{
|
||||
@ -11,6 +12,7 @@ use network::{
|
||||
PublicKey, PubsubMessage, Request, RequestId, Response,
|
||||
};
|
||||
use shared_types::{bytes_to_chunks, timestamp_now, TxID};
|
||||
use storage::config::ShardConfig;
|
||||
use storage_async::Store;
|
||||
use sync::{SyncMessage, SyncSender};
|
||||
use tokio::sync::{mpsc, RwLock};
|
||||
@ -20,6 +22,7 @@ use crate::peer_manager::PeerManager;
|
||||
lazy_static::lazy_static! {
|
||||
pub static ref FIND_FILE_TIMEOUT: chrono::Duration = chrono::Duration::minutes(2);
|
||||
pub static ref ANNOUNCE_FILE_TIMEOUT: chrono::Duration = chrono::Duration::minutes(2);
|
||||
pub static ref ANNOUNCE_SHARD_CONFIG_TIMEOUT: chrono::Duration = chrono::Duration::minutes(2);
|
||||
pub static ref TOLERABLE_DRIFT: chrono::Duration = chrono::Duration::seconds(5);
|
||||
}
|
||||
|
||||
@ -235,10 +238,13 @@ impl Libp2pEventHandler {
|
||||
PubsubMessage::FindChunks(msg) => self.on_find_chunks(msg).await,
|
||||
PubsubMessage::AnnounceFile(msg) => self.on_announce_file(propagation_source, msg),
|
||||
PubsubMessage::AnnounceChunks(msg) => self.on_announce_chunks(propagation_source, msg),
|
||||
PubsubMessage::AnnounceShardConfig(msg) => {
|
||||
self.on_announce_shard_config(propagation_source, msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn construct_announce_file_message(&self, tx_id: TxID) -> Option<PubsubMessage> {
|
||||
pub async fn construct_announce_file_message(&self, tx_id: TxID) -> Option<PubsubMessage> {
|
||||
let peer_id = *self.network_globals.peer_id.read();
|
||||
|
||||
let addr = match self.network_globals.listen_multiaddrs.read().first() {
|
||||
@ -250,9 +256,18 @@ impl Libp2pEventHandler {
|
||||
};
|
||||
|
||||
let timestamp = timestamp_now();
|
||||
let shard_config = self
|
||||
.store
|
||||
.get_store()
|
||||
.read()
|
||||
.await
|
||||
.flow()
|
||||
.get_shard_config();
|
||||
|
||||
let msg = AnnounceFile {
|
||||
tx_id,
|
||||
num_shard: shard_config.num_shard,
|
||||
shard_id: shard_config.shard_id,
|
||||
peer_id: peer_id.into(),
|
||||
at: addr.into(),
|
||||
timestamp,
|
||||
@ -271,6 +286,41 @@ impl Libp2pEventHandler {
|
||||
Some(PubsubMessage::AnnounceFile(signed))
|
||||
}
|
||||
|
||||
pub async fn construct_announce_shard_config_message(
|
||||
&self,
|
||||
shard_config: ShardConfig,
|
||||
) -> Option<PubsubMessage> {
|
||||
let peer_id = *self.network_globals.peer_id.read();
|
||||
let addr = match self.network_globals.listen_multiaddrs.read().first() {
|
||||
Some(addr) => addr.clone(),
|
||||
None => {
|
||||
error!("No listen address available");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let timestamp = timestamp_now();
|
||||
|
||||
let msg = AnnounceShardConfig {
|
||||
num_shard: shard_config.num_shard,
|
||||
shard_id: shard_config.shard_id,
|
||||
peer_id: peer_id.into(),
|
||||
at: addr.into(),
|
||||
timestamp,
|
||||
};
|
||||
|
||||
let mut signed = match SignedMessage::sign_message(msg, &self.local_keypair) {
|
||||
Ok(signed) => signed,
|
||||
Err(e) => {
|
||||
error!(%e, "Failed to sign AnnounceShardConfig message");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
signed.resend_timestamp = timestamp;
|
||||
|
||||
Some(PubsubMessage::AnnounceShardConfig(signed))
|
||||
}
|
||||
|
||||
async fn on_find_file(&self, msg: FindFile) -> MessageAcceptance {
|
||||
let FindFile { tx_id, timestamp } = msg;
|
||||
|
||||
@ -287,7 +337,7 @@ impl Libp2pEventHandler {
|
||||
if tx.id() == tx_id {
|
||||
debug!(?tx_id, "Found file locally, responding to FindFile query");
|
||||
|
||||
return match self.construct_announce_file_message(tx_id) {
|
||||
return match self.construct_announce_file_message(tx_id).await {
|
||||
Some(msg) => {
|
||||
self.publish(msg);
|
||||
MessageAcceptance::Ignore
|
||||
@ -436,6 +486,41 @@ impl Libp2pEventHandler {
|
||||
MessageAcceptance::Accept
|
||||
}
|
||||
|
||||
fn on_announce_shard_config(
|
||||
&self,
|
||||
propagation_source: PeerId,
|
||||
msg: SignedAnnounceShardConfig,
|
||||
) -> MessageAcceptance {
|
||||
// verify message signature
|
||||
if !verify_signature(&msg, &msg.peer_id, propagation_source) {
|
||||
return MessageAcceptance::Reject;
|
||||
}
|
||||
|
||||
// propagate gossip to peers
|
||||
let d = duration_since(msg.resend_timestamp);
|
||||
if d < TOLERABLE_DRIFT.neg() || d > *ANNOUNCE_SHARD_CONFIG_TIMEOUT {
|
||||
debug!(%msg.resend_timestamp, "Invalid resend timestamp, ignoring AnnounceShardConfig message");
|
||||
return MessageAcceptance::Ignore;
|
||||
}
|
||||
|
||||
let shard_config = ShardConfig {
|
||||
shard_id: msg.shard_id,
|
||||
num_shard: msg.num_shard,
|
||||
};
|
||||
// notify sync layer
|
||||
self.send_to_sync(SyncMessage::AnnounceShardConfig {
|
||||
shard_config,
|
||||
peer_id: msg.peer_id.clone().into(),
|
||||
addr: msg.at.clone().into(),
|
||||
});
|
||||
|
||||
// insert message to cache
|
||||
self.file_location_cache
|
||||
.insert_peer_config(msg.peer_id.clone().into(), shard_config);
|
||||
|
||||
MessageAcceptance::Accept
|
||||
}
|
||||
|
||||
fn on_announce_chunks(
|
||||
&self,
|
||||
propagation_source: PeerId,
|
||||
@ -855,7 +940,11 @@ mod tests {
|
||||
let tx_id = TxID::random_hash(412);
|
||||
|
||||
// change signed message
|
||||
let message = match handler.construct_announce_file_message(tx_id).unwrap() {
|
||||
let message = match handler
|
||||
.construct_announce_file_message(tx_id)
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
PubsubMessage::AnnounceFile(mut file) => {
|
||||
let malicious_addr: Multiaddr = "/ip4/127.0.0.38/tcp/30000".parse().unwrap();
|
||||
file.inner.at = malicious_addr.into();
|
||||
@ -878,7 +967,7 @@ mod tests {
|
||||
let (alice, bob) = (PeerId::random(), PeerId::random());
|
||||
let id = MessageId::new(b"dummy message");
|
||||
let tx = TxID::random_hash(412);
|
||||
let message = handler.construct_announce_file_message(tx).unwrap();
|
||||
let message = handler.construct_announce_file_message(tx).await.unwrap();
|
||||
|
||||
// succeeded to handle
|
||||
let result = handler.on_pubsub_message(alice, bob, &id, message).await;
|
||||
|
@ -7,6 +7,7 @@ use network::{
|
||||
BehaviourEvent, Keypair, Libp2pEvent, NetworkGlobals, NetworkMessage, RequestId,
|
||||
Service as LibP2PService, Swarm,
|
||||
};
|
||||
use pruner::PrunerMessage;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage::log_store::Store as LogStore;
|
||||
@ -29,6 +30,9 @@ pub struct RouterService {
|
||||
/// The receiver channel for Zgs to communicate with the network service.
|
||||
network_recv: mpsc::UnboundedReceiver<NetworkMessage>,
|
||||
|
||||
/// The receiver channel for Zgs to communicate with the pruner service.
|
||||
pruner_recv: Option<mpsc::UnboundedReceiver<PrunerMessage>>,
|
||||
|
||||
/// All connected peers.
|
||||
peers: Arc<RwLock<PeerManager>>,
|
||||
|
||||
@ -50,6 +54,7 @@ impl RouterService {
|
||||
network_send: mpsc::UnboundedSender<NetworkMessage>,
|
||||
sync_send: SyncSender,
|
||||
_miner_send: Option<broadcast::Sender<MinerMessage>>,
|
||||
pruner_recv: Option<mpsc::UnboundedReceiver<PrunerMessage>>,
|
||||
store: Arc<RwLock<dyn LogStore>>,
|
||||
file_location_cache: Arc<FileLocationCache>,
|
||||
local_keypair: Keypair,
|
||||
@ -64,6 +69,7 @@ impl RouterService {
|
||||
libp2p,
|
||||
network_globals: network_globals.clone(),
|
||||
network_recv,
|
||||
pruner_recv,
|
||||
peers: peers.clone(),
|
||||
libp2p_event_handler: Libp2pEventHandler::new(
|
||||
network_globals,
|
||||
@ -94,12 +100,21 @@ impl RouterService {
|
||||
// handle event coming from the network
|
||||
event = self.libp2p.next_event() => self.on_libp2p_event(event, &mut shutdown_sender).await,
|
||||
|
||||
Some(msg) = Self::try_recv(&mut self.pruner_recv) => self.on_pruner_msg(msg).await,
|
||||
|
||||
// heartbeat
|
||||
_ = heartbeat.tick() => self.on_heartbeat().await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn try_recv<T>(maybe_recv: &mut Option<mpsc::UnboundedReceiver<T>>) -> Option<T> {
|
||||
match maybe_recv {
|
||||
None => None,
|
||||
Some(recv) => recv.recv().await,
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle an event received from the network.
|
||||
async fn on_libp2p_event(
|
||||
&mut self,
|
||||
@ -286,6 +301,7 @@ impl RouterService {
|
||||
if let Some(msg) = self
|
||||
.libp2p_event_handler
|
||||
.construct_announce_file_message(tx_id)
|
||||
.await
|
||||
{
|
||||
self.libp2p_event_handler.publish(msg);
|
||||
}
|
||||
@ -322,6 +338,20 @@ impl RouterService {
|
||||
}
|
||||
}
|
||||
|
||||
async fn on_pruner_msg(&mut self, msg: PrunerMessage) {
|
||||
match msg {
|
||||
PrunerMessage::ChangeShardConfig(shard_config) => {
|
||||
if let Some(msg) = self
|
||||
.libp2p_event_handler
|
||||
.construct_announce_shard_config_message(shard_config)
|
||||
.await
|
||||
{
|
||||
self.libp2p_event_handler.publish(msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn on_heartbeat(&mut self) {
|
||||
let expired_peers = self.peers.write().await.expired_peers();
|
||||
|
||||
|
@ -7,8 +7,7 @@ use jsonrpsee::core::async_trait;
|
||||
use jsonrpsee::core::RpcResult;
|
||||
use shared_types::{DataRoot, Transaction, CHUNK_SIZE};
|
||||
use std::fmt::{Debug, Formatter, Result};
|
||||
use storage::config::{ShardConfig, SHARD_CONFIG_KEY};
|
||||
use storage::log_store::config::ConfigurableExt;
|
||||
use storage::config::ShardConfig;
|
||||
use storage::try_option;
|
||||
|
||||
pub struct RpcServerImpl {
|
||||
@ -163,11 +162,8 @@ impl RpcServer for RpcServerImpl {
|
||||
.get_store()
|
||||
.read()
|
||||
.await
|
||||
.get_config_decoded(&SHARD_CONFIG_KEY)?
|
||||
.ok_or(error::invalid_params(
|
||||
"shard_config",
|
||||
"shard_config is unavailable",
|
||||
))?;
|
||||
.flow()
|
||||
.get_shard_config();
|
||||
Ok(shard_config)
|
||||
}
|
||||
}
|
||||
|
@ -132,6 +132,10 @@ impl Transaction {
|
||||
hash: self.hash(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_entry_index(&self) -> u64 {
|
||||
self.start_entry_index
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ChunkWithProof {
|
||||
|
@ -7,7 +7,7 @@ use network::{
|
||||
self, Keypair, NetworkConfig, NetworkGlobals, NetworkMessage, RequestId,
|
||||
Service as LibP2PService,
|
||||
};
|
||||
use pruner::{Pruner, PrunerConfig};
|
||||
use pruner::{Pruner, PrunerConfig, PrunerMessage};
|
||||
use router::RouterService;
|
||||
use rpc::RPCConfig;
|
||||
use std::sync::Arc;
|
||||
@ -50,7 +50,10 @@ struct LogSyncComponents {
|
||||
send: broadcast::Sender<LogSyncEvent>,
|
||||
}
|
||||
|
||||
struct PrunerComponents {}
|
||||
struct PrunerComponents {
|
||||
// note: these will be owned by the router service
|
||||
owned: Option<mpsc::UnboundedReceiver<PrunerMessage>>,
|
||||
}
|
||||
|
||||
/// Builds a `Client` instance.
|
||||
///
|
||||
@ -183,10 +186,10 @@ impl ClientBuilder {
|
||||
let miner_send = self.miner.as_ref().map(|miner| miner.send.clone());
|
||||
let store = require!("pruner", self, store).clone();
|
||||
let executor = require!("pruner", self, runtime_context).clone().executor;
|
||||
Pruner::spawn(executor, config, store, miner_send)
|
||||
let recv = Pruner::spawn(executor, config, store, miner_send)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
self.pruner = Some(PrunerComponents {});
|
||||
self.pruner = Some(PrunerComponents { owned: Some(recv) });
|
||||
}
|
||||
Ok(self)
|
||||
}
|
||||
@ -205,7 +208,7 @@ impl ClientBuilder {
|
||||
.owned
|
||||
.take() // router takes ownership of libp2p and network_recv
|
||||
.ok_or("router requires a network")?;
|
||||
|
||||
let pruner_recv = self.pruner.as_mut().and_then(|pruner| pruner.owned.take());
|
||||
RouterService::spawn(
|
||||
executor,
|
||||
libp2p,
|
||||
@ -214,6 +217,7 @@ impl ClientBuilder {
|
||||
network.send.clone(),
|
||||
sync_send,
|
||||
miner_send,
|
||||
pruner_recv,
|
||||
store,
|
||||
file_location_cache,
|
||||
network.keypair.clone(),
|
||||
|
@ -31,9 +31,9 @@ async fn start_node(context: RuntimeContext, config: ZgsConfig) -> Result<Client
|
||||
.await?
|
||||
.with_miner(miner_config)
|
||||
.await?
|
||||
.with_router(router_config)?
|
||||
.with_pruner(pruner_config)
|
||||
.await?
|
||||
.with_router(router_config)?
|
||||
.with_rpc(rpc_config, config.chunk_pool_config())
|
||||
.await?
|
||||
.build()
|
||||
|
@ -9,7 +9,7 @@ pub struct Config {
|
||||
pub db_dir: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Decode, Encode, Serialize, Deserialize)]
|
||||
#[derive(Clone, Copy, Debug, Decode, Encode, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct ShardConfig {
|
||||
pub shard_id: usize,
|
||||
|
@ -205,6 +205,10 @@ impl FlowRead for FlowStore {
|
||||
.map(|num_batches| num_batches * PORA_CHUNK_SIZE as u64)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn get_shard_config(&self) -> ShardConfig {
|
||||
self.config.shard_config
|
||||
}
|
||||
}
|
||||
|
||||
impl FlowWrite for FlowStore {
|
||||
@ -578,6 +582,21 @@ pub fn batch_iter(start: u64, end: u64, batch_size: usize) -> Vec<(u64, u64)> {
|
||||
list
|
||||
}
|
||||
|
||||
pub fn batch_iter_sharded(
|
||||
start: u64,
|
||||
end: u64,
|
||||
batch_size: usize,
|
||||
shard_config: ShardConfig,
|
||||
) -> Vec<(u64, u64)> {
|
||||
batch_iter(start, end, batch_size)
|
||||
.into_iter()
|
||||
.filter(|(start, _)| {
|
||||
(start / batch_size as u64) % shard_config.num_shard as u64
|
||||
== shard_config.shard_id as u64
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn try_decode_usize(data: &[u8]) -> Result<usize> {
|
||||
Ok(usize::from_be_bytes(
|
||||
data.try_into().map_err(|e| anyhow!("{:?}", e))?,
|
||||
|
@ -1,4 +1,4 @@
|
||||
use crate::log_store::flow_store::{batch_iter, FlowConfig, FlowStore};
|
||||
use crate::log_store::flow_store::{batch_iter_sharded, FlowConfig, FlowStore};
|
||||
use crate::log_store::tx_store::TransactionStore;
|
||||
use crate::log_store::{
|
||||
FlowRead, FlowWrite, LogStoreChunkRead, LogStoreChunkWrite, LogStoreRead, LogStoreWrite,
|
||||
@ -203,11 +203,7 @@ impl LogStoreWrite for LogManager {
|
||||
let tx_end_index = tx.start_entry_index + bytes_to_entries(tx.size);
|
||||
// TODO: Check completeness without loading all data in memory.
|
||||
// TODO: Should we double check the tx merkle root?
|
||||
if self
|
||||
.flow_store
|
||||
.get_entries(tx.start_entry_index, tx_end_index)?
|
||||
.is_some()
|
||||
{
|
||||
if self.check_data_completed(tx.start_entry_index, tx_end_index)? {
|
||||
let same_root_seq_list = self
|
||||
.tx_store
|
||||
.get_tx_seq_list_by_data_root(&tx.data_merkle_root)?;
|
||||
@ -239,14 +235,10 @@ impl LogStoreWrite for LogManager {
|
||||
|
||||
self.padding_rear_data(&tx)?;
|
||||
|
||||
let tx_end_index = tx.start_entry_index + bytes_to_entries(tx.size);
|
||||
// TODO: Check completeness without loading all data in memory.
|
||||
// TODO: Should we double check the tx merkle root?
|
||||
if self
|
||||
.flow_store
|
||||
.get_entries(tx.start_entry_index, tx_end_index)?
|
||||
.is_some()
|
||||
{
|
||||
let tx_end_index = tx.start_entry_index + bytes_to_entries(tx.size);
|
||||
if self.check_data_completed(tx.start_entry_index, tx_end_index)? {
|
||||
self.tx_store.finalize_tx(tx_seq)?;
|
||||
let same_root_seq_list = self
|
||||
.tx_store
|
||||
@ -257,7 +249,7 @@ impl LogStoreWrite for LogManager {
|
||||
}
|
||||
Ok(true)
|
||||
} else {
|
||||
bail!("finalize tx with data missing: tx_seq={}", tx_seq)
|
||||
bail!("finalize tx hash with data missing: tx_seq={}", tx_seq)
|
||||
}
|
||||
}
|
||||
|
||||
@ -988,10 +980,11 @@ impl LogManager {
|
||||
}
|
||||
// copy data in batches
|
||||
// TODO(zz): Do this asynchronously and keep atomicity.
|
||||
for (batch_start, batch_end) in batch_iter(
|
||||
for (batch_start, batch_end) in batch_iter_sharded(
|
||||
old_tx.start_entry_index,
|
||||
old_tx.start_entry_index + old_tx.num_entries() as u64,
|
||||
PORA_CHUNK_SIZE,
|
||||
self.flow_store.get_shard_config(),
|
||||
) {
|
||||
let batch_data = self
|
||||
.get_chunk_by_flow_index(batch_start, batch_end - batch_start)?
|
||||
@ -1029,6 +1022,24 @@ impl LogManager {
|
||||
self.flow_store
|
||||
.insert_subtree_list_for_batch(index, to_insert_subtrees)
|
||||
}
|
||||
|
||||
fn check_data_completed(&self, start: u64, end: u64) -> Result<bool> {
|
||||
for (batch_start, batch_end) in batch_iter_sharded(
|
||||
start,
|
||||
end,
|
||||
PORA_CHUNK_SIZE,
|
||||
self.flow_store.get_shard_config(),
|
||||
) {
|
||||
if self
|
||||
.flow_store
|
||||
.get_entries(batch_start, batch_end)?
|
||||
.is_none()
|
||||
{
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
/// This represents the subtree of a chunk or the whole data merkle tree.
|
||||
|
@ -198,6 +198,8 @@ pub trait FlowRead {
|
||||
|
||||
// An estimation of the number of entries in the flow db.
|
||||
fn get_num_entries(&self) -> Result<u64>;
|
||||
|
||||
fn get_shard_config(&self) -> ShardConfig;
|
||||
}
|
||||
|
||||
pub trait FlowWrite {
|
||||
|
@ -1,7 +1,10 @@
|
||||
use network::{Multiaddr, PeerId};
|
||||
use rand::seq::IteratorRandom;
|
||||
use std::collections::HashMap;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::vec;
|
||||
use storage::config::ShardConfig;
|
||||
|
||||
const PEER_CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
const PEER_DISCONNECT_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
@ -15,6 +18,7 @@ pub enum PeerState {
|
||||
Disconnected,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PeerInfo {
|
||||
/// The reported/connected address of the peer.
|
||||
pub addr: Multiaddr,
|
||||
@ -22,6 +26,8 @@ struct PeerInfo {
|
||||
/// The current state of the peer.
|
||||
pub state: PeerState,
|
||||
|
||||
pub shard_config: ShardConfig,
|
||||
|
||||
/// Timestamp of the last state change.
|
||||
pub since: Instant,
|
||||
}
|
||||
@ -33,15 +39,22 @@ impl PeerInfo {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Debug)]
|
||||
pub struct SyncPeers {
|
||||
peers: HashMap<PeerId, PeerInfo>,
|
||||
}
|
||||
|
||||
impl SyncPeers {
|
||||
pub fn add_new_peer(&mut self, peer_id: PeerId, addr: Multiaddr) -> bool {
|
||||
if self.peers.contains_key(&peer_id) {
|
||||
return false;
|
||||
pub fn add_new_peer_with_config(
|
||||
&mut self,
|
||||
peer_id: PeerId,
|
||||
addr: Multiaddr,
|
||||
shard_config: ShardConfig,
|
||||
) -> bool {
|
||||
if let Some(info) = self.peers.get(&peer_id) {
|
||||
if info.shard_config == shard_config {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
self.peers.insert(
|
||||
@ -49,6 +62,7 @@ impl SyncPeers {
|
||||
PeerInfo {
|
||||
addr,
|
||||
state: PeerState::Found,
|
||||
shard_config,
|
||||
since: Instant::now(),
|
||||
},
|
||||
);
|
||||
@ -56,6 +70,11 @@ impl SyncPeers {
|
||||
true
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn add_new_peer(&mut self, peer_id: PeerId, addr: Multiaddr) -> bool {
|
||||
self.add_new_peer_with_config(peer_id, addr, Default::default())
|
||||
}
|
||||
|
||||
pub fn update_state(
|
||||
&mut self,
|
||||
peer_id: &PeerId,
|
||||
@ -83,6 +102,10 @@ impl SyncPeers {
|
||||
self.peers.get(peer_id).map(|info| info.state)
|
||||
}
|
||||
|
||||
pub fn shard_config(&self, peer_id: &PeerId) -> Option<ShardConfig> {
|
||||
self.peers.get(peer_id).map(|info| info.shard_config)
|
||||
}
|
||||
|
||||
pub fn random_peer(&self, state: PeerState) -> Option<(PeerId, Multiaddr)> {
|
||||
self.peers
|
||||
.iter()
|
||||
@ -91,6 +114,19 @@ impl SyncPeers {
|
||||
.choose(&mut rand::thread_rng())
|
||||
}
|
||||
|
||||
pub fn filter_peers(&self, state: Vec<PeerState>) -> Vec<PeerId> {
|
||||
self.peers
|
||||
.iter()
|
||||
.filter_map(|(peer_id, info)| {
|
||||
if state.contains(&info.state) {
|
||||
Some(*peer_id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn count(&self, states: &[PeerState]) -> usize {
|
||||
self.peers
|
||||
.values()
|
||||
@ -98,6 +134,42 @@ impl SyncPeers {
|
||||
.count()
|
||||
}
|
||||
|
||||
pub fn all_shards_available(&self, state: Vec<PeerState>) -> bool {
|
||||
let mut missing_shards = BTreeSet::new();
|
||||
missing_shards.insert(0);
|
||||
let mut num_shards = 1usize;
|
||||
for peer_id in &self.filter_peers(state) {
|
||||
let shard_config = self.peers.get(peer_id).unwrap().shard_config;
|
||||
match shard_config.num_shard.cmp(&num_shards) {
|
||||
Ordering::Equal => {
|
||||
missing_shards.remove(&shard_config.shard_id);
|
||||
}
|
||||
Ordering::Less => {
|
||||
let multi = num_shards / shard_config.num_shard;
|
||||
for i in 0..multi {
|
||||
let shard_id = shard_config.shard_id + i * shard_config.num_shard;
|
||||
missing_shards.remove(&shard_id);
|
||||
}
|
||||
}
|
||||
Ordering::Greater => {
|
||||
let multi = shard_config.num_shard / num_shards;
|
||||
let mut new_missing_shards = BTreeSet::new();
|
||||
for shard_id in &missing_shards {
|
||||
for i in 0..multi {
|
||||
new_missing_shards.insert(*shard_id + i * num_shards);
|
||||
}
|
||||
}
|
||||
new_missing_shards.remove(&shard_config.shard_id);
|
||||
|
||||
missing_shards = new_missing_shards;
|
||||
num_shards = shard_config.num_shard;
|
||||
}
|
||||
}
|
||||
}
|
||||
trace!("all_shards_available: {} {:?}", num_shards, missing_shards);
|
||||
missing_shards.is_empty()
|
||||
}
|
||||
|
||||
pub fn transition(&mut self) {
|
||||
let mut bad_peers = vec![];
|
||||
|
||||
|
@ -13,6 +13,7 @@ use std::{
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use storage::log_store::log_manager::PORA_CHUNK_SIZE;
|
||||
use storage_async::Store;
|
||||
|
||||
pub const MAX_CHUNKS_TO_REQUEST: u64 = 2 * 1024;
|
||||
@ -62,6 +63,8 @@ pub struct SerialSyncController {
|
||||
/// The unique transaction ID.
|
||||
tx_id: TxID,
|
||||
|
||||
tx_start_chunk_in_flow: u64,
|
||||
|
||||
since: Instant,
|
||||
|
||||
/// File sync goal.
|
||||
@ -92,6 +95,7 @@ pub struct SerialSyncController {
|
||||
impl SerialSyncController {
|
||||
pub fn new(
|
||||
tx_id: TxID,
|
||||
tx_start_chunk_in_flow: u64,
|
||||
goal: FileSyncGoal,
|
||||
ctx: Arc<SyncNetworkContext>,
|
||||
store: Store,
|
||||
@ -100,6 +104,7 @@ impl SerialSyncController {
|
||||
SerialSyncController {
|
||||
tx_seq: tx_id.seq,
|
||||
tx_id,
|
||||
tx_start_chunk_in_flow,
|
||||
since: Instant::now(),
|
||||
goal,
|
||||
next_chunk: goal.index_start,
|
||||
@ -183,11 +188,16 @@ impl SerialSyncController {
|
||||
let peer_id: PeerId = announcement.peer_id.clone().into();
|
||||
let mut addr: Multiaddr = announcement.at.clone().into();
|
||||
addr.push(Protocol::P2p(peer_id.into()));
|
||||
|
||||
found_new_peer = self.on_peer_found(peer_id, addr) || found_new_peer;
|
||||
}
|
||||
|
||||
if found_new_peer {
|
||||
if found_new_peer
|
||||
&& self.peers.all_shards_available(vec![
|
||||
PeerState::Found,
|
||||
PeerState::Connecting,
|
||||
PeerState::Connected,
|
||||
])
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
@ -208,30 +218,53 @@ impl SerialSyncController {
|
||||
|
||||
fn try_connect(&mut self) {
|
||||
// select a random peer
|
||||
let (peer_id, address) = match self.peers.random_peer(PeerState::Found) {
|
||||
Some((peer_id, address)) => (peer_id, address),
|
||||
None => {
|
||||
// peer may be disconnected by remote node and need to find peers again
|
||||
warn!(%self.tx_seq, "No peers available to connect");
|
||||
self.state = SyncState::Idle;
|
||||
return;
|
||||
}
|
||||
};
|
||||
while !self
|
||||
.peers
|
||||
.all_shards_available(vec![PeerState::Connecting, PeerState::Connected])
|
||||
{
|
||||
let (peer_id, address) = match self.peers.random_peer(PeerState::Found) {
|
||||
Some((peer_id, address)) => (peer_id, address),
|
||||
None => {
|
||||
// peer may be disconnected by remote node and need to find peers again
|
||||
warn!(%self.tx_seq, "No peers available to connect");
|
||||
self.state = SyncState::Idle;
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// connect to peer
|
||||
info!(%peer_id, %address, "Attempting to connect to peer");
|
||||
self.ctx.send(NetworkMessage::DialPeer { address, peer_id });
|
||||
|
||||
self.peers
|
||||
.update_state(&peer_id, PeerState::Found, PeerState::Connecting);
|
||||
// connect to peer
|
||||
info!(%peer_id, %address, "Attempting to connect to peer");
|
||||
self.ctx.send(NetworkMessage::DialPeer { address, peer_id });
|
||||
|
||||
self.peers
|
||||
.update_state(&peer_id, PeerState::Found, PeerState::Connecting);
|
||||
}
|
||||
self.state = SyncState::ConnectingPeers;
|
||||
}
|
||||
|
||||
fn try_request_next(&mut self) {
|
||||
// request next chunk array
|
||||
let from_chunk = self.next_chunk;
|
||||
// let to_chunk = std::cmp::min(from_chunk + MAX_CHUNKS_TO_REQUEST, self.goal.index_end);
|
||||
let to_chunk =
|
||||
if from_chunk == 0 && self.tx_start_chunk_in_flow % PORA_CHUNK_SIZE as u64 != 0 {
|
||||
// Align the first request with segments.
|
||||
PORA_CHUNK_SIZE as u64 - self.tx_start_chunk_in_flow % PORA_CHUNK_SIZE as u64
|
||||
} else {
|
||||
from_chunk + PORA_CHUNK_SIZE as u64
|
||||
};
|
||||
let to_chunk = std::cmp::min(to_chunk, self.goal.index_end);
|
||||
|
||||
let request_id = network::RequestId::Sync(RequestId::SerialSync { tx_id: self.tx_id });
|
||||
let request = GetChunksRequest {
|
||||
tx_id: self.tx_id,
|
||||
index_start: from_chunk,
|
||||
index_end: to_chunk,
|
||||
};
|
||||
|
||||
// select a random peer
|
||||
let peer_id = match self.peers.random_peer(PeerState::Connected) {
|
||||
Some((peer_id, _)) => peer_id,
|
||||
let peer_id = match self.select_peer_for_request(&request) {
|
||||
Some(peer_id) => peer_id,
|
||||
None => {
|
||||
warn!(%self.tx_seq, "No peers available to request chunks");
|
||||
self.state = SyncState::Idle;
|
||||
@ -239,24 +272,11 @@ impl SerialSyncController {
|
||||
}
|
||||
};
|
||||
|
||||
// request next chunk array
|
||||
let from_chunk = self.next_chunk;
|
||||
let to_chunk = std::cmp::min(from_chunk + MAX_CHUNKS_TO_REQUEST, self.goal.index_end);
|
||||
|
||||
let request_id = network::RequestId::Sync(RequestId::SerialSync { tx_id: self.tx_id });
|
||||
|
||||
let request = network::Request::GetChunks(GetChunksRequest {
|
||||
tx_id: self.tx_id,
|
||||
index_start: from_chunk,
|
||||
index_end: to_chunk,
|
||||
});
|
||||
|
||||
self.ctx.send(NetworkMessage::SendRequest {
|
||||
peer_id,
|
||||
request_id,
|
||||
request,
|
||||
request: network::Request::GetChunks(request),
|
||||
});
|
||||
|
||||
self.state = SyncState::Downloading {
|
||||
peer_id,
|
||||
from_chunk,
|
||||
@ -273,12 +293,20 @@ impl SerialSyncController {
|
||||
}
|
||||
|
||||
pub fn on_peer_found(&mut self, peer_id: PeerId, addr: Multiaddr) -> bool {
|
||||
if self.peers.add_new_peer(peer_id, addr.clone()) {
|
||||
info!(%self.tx_seq, %peer_id, %addr, "Found new peer");
|
||||
true
|
||||
if let Some(shard_config) = self.file_location_cache.get_peer_config(&peer_id) {
|
||||
if self
|
||||
.peers
|
||||
.add_new_peer_with_config(peer_id, addr.clone(), shard_config)
|
||||
{
|
||||
info!(%self.tx_seq, %peer_id, %addr, "Found new peer");
|
||||
true
|
||||
} else {
|
||||
// e.g. multiple `AnnounceFile` messages propagated
|
||||
debug!(%self.tx_seq, %peer_id, %addr, "Found an existing peer");
|
||||
false
|
||||
}
|
||||
} else {
|
||||
// e.g. multiple `AnnounceFile` messages propagated
|
||||
debug!(%self.tx_seq, %peer_id, %addr, "Found an existing peer");
|
||||
debug!(%self.tx_seq, %peer_id, %addr, "No shard config found");
|
||||
false
|
||||
}
|
||||
}
|
||||
@ -515,6 +543,21 @@ impl SerialSyncController {
|
||||
}
|
||||
}
|
||||
|
||||
fn select_peer_for_request(&self, request: &GetChunksRequest) -> Option<PeerId> {
|
||||
let segment_index =
|
||||
(request.index_start + self.tx_start_chunk_in_flow) / PORA_CHUNK_SIZE as u64;
|
||||
let peers = self.peers.filter_peers(vec![PeerState::Connected]);
|
||||
// TODO: Add randomness
|
||||
for peer in peers {
|
||||
if let Some(shard_config) = self.peers.shard_config(&peer) {
|
||||
if shard_config.in_range(segment_index) {
|
||||
return Some(peer);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn transition(&mut self) {
|
||||
use PeerState::*;
|
||||
|
||||
@ -548,7 +591,7 @@ impl SerialSyncController {
|
||||
}
|
||||
|
||||
SyncState::FoundPeers => {
|
||||
if self.peers.count(&[Connecting, Connected]) > 0 {
|
||||
if self.peers.all_shards_available(vec![Connecting, Connected]) {
|
||||
self.state = SyncState::ConnectingPeers;
|
||||
} else {
|
||||
self.try_connect();
|
||||
@ -556,7 +599,7 @@ impl SerialSyncController {
|
||||
}
|
||||
|
||||
SyncState::ConnectingPeers => {
|
||||
if self.peers.count(&[Connected]) > 0 {
|
||||
if self.peers.all_shards_available(vec![Connected]) {
|
||||
self.state = SyncState::AwaitingDownload {
|
||||
since: Instant::now(),
|
||||
};
|
||||
@ -1371,8 +1414,8 @@ mod tests {
|
||||
SyncState::Failed { reason } => {
|
||||
assert!(matches!(reason, FailureReason::DBError(..)));
|
||||
}
|
||||
_ => {
|
||||
panic!("Not expected SyncState");
|
||||
state => {
|
||||
panic!("Not expected SyncState, {:?}", state);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1546,6 +1589,7 @@ mod tests {
|
||||
|
||||
let controller = SerialSyncController::new(
|
||||
tx_id,
|
||||
0,
|
||||
FileSyncGoal::new_file(num_chunks as u64),
|
||||
ctx,
|
||||
Store::new(store, task_executor),
|
||||
|
@ -19,6 +19,7 @@ use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
sync::Arc,
|
||||
};
|
||||
use storage::config::ShardConfig;
|
||||
use storage::error::Result as StorageResult;
|
||||
use storage::log_store::Store as LogStore;
|
||||
use storage_async::Store;
|
||||
@ -60,6 +61,11 @@ pub enum SyncMessage {
|
||||
peer_id: PeerId,
|
||||
addr: Multiaddr,
|
||||
},
|
||||
AnnounceShardConfig {
|
||||
shard_config: ShardConfig,
|
||||
peer_id: PeerId,
|
||||
addr: Multiaddr,
|
||||
},
|
||||
AnnounceChunksGossip {
|
||||
msg: AnnounceChunks,
|
||||
},
|
||||
@ -240,6 +246,9 @@ impl SyncService {
|
||||
}
|
||||
|
||||
SyncMessage::AnnounceChunksGossip { msg } => self.on_announce_chunks_gossip(msg).await,
|
||||
SyncMessage::AnnounceShardConfig { .. } => {
|
||||
// FIXME: Check if controllers need to be reset?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -566,6 +575,7 @@ impl SyncService {
|
||||
|
||||
entry.insert(SerialSyncController::new(
|
||||
tx.id(),
|
||||
tx.start_entry_index(),
|
||||
FileSyncGoal::new(num_chunks, index_start, index_end),
|
||||
self.ctx.clone(),
|
||||
self.store.clone(),
|
||||
@ -728,7 +738,7 @@ mod tests {
|
||||
|
||||
impl Default for TestSyncRuntime {
|
||||
fn default() -> Self {
|
||||
TestSyncRuntime::new(vec![1535], 1)
|
||||
TestSyncRuntime::new(vec![1023], 1)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1277,18 +1287,22 @@ mod tests {
|
||||
test_sync_file(1).await;
|
||||
test_sync_file(511).await;
|
||||
test_sync_file(512).await;
|
||||
test_sync_file(513).await;
|
||||
test_sync_file(514).await;
|
||||
test_sync_file(1023).await;
|
||||
test_sync_file(1024).await;
|
||||
test_sync_file(1025).await;
|
||||
test_sync_file(2047).await;
|
||||
test_sync_file(2048).await;
|
||||
|
||||
// TODO: Ignore for alignment with tx_start_chunk_in_flow.
|
||||
// test_sync_file(513).await;
|
||||
// test_sync_file(514).await;
|
||||
// test_sync_file(1023).await;
|
||||
// test_sync_file(1024).await;
|
||||
|
||||
// TODO: Ignore for max chunks to request in sync.
|
||||
// test_sync_file(1025).await;
|
||||
// test_sync_file(2047).await;
|
||||
// test_sync_file(2048).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sync_file_exceed_max_chunks_to_request() {
|
||||
let mut runtime = TestSyncRuntime::new(vec![2049], 1);
|
||||
let mut runtime = TestSyncRuntime::new(vec![1025], 1);
|
||||
let sync_send = runtime.spawn_sync_service(false).await;
|
||||
|
||||
let tx_seq = 0u64;
|
||||
@ -1313,7 +1327,7 @@ mod tests {
|
||||
runtime.init_peer_id,
|
||||
tx_seq,
|
||||
0,
|
||||
2048,
|
||||
1024,
|
||||
)
|
||||
.await;
|
||||
|
||||
@ -1332,7 +1346,7 @@ mod tests {
|
||||
runtime.peer_store.clone(),
|
||||
runtime.init_peer_id,
|
||||
tx_seq,
|
||||
2048,
|
||||
1024,
|
||||
runtime.chunk_count as u64,
|
||||
)
|
||||
.await;
|
||||
@ -1342,7 +1356,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sync_file_multi_files() {
|
||||
let mut runtime = TestSyncRuntime::new(vec![1535, 1535, 1535], 3);
|
||||
let mut runtime = TestSyncRuntime::new(vec![1023, 1023, 1023], 3);
|
||||
let sync_send = runtime.spawn_sync_service(false).await;
|
||||
|
||||
// second file
|
||||
@ -1429,7 +1443,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_announce_file() {
|
||||
let mut runtime = TestSyncRuntime::new(vec![1535], 0);
|
||||
let mut runtime = TestSyncRuntime::new(vec![1023], 0);
|
||||
let mut config = Config::default();
|
||||
config.sync_file_on_announcement_enabled = true;
|
||||
let sync_send = runtime.spawn_sync_service_with_config(false, config).await;
|
||||
@ -1658,8 +1672,11 @@ mod tests {
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
_ => {
|
||||
panic!("Not expected message: NetworkMessage::SendRequest");
|
||||
msg => {
|
||||
panic!(
|
||||
"Not expected message: NetworkMessage::SendRequest, msg={:?}",
|
||||
msg
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -99,6 +99,7 @@ pub mod tests {
|
||||
use libp2p::PeerId;
|
||||
use shared_types::TxID;
|
||||
use std::sync::Arc;
|
||||
use storage::config::ShardConfig;
|
||||
use storage::{
|
||||
log_store::{log_manager::LogConfig, Store as LogStore},
|
||||
LogManager,
|
||||
@ -144,6 +145,7 @@ pub mod tests {
|
||||
.build();
|
||||
cache.insert(announcement);
|
||||
}
|
||||
cache.insert_peer_config(peer_id, ShardConfig::default());
|
||||
|
||||
Arc::new(cache)
|
||||
}
|
||||
|
@ -87,7 +87,7 @@ class CliSubmissionTest(TestFramework):
|
||||
wait_until(lambda: self.nodes[i].zgs_get_file_info(root) is not None)
|
||||
self.nodes[i].admin_start_sync_file(submission_index - 1)
|
||||
wait_until(
|
||||
lambda: self.nodes[i].sycn_status_is_completed_or_unknown(
|
||||
lambda: self.nodes[i].sync_status_is_completed_or_unknown(
|
||||
submission_index - 1
|
||||
)
|
||||
)
|
||||
|
@ -49,14 +49,14 @@ class MineTest(TestFramework):
|
||||
self.contract.update_context()
|
||||
|
||||
self.log.info("Wait for the first mine answer")
|
||||
wait_until(lambda: self.mine_contract.last_mined_epoch() == 1)
|
||||
wait_until(lambda: self.mine_contract.last_mined_epoch() == 1, timeout=180)
|
||||
|
||||
self.log.info("Wait for the second mine context release")
|
||||
wait_until(lambda: int(blockchain.eth_blockNumber(), 16) >= start_epoch + 2, timeout=180)
|
||||
self.contract.update_context()
|
||||
|
||||
self.log.info("Wait for the second mine answer")
|
||||
wait_until(lambda: self.mine_contract.last_mined_epoch() == 2)
|
||||
wait_until(lambda: self.mine_contract.last_mined_epoch() == 2, timeout=180)
|
||||
|
||||
self.nodes[0].miner_stop()
|
||||
self.log.info("Wait for the third mine context release")
|
||||
@ -69,7 +69,7 @@ class MineTest(TestFramework):
|
||||
self.nodes[0].miner_start()
|
||||
|
||||
self.log.info("Wait for the third mine answer")
|
||||
wait_until(lambda: self.mine_contract.last_mined_epoch() == 3)
|
||||
wait_until(lambda: self.mine_contract.last_mined_epoch() == 3, timeout=180)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -42,7 +42,7 @@ class RpcTest(TestFramework):
|
||||
)
|
||||
|
||||
client2.admin_start_sync_file(0)
|
||||
wait_until(lambda: client2.sycn_status_is_completed_or_unknown(0))
|
||||
wait_until(lambda: client2.sync_status_is_completed_or_unknown(0))
|
||||
|
||||
wait_until(lambda: client2.zgs_get_file_info(data_root)["finalized"])
|
||||
assert_equal(
|
||||
@ -84,7 +84,7 @@ class RpcTest(TestFramework):
|
||||
wait_until(lambda: self.nodes[i].zgs_get_file_info(root) is not None)
|
||||
self.nodes[i].admin_start_sync_file(n_files - 1)
|
||||
wait_until(
|
||||
lambda: self.nodes[i].sycn_status_is_completed_or_unknown(
|
||||
lambda: self.nodes[i].sync_status_is_completed_or_unknown(
|
||||
n_files - 1
|
||||
)
|
||||
)
|
||||
|
54
tests/shard_sync_test.py
Executable file
54
tests/shard_sync_test.py
Executable file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
import time
|
||||
|
||||
from test_framework.test_framework import TestFramework
|
||||
from utility.submission import create_submission, submit_data
|
||||
from utility.utils import wait_until, assert_equal
|
||||
|
||||
|
||||
class PrunerTest(TestFramework):
|
||||
|
||||
def setup_params(self):
|
||||
self.num_blockchain_nodes = 1
|
||||
self.num_nodes = 3
|
||||
self.zgs_node_configs[0] = {
|
||||
"db_max_num_chunks": 2 ** 30,
|
||||
"shard_position": "0/2"
|
||||
}
|
||||
self.zgs_node_configs[1] = {
|
||||
"db_max_num_chunks": 2 ** 30,
|
||||
"shard_position": "1/2"
|
||||
}
|
||||
|
||||
def run_test(self):
|
||||
client = self.nodes[0]
|
||||
|
||||
chunk_data = b"\x02" * 8 * 256 * 1024
|
||||
submissions, data_root = create_submission(chunk_data)
|
||||
self.contract.submit(submissions)
|
||||
wait_until(lambda: self.contract.num_submissions() == 1)
|
||||
wait_until(lambda: client.zgs_get_file_info(data_root) is not None)
|
||||
|
||||
# Submit data to two nodes with different shards.
|
||||
segment = submit_data(client, chunk_data)
|
||||
submit_data(self.nodes[1], chunk_data)
|
||||
|
||||
self.nodes[2].admin_start_sync_file(0)
|
||||
wait_until(lambda: self.nodes[2].sync_status_is_completed_or_unknown(0))
|
||||
wait_until(lambda: self.nodes[2].zgs_get_file_info(data_root)["finalized"])
|
||||
|
||||
for i in range(len(segment)):
|
||||
index_store = i % 2
|
||||
index_empty = 1 - i % 2
|
||||
seg0 = self.nodes[index_store].zgs_download_segment(data_root, i * 1024, (i + 1) * 1024)
|
||||
seg1 = self.nodes[index_empty].zgs_download_segment(data_root, i * 1024, (i + 1) * 1024)
|
||||
seg2 = self.nodes[2].zgs_download_segment(data_root, i * 1024, (i + 1) * 1024)
|
||||
# base64 encoding size
|
||||
assert_equal(len(seg0), 349528)
|
||||
assert_equal(seg1, None)
|
||||
# node 2 should save all data
|
||||
assert_equal(len(seg2), 349528)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
PrunerTest().main()
|
@ -84,7 +84,7 @@ class SubmissionTest(TestFramework):
|
||||
self.nodes[i].admin_start_sync_file(submission_index - 1)
|
||||
|
||||
wait_until(
|
||||
lambda: self.nodes[i].sycn_status_is_completed_or_unknown(
|
||||
lambda: self.nodes[i].sync_status_is_completed_or_unknown(
|
||||
submission_index - 1
|
||||
)
|
||||
)
|
||||
|
@ -49,7 +49,7 @@ class SyncTest(TestFramework):
|
||||
|
||||
# Trigger file sync by rpc
|
||||
assert(client2.admin_start_sync_file(0) is None)
|
||||
wait_until(lambda: client2.sycn_status_is_completed_or_unknown(0))
|
||||
wait_until(lambda: client2.sync_status_is_completed_or_unknown(0))
|
||||
wait_until(lambda: client2.zgs_get_file_info(data_root)["finalized"])
|
||||
|
||||
# Validate data
|
||||
@ -96,7 +96,7 @@ class SyncTest(TestFramework):
|
||||
|
||||
# Trigger chunks sync by rpc
|
||||
assert(client2.admin_start_sync_chunks(1, 1024, 2048) is None)
|
||||
wait_until(lambda: client2.sycn_status_is_completed_or_unknown(1))
|
||||
wait_until(lambda: client2.sync_status_is_completed_or_unknown(1))
|
||||
wait_until(lambda: client2.zgs_download_segment_decoded(data_root, 1024, 2048) is not None)
|
||||
|
||||
# Validate data
|
||||
|
@ -110,7 +110,7 @@ class ZgsNode(TestNode):
|
||||
def admin_get_sync_status(self, tx_seq):
|
||||
return self.rpc.admin_getSyncStatus([tx_seq])
|
||||
|
||||
def sycn_status_is_completed_or_unknown(self, tx_seq):
|
||||
def sync_status_is_completed_or_unknown(self, tx_seq):
|
||||
status = self.rpc.admin_getSyncStatus([tx_seq])
|
||||
return status == "Completed" or status == "unknown"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user