mirror of
https://github.com/0glabs/0g-storage-node.git
synced 2025-01-18 19:15:18 +00:00
Repeat log catch-up until it's close to the latest height. (#114)
* Repeat log catch-up until it's close to the latest height. * Add an oneshot channel to send catch up end complete. * Fix comments. * Fix tests. * Fix ut.
This commit is contained in:
parent
59d3a03487
commit
3e23e81ae6
@ -2,6 +2,7 @@ use crate::sync_manager::config::LogSyncConfig;
|
||||
use crate::sync_manager::data_cache::DataCache;
|
||||
use crate::sync_manager::log_entry_fetcher::{LogEntryFetcher, LogFetchProgress};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use ethereum_types::H256;
|
||||
use ethers::{prelude::Middleware, types::BlockNumber};
|
||||
use futures::FutureExt;
|
||||
use jsonrpsee::tracing::{debug, error, trace, warn};
|
||||
@ -15,10 +16,11 @@ use storage::log_store::{tx_store::BlockHashAndSubmissionIndex, Store};
|
||||
use task_executor::{ShutdownReason, TaskExecutor};
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::mpsc::UnboundedReceiver;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::sync::{oneshot, RwLock};
|
||||
|
||||
const RETRY_WAIT_MS: u64 = 500;
|
||||
const BROADCAST_CHANNEL_CAPACITY: usize = 16;
|
||||
const CATCH_UP_END_GAP: u64 = 10;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum LogSyncEvent {
|
||||
@ -49,7 +51,7 @@ impl LogSyncManager {
|
||||
config: LogSyncConfig,
|
||||
executor: TaskExecutor,
|
||||
store: Arc<dyn Store>,
|
||||
) -> Result<broadcast::Sender<LogSyncEvent>> {
|
||||
) -> Result<(broadcast::Sender<LogSyncEvent>, oneshot::Receiver<()>)> {
|
||||
let next_tx_seq = store.next_tx_seq();
|
||||
|
||||
let executor_clone = executor.clone();
|
||||
@ -57,6 +59,7 @@ impl LogSyncManager {
|
||||
|
||||
let (event_send, _) = broadcast::channel(BROADCAST_CHANNEL_CAPACITY);
|
||||
let event_send_cloned = event_send.clone();
|
||||
let (catch_up_end_sender, catch_up_end_receiver) = oneshot::channel();
|
||||
|
||||
// Spawn the task to sync log entries from the blockchain.
|
||||
executor.spawn(
|
||||
@ -96,69 +99,38 @@ impl LogSyncManager {
|
||||
block_hash_cache,
|
||||
};
|
||||
|
||||
let finalized_block = match log_sync_manager
|
||||
.log_fetcher
|
||||
.provider()
|
||||
.get_block(BlockNumber::Finalized)
|
||||
.await
|
||||
{
|
||||
Ok(Some(finalized_block)) => finalized_block,
|
||||
e => {
|
||||
warn!("unable to get finalized block: {:?}", e);
|
||||
log_sync_manager
|
||||
.log_fetcher
|
||||
.provider()
|
||||
.get_block(0)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow!("None for block 0"))?
|
||||
}
|
||||
};
|
||||
let finalized_block_number = finalized_block
|
||||
.number
|
||||
.ok_or_else(|| anyhow!("None block number for finalized block"))?
|
||||
.as_u64();
|
||||
|
||||
// Load previous progress from db and check if chain reorg happens after restart.
|
||||
// TODO(zz): Handle reorg instead of return.
|
||||
let mut need_handle_reorg = false;
|
||||
let (mut start_block_number, mut start_block_hash) =
|
||||
match log_sync_manager.store.get_sync_progress()? {
|
||||
// No previous progress, so just use config.
|
||||
None => {
|
||||
let block_number = log_sync_manager.config.start_block_number;
|
||||
let block_hash = log_sync_manager
|
||||
.log_fetcher
|
||||
.provider()
|
||||
.get_block(block_number)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow!("None for block {}", block_number))?
|
||||
.hash
|
||||
.ok_or_else(|| {
|
||||
anyhow!("None block hash for block {}", block_number)
|
||||
})?;
|
||||
let block_hash =
|
||||
log_sync_manager.get_block(block_number.into()).await?.1;
|
||||
(block_number, block_hash)
|
||||
}
|
||||
Some((block_number, block_hash)) => {
|
||||
if block_number <= finalized_block_number {
|
||||
let expect_block_hash = log_sync_manager
|
||||
.log_fetcher
|
||||
.provider()
|
||||
.get_block(block_number)
|
||||
.await?
|
||||
.ok_or_else(|| anyhow!("None for block {}", block_number))?
|
||||
.hash
|
||||
.ok_or_else(|| {
|
||||
anyhow!("None block hash for block {}", block_number)
|
||||
})?;
|
||||
Some((block_number, block_hash)) => (block_number, block_hash),
|
||||
};
|
||||
|
||||
if expect_block_hash != block_hash {
|
||||
need_handle_reorg = true;
|
||||
}
|
||||
}
|
||||
|
||||
(block_number, block_hash)
|
||||
let (mut finalized_block_number, mut finalized_block_hash) =
|
||||
match log_sync_manager.get_block(BlockNumber::Finalized).await {
|
||||
Ok(finalized) => finalized,
|
||||
Err(e) => {
|
||||
warn!(?e, "unable to get finalized block");
|
||||
log_sync_manager.get_block(0.into()).await?
|
||||
}
|
||||
};
|
||||
|
||||
// Load previous progress from db and check if chain reorg happens after restart.
|
||||
let mut need_handle_reorg = false;
|
||||
if start_block_number <= finalized_block_number {
|
||||
let expect_block_hash = log_sync_manager
|
||||
.get_block(start_block_number.into())
|
||||
.await?
|
||||
.1;
|
||||
if expect_block_hash != start_block_hash {
|
||||
need_handle_reorg = true;
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"current start block number {}, block hash {}, finalized block number {}",
|
||||
start_block_number, start_block_hash, finalized_block_number
|
||||
@ -172,7 +144,6 @@ impl LogSyncManager {
|
||||
log_sync_manager.block_hash_cache.clone(),
|
||||
);
|
||||
log_sync_manager.handle_data(reorg_rx).await?;
|
||||
|
||||
if let Some((block_number, block_hash)) =
|
||||
log_sync_manager.store.get_sync_progress()?
|
||||
{
|
||||
@ -183,21 +154,26 @@ impl LogSyncManager {
|
||||
}
|
||||
}
|
||||
|
||||
// Start watching before recovery to ensure that no log is skipped.
|
||||
// TODO(zz): Rate limit to avoid OOM during recovery.
|
||||
let mut submission_idx = None;
|
||||
let parent_block_hash = if start_block_number >= finalized_block_number {
|
||||
if start_block_number > 0 {
|
||||
if let Some(b) = log_sync_manager
|
||||
.block_hash_cache
|
||||
.read()
|
||||
.await
|
||||
.get(&start_block_number)
|
||||
if start_block_number >= finalized_block_number {
|
||||
let block = log_sync_manager
|
||||
.block_hash_cache
|
||||
.read()
|
||||
.await
|
||||
.get(&start_block_number)
|
||||
.cloned();
|
||||
if let Some(b) = block {
|
||||
// special case avoid reorg
|
||||
if let Some(submission_idx) = b.as_ref().unwrap().first_submission_index
|
||||
{
|
||||
// special case avoid reorg
|
||||
submission_idx = b.as_ref().unwrap().first_submission_index;
|
||||
log_sync_manager.process_reverted(submission_idx).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let parent_block_hash = if start_block_number >= finalized_block_number {
|
||||
// No need to catch up data.
|
||||
if start_block_number > 0 {
|
||||
let parent_block_number = start_block_number.saturating_sub(1);
|
||||
match log_sync_manager
|
||||
.block_hash_cache
|
||||
@ -206,75 +182,60 @@ impl LogSyncManager {
|
||||
.get(&parent_block_number)
|
||||
{
|
||||
Some(b) => b.as_ref().unwrap().block_hash,
|
||||
_ => log_sync_manager
|
||||
.log_fetcher
|
||||
.provider()
|
||||
.get_block(parent_block_number)
|
||||
.await?
|
||||
.ok_or_else(|| {
|
||||
anyhow!("None for block {}", parent_block_number)
|
||||
})?
|
||||
.hash
|
||||
.ok_or_else(|| {
|
||||
anyhow!("None block hash for block {}", parent_block_number)
|
||||
})?,
|
||||
_ => {
|
||||
log_sync_manager
|
||||
.get_block(parent_block_number.into())
|
||||
.await?
|
||||
.1
|
||||
}
|
||||
}
|
||||
} else {
|
||||
start_block_hash
|
||||
}
|
||||
} else {
|
||||
finalized_block
|
||||
.hash
|
||||
.ok_or_else(|| anyhow!("None for finalized block hash"))?
|
||||
// Keep catching-up data until we are close to the latest height.
|
||||
loop {
|
||||
log_sync_manager
|
||||
.catch_up_data(
|
||||
executor_clone.clone(),
|
||||
start_block_number,
|
||||
finalized_block_number,
|
||||
)
|
||||
.await?;
|
||||
start_block_number = finalized_block_number.saturating_add(1);
|
||||
|
||||
let new_finalized_block =
|
||||
log_sync_manager.get_block(BlockNumber::Finalized).await?;
|
||||
if new_finalized_block.0.saturating_sub(finalized_block_number)
|
||||
<= CATCH_UP_END_GAP
|
||||
{
|
||||
break finalized_block_hash;
|
||||
}
|
||||
finalized_block_number = new_finalized_block.0;
|
||||
finalized_block_hash = new_finalized_block.1;
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(submission_idx) = submission_idx {
|
||||
log_sync_manager.process_reverted(submission_idx).await;
|
||||
if catch_up_end_sender.send(()).is_err() {
|
||||
warn!("catch_up_end send fails, possibly auto_sync is not enabled");
|
||||
}
|
||||
|
||||
let watch_rx = log_sync_manager.log_fetcher.start_watch(
|
||||
if start_block_number >= finalized_block_number {
|
||||
start_block_number
|
||||
} else {
|
||||
finalized_block_number.saturating_add(1)
|
||||
},
|
||||
start_block_number,
|
||||
parent_block_hash,
|
||||
&executor_clone,
|
||||
log_sync_manager.block_hash_cache.clone(),
|
||||
log_sync_manager.config.watch_loop_wait_time_ms,
|
||||
);
|
||||
|
||||
if start_block_number < finalized_block_number {
|
||||
let recover_rx = log_sync_manager.log_fetcher.start_recover(
|
||||
start_block_number,
|
||||
finalized_block_number,
|
||||
&executor_clone,
|
||||
Duration::from_millis(log_sync_manager.config.recover_query_delay),
|
||||
);
|
||||
log_sync_manager.handle_data(recover_rx).await?;
|
||||
}
|
||||
|
||||
log_sync_manager
|
||||
.log_fetcher
|
||||
.start_remove_finalized_block_task(
|
||||
&executor_clone,
|
||||
log_sync_manager.store.clone(),
|
||||
log_sync_manager.block_hash_cache.clone(),
|
||||
log_sync_manager.config.default_finalized_block_count,
|
||||
log_sync_manager
|
||||
.config
|
||||
.remove_finalized_block_interval_minutes,
|
||||
);
|
||||
|
||||
// Syncing `watch_rx` is supposed to block forever.
|
||||
log_sync_manager.handle_data(watch_rx).await?;
|
||||
Ok(())
|
||||
Ok::<(), anyhow::Error>(())
|
||||
},
|
||||
)
|
||||
.map(|_| ()),
|
||||
"log_sync",
|
||||
);
|
||||
Ok(event_send_cloned)
|
||||
Ok((event_send_cloned, catch_up_end_receiver))
|
||||
}
|
||||
|
||||
async fn put_tx(&mut self, tx: Transaction) -> bool {
|
||||
@ -423,6 +384,54 @@ impl LogSyncManager {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_block(&self, block_number: BlockNumber) -> Result<(u64, H256)> {
|
||||
let block = match self.log_fetcher.provider().get_block(block_number).await {
|
||||
Ok(Some(block)) => block,
|
||||
Ok(None) => {
|
||||
bail!("None for block {}", block_number);
|
||||
}
|
||||
e => {
|
||||
bail!("unable to get block: {:?}", e);
|
||||
}
|
||||
};
|
||||
Ok((
|
||||
block
|
||||
.number
|
||||
.ok_or_else(|| anyhow!("None block number for finalized block"))?
|
||||
.as_u64(),
|
||||
block
|
||||
.hash
|
||||
.ok_or_else(|| anyhow!("None block hash for finalized block"))?,
|
||||
))
|
||||
}
|
||||
|
||||
/// Return the ending block number and the parent block hash.
|
||||
async fn catch_up_data(
|
||||
&mut self,
|
||||
executor_clone: TaskExecutor,
|
||||
start_block_number: u64,
|
||||
finalized_block_number: u64,
|
||||
) -> Result<()> {
|
||||
if start_block_number < finalized_block_number {
|
||||
let recover_rx = self.log_fetcher.start_recover(
|
||||
start_block_number,
|
||||
finalized_block_number,
|
||||
&executor_clone,
|
||||
Duration::from_millis(self.config.recover_query_delay),
|
||||
);
|
||||
self.handle_data(recover_rx).await?;
|
||||
}
|
||||
|
||||
self.log_fetcher.start_remove_finalized_block_task(
|
||||
&executor_clone,
|
||||
self.store.clone(),
|
||||
self.block_hash_cache.clone(),
|
||||
self.config.default_finalized_block_count,
|
||||
self.config.remove_finalized_block_interval_minutes,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_and_log<R, E>(
|
||||
|
@ -15,7 +15,7 @@ use storage::log_store::log_manager::LogConfig;
|
||||
use storage::log_store::Store;
|
||||
use storage::{LogManager, StorageConfig};
|
||||
use sync::{SyncSender, SyncService};
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tokio::sync::{broadcast, mpsc, oneshot};
|
||||
|
||||
macro_rules! require {
|
||||
($component:expr, $self:ident, $e:ident) => {
|
||||
@ -48,6 +48,7 @@ struct MinerComponents {
|
||||
|
||||
struct LogSyncComponents {
|
||||
send: broadcast::Sender<LogSyncEvent>,
|
||||
catch_up_end_recv: Option<oneshot::Receiver<()>>,
|
||||
}
|
||||
|
||||
struct PrunerComponents {
|
||||
@ -163,6 +164,13 @@ impl ClientBuilder {
|
||||
let file_location_cache = require!("sync", self, file_location_cache).clone();
|
||||
let network_send = require!("sync", self, network).send.clone();
|
||||
let event_recv = require!("sync", self, log_sync).send.subscribe();
|
||||
let catch_up_end_recv = self
|
||||
.log_sync
|
||||
.as_mut()
|
||||
.expect("checked in event_recv")
|
||||
.catch_up_end_recv
|
||||
.take()
|
||||
.ok_or("sync requires a catch_up_end_recv")?;
|
||||
|
||||
let send = SyncService::spawn_with_config(
|
||||
config,
|
||||
@ -171,6 +179,7 @@ impl ClientBuilder {
|
||||
store,
|
||||
file_location_cache,
|
||||
event_recv,
|
||||
catch_up_end_recv,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to start sync service: {:?}", e))?;
|
||||
@ -295,11 +304,14 @@ impl ClientBuilder {
|
||||
pub async fn with_log_sync(mut self, config: LogSyncConfig) -> Result<Self, String> {
|
||||
let executor = require!("log_sync", self, runtime_context).clone().executor;
|
||||
let store = require!("log_sync", self, store).clone();
|
||||
let send = LogSyncManager::spawn(config, executor, store)
|
||||
let (send, catch_up_end_recv) = LogSyncManager::spawn(config, executor, store)
|
||||
.await
|
||||
.map_err(|e| e.to_string())?;
|
||||
|
||||
self.log_sync = Some(LogSyncComponents { send });
|
||||
self.log_sync = Some(LogSyncComponents {
|
||||
send,
|
||||
catch_up_end_recv: Some(catch_up_end_recv),
|
||||
});
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
|
@ -10,6 +10,7 @@ use anyhow::Result;
|
||||
use log_entry_sync::LogSyncEvent;
|
||||
use std::{collections::HashMap, fmt::Debug, sync::Arc};
|
||||
use storage_async::Store;
|
||||
use tokio::sync::oneshot;
|
||||
use tokio::{
|
||||
sync::{broadcast::Receiver, mpsc::UnboundedReceiver},
|
||||
time::sleep,
|
||||
@ -83,9 +84,12 @@ impl SerialBatcher {
|
||||
mut self,
|
||||
mut file_announcement_recv: UnboundedReceiver<u64>,
|
||||
mut log_sync_recv: Receiver<LogSyncEvent>,
|
||||
catch_up_end_recv: oneshot::Receiver<()>,
|
||||
) {
|
||||
info!(?self, "Start to sync files");
|
||||
|
||||
catch_up_end_recv.await.expect("log sync sender dropped");
|
||||
|
||||
loop {
|
||||
// handle all pending file announcements
|
||||
if self
|
||||
|
@ -26,7 +26,7 @@ use storage::error::Result as StorageResult;
|
||||
use storage::log_store::Store as LogStore;
|
||||
use storage_async::Store;
|
||||
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tokio::sync::{broadcast, mpsc, oneshot};
|
||||
|
||||
const HEARTBEAT_INTERVAL_SEC: u64 = 5;
|
||||
|
||||
@ -135,6 +135,7 @@ impl SyncService {
|
||||
store: Arc<dyn LogStore>,
|
||||
file_location_cache: Arc<FileLocationCache>,
|
||||
event_recv: broadcast::Receiver<LogSyncEvent>,
|
||||
catch_up_end_recv: oneshot::Receiver<()>,
|
||||
) -> Result<SyncSender> {
|
||||
Self::spawn_with_config(
|
||||
Config::default(),
|
||||
@ -143,6 +144,7 @@ impl SyncService {
|
||||
store,
|
||||
file_location_cache,
|
||||
event_recv,
|
||||
catch_up_end_recv,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@ -154,6 +156,7 @@ impl SyncService {
|
||||
store: Arc<dyn LogStore>,
|
||||
file_location_cache: Arc<FileLocationCache>,
|
||||
event_recv: broadcast::Receiver<LogSyncEvent>,
|
||||
catch_up_end_recv: oneshot::Receiver<()>,
|
||||
) -> Result<SyncSender> {
|
||||
let (sync_send, sync_recv) = channel::Channel::unbounded();
|
||||
|
||||
@ -171,7 +174,10 @@ impl SyncService {
|
||||
let serial_batcher =
|
||||
SerialBatcher::new(config, store.clone(), sync_send.clone(), sync_store.clone())
|
||||
.await?;
|
||||
executor.spawn(serial_batcher.start(recv, event_recv), "auto_sync_serial");
|
||||
executor.spawn(
|
||||
serial_batcher.start(recv, event_recv, catch_up_end_recv),
|
||||
"auto_sync_serial",
|
||||
);
|
||||
|
||||
// sync randomly
|
||||
let random_batcher =
|
||||
@ -772,6 +778,7 @@ mod tests {
|
||||
network_send: UnboundedSender<NetworkMessage>,
|
||||
network_recv: UnboundedReceiver<NetworkMessage>,
|
||||
event_send: broadcast::Sender<LogSyncEvent>,
|
||||
catch_up_end_recv: Option<oneshot::Receiver<()>>,
|
||||
}
|
||||
|
||||
impl Default for TestSyncRuntime {
|
||||
@ -788,6 +795,7 @@ mod tests {
|
||||
let init_peer_id = identity::Keypair::generate_ed25519().public().to_peer_id();
|
||||
let (network_send, network_recv) = mpsc::unbounded_channel::<NetworkMessage>();
|
||||
let (event_send, _) = broadcast::channel(16);
|
||||
let (_, catch_up_end_recv) = oneshot::channel();
|
||||
|
||||
let tx_ids = txs.iter().take(seq_size).map(|tx| tx.id()).collect();
|
||||
|
||||
@ -803,16 +811,17 @@ mod tests {
|
||||
network_send,
|
||||
network_recv,
|
||||
event_send,
|
||||
catch_up_end_recv: Some(catch_up_end_recv),
|
||||
}
|
||||
}
|
||||
|
||||
async fn spawn_sync_service(&self, with_peer_store: bool) -> SyncSender {
|
||||
async fn spawn_sync_service(&mut self, with_peer_store: bool) -> SyncSender {
|
||||
self.spawn_sync_service_with_config(with_peer_store, Config::default())
|
||||
.await
|
||||
}
|
||||
|
||||
async fn spawn_sync_service_with_config(
|
||||
&self,
|
||||
&mut self,
|
||||
with_peer_store: bool,
|
||||
config: Config,
|
||||
) -> SyncSender {
|
||||
@ -829,6 +838,7 @@ mod tests {
|
||||
store,
|
||||
self.file_location_cache.clone(),
|
||||
self.event_send.subscribe(),
|
||||
self.catch_up_end_recv.take().unwrap(),
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
@ -1198,6 +1208,7 @@ mod tests {
|
||||
|
||||
let (network_send, mut network_recv) = mpsc::unbounded_channel::<NetworkMessage>();
|
||||
let (_event_send, event_recv) = broadcast::channel(16);
|
||||
let (_, catch_up_end_recv) = oneshot::channel();
|
||||
let sync_send = SyncService::spawn_with_config(
|
||||
Config::default(),
|
||||
runtime.task_executor.clone(),
|
||||
@ -1205,6 +1216,7 @@ mod tests {
|
||||
store.clone(),
|
||||
file_location_cache,
|
||||
event_recv,
|
||||
catch_up_end_recv,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@ -1544,7 +1556,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_sync_status_unknown() {
|
||||
let runtime = TestSyncRuntime::default();
|
||||
let mut runtime = TestSyncRuntime::default();
|
||||
let sync_send = runtime.spawn_sync_service(false).await;
|
||||
|
||||
assert!(matches!(
|
||||
|
Loading…
Reference in New Issue
Block a user