Tweak head syncing (#1845)
## Issue Addressed Fixes head syncing ## Proposed Changes - Get back to statusing peers after removing chain segments and making the peer manager deal with status according to the Sync status, preventing an old known deadlock - Also a bug where a chain would get removed if the optimistic batch succeeds being empty ## Additional Info Tested on Medalla and looking good
This commit is contained in:
parent
304793a6ab
commit
6c0c050fbb
@ -3,6 +3,7 @@
|
||||
pub use self::peerdb::*;
|
||||
use crate::discovery::{subnet_predicate, Discovery, DiscoveryEvent, TARGET_SUBNET_PEERS};
|
||||
use crate::rpc::{GoodbyeReason, MetaData, Protocol, RPCError, RPCResponseErrorCode};
|
||||
use crate::types::SyncState;
|
||||
use crate::{error, metrics};
|
||||
use crate::{EnrExt, NetworkConfig, NetworkGlobals, PeerId, SubnetDiscovery};
|
||||
use futures::prelude::*;
|
||||
@ -844,6 +845,8 @@ impl<TSpec: EthSpec> Stream for PeerManager<TSpec> {
|
||||
}
|
||||
}
|
||||
|
||||
if !matches!(self.network_globals.sync_state(), SyncState::SyncingFinalized{..}|SyncState::SyncingHead{..})
|
||||
{
|
||||
loop {
|
||||
match self.status_peers.poll_next_unpin(cx) {
|
||||
Poll::Ready(Some(Ok(peer_id))) => {
|
||||
@ -856,6 +859,7 @@ impl<TSpec: EthSpec> Stream for PeerManager<TSpec> {
|
||||
Poll::Ready(None) | Poll::Pending => break,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !self.events.is_empty() {
|
||||
return Poll::Ready(Some(self.events.remove(0)));
|
||||
|
@ -10,6 +10,9 @@ pub enum SyncState {
|
||||
/// The node is performing a long-range (batch) sync over one or many head chains.
|
||||
/// In this state parent lookups are disabled.
|
||||
SyncingHead { start_slot: Slot, target_slot: Slot },
|
||||
/// The node has identified the need for is sync operations and is transitioning to a syncing
|
||||
/// state.
|
||||
SyncTransition,
|
||||
/// The node is up to date with all known peers and is connected to at least one
|
||||
/// fully synced peer. In this state, parent lookups are enabled.
|
||||
Synced,
|
||||
@ -25,6 +28,7 @@ impl PartialEq for SyncState {
|
||||
(SyncState::SyncingHead { .. }, SyncState::SyncingHead { .. }) => true,
|
||||
(SyncState::Synced, SyncState::Synced) => true,
|
||||
(SyncState::Stalled, SyncState::Stalled) => true,
|
||||
(SyncState::SyncTransition, SyncState::SyncTransition) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@ -36,6 +40,7 @@ impl SyncState {
|
||||
match self {
|
||||
SyncState::SyncingFinalized { .. } => true,
|
||||
SyncState::SyncingHead { .. } => true,
|
||||
SyncState::SyncTransition => true,
|
||||
SyncState::Synced => false,
|
||||
SyncState::Stalled => false,
|
||||
}
|
||||
@ -54,6 +59,7 @@ impl std::fmt::Display for SyncState {
|
||||
SyncState::SyncingHead { .. } => write!(f, "Syncing Head Chain"),
|
||||
SyncState::Synced { .. } => write!(f, "Synced"),
|
||||
SyncState::Stalled { .. } => write!(f, "Stalled"),
|
||||
SyncState::SyncTransition => write!(f, "Searching syncing peers"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -330,7 +330,7 @@ pub fn serve<T: BeaconChainTypes>(
|
||||
)))
|
||||
}
|
||||
}
|
||||
SyncState::SyncingHead { .. } => Ok(()),
|
||||
SyncState::SyncingHead { .. } | SyncState::SyncTransition => Ok(()),
|
||||
SyncState::Synced => Ok(()),
|
||||
SyncState::Stalled => Err(warp_utils::reject::not_synced(
|
||||
"sync is stalled".to_string(),
|
||||
@ -1231,12 +1231,12 @@ pub fn serve<T: BeaconChainTypes>(
|
||||
.and(network_globals.clone())
|
||||
.and_then(|network_globals: Arc<NetworkGlobals<T::EthSpec>>| {
|
||||
blocking_task(move || match *network_globals.sync_state.read() {
|
||||
SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. } => {
|
||||
Ok(warp::reply::with_status(
|
||||
SyncState::SyncingFinalized { .. }
|
||||
| SyncState::SyncingHead { .. }
|
||||
| SyncState::SyncTransition => Ok(warp::reply::with_status(
|
||||
warp::reply(),
|
||||
warp::http::StatusCode::PARTIAL_CONTENT,
|
||||
))
|
||||
}
|
||||
)),
|
||||
SyncState::Synced => Ok(warp::reply::with_status(
|
||||
warp::reply(),
|
||||
warp::http::StatusCode::OK,
|
||||
|
@ -691,10 +691,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|
||||
{
|
||||
SyncState::Synced
|
||||
} else if peers.advanced_peers().next().is_some() {
|
||||
SyncState::SyncingHead {
|
||||
start_slot: head,
|
||||
target_slot: current_slot,
|
||||
}
|
||||
SyncState::SyncTransition
|
||||
} else if peers.synced_peers().next().is_none() {
|
||||
SyncState::Stalled
|
||||
} else {
|
||||
|
@ -91,6 +91,9 @@ pub struct SyncingChain<T: BeaconChainTypes> {
|
||||
/// The current processing batch, if any.
|
||||
current_processing_batch: Option<BatchId>,
|
||||
|
||||
/// Batches validated by this chain.
|
||||
validated_batches: u8,
|
||||
|
||||
/// A multi-threaded, non-blocking processor for applying messages to the beacon chain.
|
||||
beacon_processor_send: Sender<BeaconWorkEvent<T::EthSpec>>,
|
||||
|
||||
@ -140,6 +143,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
attempted_optimistic_starts: HashSet::default(),
|
||||
state: ChainSyncingState::Stopped,
|
||||
current_processing_batch: None,
|
||||
validated_batches: 0,
|
||||
beacon_processor_send,
|
||||
log: log.new(o!("chain" => id)),
|
||||
}
|
||||
@ -155,6 +159,16 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
self.id
|
||||
}
|
||||
|
||||
/// Peers currently syncing this chain.
|
||||
pub fn peers<'a>(&'a self) -> impl Iterator<Item = PeerId> + 'a {
|
||||
self.peers.keys().cloned()
|
||||
}
|
||||
|
||||
/// Progress in epochs made by the chain
|
||||
pub fn validated_epochs(&self) -> u64 {
|
||||
self.validated_batches as u64 * EPOCHS_PER_BATCH
|
||||
}
|
||||
|
||||
/// Removes a peer from the chain.
|
||||
/// If the peer has active batches, those are considered failed and re-requested.
|
||||
pub fn remove_peer(
|
||||
@ -447,6 +461,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
self.advance_chain(network, batch_id);
|
||||
// we register so that on chain switching we don't try it again
|
||||
self.attempted_optimistic_starts.insert(batch_id);
|
||||
self.processing_target += EPOCHS_PER_BATCH;
|
||||
} else if let Some(epoch) = self.optimistic_start {
|
||||
// check if this batch corresponds to an optimistic batch. In this case, we
|
||||
// reject it as an optimistic candidate since the batch was empty
|
||||
@ -456,10 +471,10 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
false, /* do not re-request */
|
||||
"batch was empty",
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
self.processing_target += EPOCHS_PER_BATCH;
|
||||
}
|
||||
}
|
||||
|
||||
// check if the chain has completed syncing
|
||||
if self.current_processed_slot() >= self.target_head_slot {
|
||||
@ -574,6 +589,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
||||
let removed_batches = std::mem::replace(&mut self.batches, remaining_batches);
|
||||
|
||||
for (id, batch) in removed_batches.into_iter() {
|
||||
self.validated_batches = self.validated_batches.saturating_add(1);
|
||||
// only for batches awaiting validation can we be sure the last attempt is
|
||||
// right, and thus, that any different attempt is wrong
|
||||
match batch.state() {
|
||||
@ -1024,6 +1040,7 @@ impl<T: BeaconChainTypes> slog::KV for SyncingChain<T> {
|
||||
)?;
|
||||
serializer.emit_usize("batches", self.batches.len())?;
|
||||
serializer.emit_usize("peers", self.peers.len())?;
|
||||
serializer.emit_u8("validated_batches", self.validated_batches)?;
|
||||
slog::Result::Ok(())
|
||||
}
|
||||
}
|
||||
@ -1037,7 +1054,7 @@ impl From<WrongBatchState> for RemoveChain {
|
||||
|
||||
impl std::fmt::Debug for RemoveChain {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// needed to avoid Debuggins Strings
|
||||
// needed to avoid Debugging Strings
|
||||
match self {
|
||||
RemoveChain::ChainCompleted => f.write_str("ChainCompleted"),
|
||||
RemoveChain::EmptyPeerPool => f.write_str("EmptyPeerPool"),
|
||||
@ -1047,3 +1064,12 @@ impl std::fmt::Debug for RemoveChain {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoveChain {
|
||||
pub fn is_critical(&self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
RemoveChain::WrongBatchState(..) | RemoveChain::WrongChainState(..)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@ use crate::sync::PeerSyncInfo;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
use eth2_libp2p::PeerId;
|
||||
use fnv::FnvHashMap;
|
||||
use slog::{debug, error};
|
||||
use slog::{crit, debug, error};
|
||||
use smallvec::SmallVec;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
@ -300,7 +300,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
match old_id {
|
||||
Some(Some(old_id)) => debug!(self.log, "Switching finalized chains";
|
||||
"old_id" => old_id, &chain),
|
||||
None => debug!(self.log, "Syncing new chain"; &chain),
|
||||
None => debug!(self.log, "Syncing new finalized chain"; &chain),
|
||||
Some(None) => {
|
||||
// this is the same chain. We try to advance it.
|
||||
}
|
||||
@ -311,8 +311,12 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
|
||||
if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch)
|
||||
{
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
|
||||
} else {
|
||||
// this happens only if sending a batch over the `network` fails a lot
|
||||
error!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
|
||||
}
|
||||
self.finalized_chains.remove(&new_id);
|
||||
self.on_chain_removed(&new_id, true);
|
||||
}
|
||||
@ -330,6 +334,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
) {
|
||||
// Include the awaiting head peers
|
||||
for (peer_id, peer_sync_info) in awaiting_head_peers.drain() {
|
||||
debug!(self.log, "including head peer");
|
||||
self.add_peer_or_create_chain(
|
||||
local_epoch,
|
||||
peer_sync_info.head_root,
|
||||
@ -368,7 +373,11 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
chain.start_syncing(network, local_epoch, local_head_epoch)
|
||||
{
|
||||
self.head_chains.remove(&id);
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
|
||||
} else {
|
||||
error!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
|
||||
}
|
||||
} else {
|
||||
syncing_chains.push(id);
|
||||
}
|
||||
@ -482,7 +491,11 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
|
||||
debug_assert_eq!(chain.target_head_root, target_head_root);
|
||||
debug_assert_eq!(chain.target_head_slot, target_head_slot);
|
||||
if let Err(remove_reason) = chain.add_peer(network, peer) {
|
||||
debug!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
|
||||
if remove_reason.is_critical() {
|
||||
error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
|
||||
} else {
|
||||
error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
|
||||
}
|
||||
let chain = entry.remove();
|
||||
self.on_chain_removed(&id, chain.is_syncing());
|
||||
}
|
||||
|
@ -19,7 +19,7 @@
|
||||
//! need to be downloaded.
|
||||
//!
|
||||
//! A few interesting notes about finalized chain syncing:
|
||||
//! - Only one finalized chain can sync at a time.
|
||||
//! - Only one finalized chain can sync at a time
|
||||
//! - The finalized chain with the largest peer pool takes priority.
|
||||
//! - As one finalized chain completes, others are checked to see if we they can be continued,
|
||||
//! otherwise they are removed.
|
||||
@ -39,7 +39,7 @@
|
||||
//! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
|
||||
//! and further batches are requested as current blocks are being processed.
|
||||
|
||||
use super::chain::ChainId;
|
||||
use super::chain::{ChainId, RemoveChain, SyncingChain};
|
||||
use super::chain_collection::ChainCollection;
|
||||
use super::sync_type::RangeSyncType;
|
||||
use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
|
||||
@ -49,7 +49,7 @@ use crate::sync::PeerSyncInfo;
|
||||
use crate::sync::RequestId;
|
||||
use beacon_chain::{BeaconChain, BeaconChainTypes};
|
||||
use eth2_libp2p::PeerId;
|
||||
use slog::{debug, error, trace};
|
||||
use slog::{crit, debug, error, trace};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::mpsc;
|
||||
@ -121,7 +121,8 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
.finalized_epoch
|
||||
.start_slot(T::EthSpec::slots_per_epoch());
|
||||
|
||||
// NOTE: A peer that has been re-status'd may now exist in multiple finalized chains.
|
||||
// NOTE: A peer that has been re-status'd may now exist in multiple finalized chains. This
|
||||
// is OK since we since only one finalized chain at a time.
|
||||
|
||||
// determine which kind of sync to perform and set up the chains
|
||||
match RangeSyncType::new(&self.beacon_chain, &local_info, &remote_info) {
|
||||
@ -208,22 +209,22 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
|
||||
}) {
|
||||
Ok((removed_chain, sync_type)) => {
|
||||
if let Some(_removed_chain) = removed_chain {
|
||||
debug!(self.log, "Chain removed after block response"; "sync_type" => ?sync_type, "chain_id" => chain_id);
|
||||
// update the state of the collection
|
||||
self.chains.update(
|
||||
if let Some((removed_chain, remove_reason)) = removed_chain {
|
||||
self.on_chain_removed(
|
||||
removed_chain,
|
||||
sync_type,
|
||||
remove_reason,
|
||||
network,
|
||||
&mut self.awaiting_head_peers,
|
||||
&self.beacon_processor_send,
|
||||
"block response",
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
debug!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
|
||||
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
|
||||
}
|
||||
}
|
||||
|
||||
@ -241,17 +242,18 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
Ok((None, _sync_type)) => {
|
||||
// Chain was found and not removed
|
||||
}
|
||||
Ok((Some(_removed_chain), sync_type)) => {
|
||||
debug!(self.log, "Chain removed after processing result"; "chain" => chain_id, "sync_type" => ?sync_type);
|
||||
self.chains.update(
|
||||
Ok((Some((removed_chain, remove_reason)), sync_type)) => {
|
||||
self.on_chain_removed(
|
||||
removed_chain,
|
||||
sync_type,
|
||||
remove_reason,
|
||||
network,
|
||||
&mut self.awaiting_head_peers,
|
||||
&self.beacon_processor_send,
|
||||
"batch processing result",
|
||||
);
|
||||
}
|
||||
|
||||
Err(_) => {
|
||||
debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -279,14 +281,16 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
.chains
|
||||
.call_all(|chain| chain.remove_peer(peer_id, network))
|
||||
{
|
||||
debug!(self.log, "Chain removed after removing peer"; "sync_type" => ?sync_type, "chain" => removed_chain.get_id(), "reason" => ?remove_reason);
|
||||
self.on_chain_removed(
|
||||
removed_chain,
|
||||
sync_type,
|
||||
remove_reason,
|
||||
network,
|
||||
"peer removed",
|
||||
);
|
||||
|
||||
// update the state of the collection
|
||||
}
|
||||
self.chains.update(
|
||||
network,
|
||||
&mut self.awaiting_head_peers,
|
||||
&self.beacon_processor_send,
|
||||
);
|
||||
}
|
||||
|
||||
/// An RPC error has occurred.
|
||||
@ -307,7 +311,40 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
}) {
|
||||
Ok((removed_chain, sync_type)) => {
|
||||
if let Some((removed_chain, remove_reason)) = removed_chain {
|
||||
debug!(self.log, "Chain removed on rpc error"; "sync_type" => ?sync_type, "chain" => removed_chain.get_id(), "reason" => ?remove_reason);
|
||||
self.on_chain_removed(
|
||||
removed_chain,
|
||||
sync_type,
|
||||
remove_reason,
|
||||
network,
|
||||
"RPC error",
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
|
||||
}
|
||||
}
|
||||
|
||||
fn on_chain_removed(
|
||||
&mut self,
|
||||
chain: SyncingChain<T>,
|
||||
sync_type: RangeSyncType,
|
||||
remove_reason: RemoveChain,
|
||||
network: &mut SyncNetworkContext<T::EthSpec>,
|
||||
op: &'static str,
|
||||
) {
|
||||
if remove_reason.is_critical() {
|
||||
crit!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
|
||||
} else {
|
||||
debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
|
||||
}
|
||||
|
||||
network.status_peers(self.beacon_chain.clone(), chain.peers());
|
||||
|
||||
// update the state of the collection
|
||||
self.chains.update(
|
||||
network,
|
||||
@ -315,13 +352,4 @@ impl<T: BeaconChainTypes> RangeSync<T> {
|
||||
&self.beacon_processor_send,
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
debug!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user