# Tweak head syncing (#1845)

## Issue Addressed

Fixes head syncing

## Proposed Changes

- Go back to statusing peers after removing chain segments, and make the peer manager send status requests according to the current sync state (see the sketch after this list), preventing an old, known deadlock
- Fix a bug where a chain would be removed if its optimistic batch succeeded while being empty
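As a rough sketch of the first point (a simplified, self-contained model, not the actual Lighthouse types: the real `SyncState` variants carry slot fields, and the guard lives in the peer manager's `Stream` implementation shown in the diff below):

```rust
// Minimal model of the new gating: while a finalized or head chain is
// actively syncing, range sync is responsible for re-statusing peers
// (it does so whenever a chain is removed), so the peer manager's
// periodic status loop stands down. This avoids the two components
// racing each other over the same peers.
#[allow(dead_code)]
enum SyncState {
    SyncingFinalized,
    SyncingHead,
    SyncTransition,
    Synced,
    Stalled,
}

fn peer_manager_may_status(state: &SyncState) -> bool {
    // mirrors the `!matches!(...)` guard added in the peer manager below
    !matches!(state, SyncState::SyncingFinalized | SyncState::SyncingHead)
}

fn main() {
    assert!(!peer_manager_may_status(&SyncState::SyncingFinalized));
    assert!(peer_manager_may_status(&SyncState::SyncTransition));
    assert!(peer_manager_may_status(&SyncState::Synced));
}
```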

## Additional Info

Tested on Medalla and looking good
Author: divma
Date:   2020-11-01 23:37:39 +00:00
Parent: 304793a6ab
Commit: 6c0c050fbb

7 changed files with 134 additions and 60 deletions


```diff
@@ -3,6 +3,7 @@
 pub use self::peerdb::*;
 use crate::discovery::{subnet_predicate, Discovery, DiscoveryEvent, TARGET_SUBNET_PEERS};
 use crate::rpc::{GoodbyeReason, MetaData, Protocol, RPCError, RPCResponseErrorCode};
+use crate::types::SyncState;
 use crate::{error, metrics};
 use crate::{EnrExt, NetworkConfig, NetworkGlobals, PeerId, SubnetDiscovery};
 use futures::prelude::*;
@@ -844,16 +845,19 @@ impl<TSpec: EthSpec> Stream for PeerManager<TSpec> {
             }
         }

-        loop {
-            match self.status_peers.poll_next_unpin(cx) {
-                Poll::Ready(Some(Ok(peer_id))) => {
-                    self.status_peers.insert(peer_id.clone());
-                    self.events.push(PeerManagerEvent::Status(peer_id))
-                }
-                Poll::Ready(Some(Err(e))) => {
-                    error!(self.log, "Failed to check for peers to ping"; "error" => e.to_string())
-                }
-                Poll::Ready(None) | Poll::Pending => break,
-            }
-        }
+        if !matches!(self.network_globals.sync_state(), SyncState::SyncingFinalized{..}|SyncState::SyncingHead{..})
+        {
+            loop {
+                match self.status_peers.poll_next_unpin(cx) {
+                    Poll::Ready(Some(Ok(peer_id))) => {
+                        self.status_peers.insert(peer_id.clone());
+                        self.events.push(PeerManagerEvent::Status(peer_id))
+                    }
+                    Poll::Ready(Some(Err(e))) => {
+                        error!(self.log, "Failed to check for peers to ping"; "error" => e.to_string())
+                    }
+                    Poll::Ready(None) | Poll::Pending => break,
+                }
+            }
+        }
```


```diff
@@ -10,6 +10,9 @@ pub enum SyncState {
     /// The node is performing a long-range (batch) sync over one or many head chains.
     /// In this state parent lookups are disabled.
     SyncingHead { start_slot: Slot, target_slot: Slot },
+    /// The node has identified the need for sync operations and is transitioning to a syncing
+    /// state.
+    SyncTransition,
     /// The node is up to date with all known peers and is connected to at least one
     /// fully synced peer. In this state, parent lookups are enabled.
     Synced,
@@ -25,6 +28,7 @@ impl PartialEq for SyncState {
             (SyncState::SyncingHead { .. }, SyncState::SyncingHead { .. }) => true,
             (SyncState::Synced, SyncState::Synced) => true,
             (SyncState::Stalled, SyncState::Stalled) => true,
+            (SyncState::SyncTransition, SyncState::SyncTransition) => true,
             _ => false,
         }
     }
@@ -36,6 +40,7 @@ impl SyncState {
         match self {
             SyncState::SyncingFinalized { .. } => true,
             SyncState::SyncingHead { .. } => true,
+            SyncState::SyncTransition => true,
             SyncState::Synced => false,
             SyncState::Stalled => false,
         }
@@ -54,6 +59,7 @@ impl std::fmt::Display for SyncState {
             SyncState::SyncingHead { .. } => write!(f, "Syncing Head Chain"),
             SyncState::Synced { .. } => write!(f, "Synced"),
             SyncState::Stalled { .. } => write!(f, "Stalled"),
+            SyncState::SyncTransition => write!(f, "Searching syncing peers"),
         }
     }
 }
```
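To make the new variant's semantics concrete, here is a hedged, self-contained sketch (the real enum's syncing variants carry `start_slot`/`target_slot` fields, and its manual `PartialEq` compares variants only; this stripped-down version gets that for free):

```rust
use std::fmt;

// Stand-in for the SyncState above, illustrating what SyncTransition
// means: the node still counts as syncing (so parent lookups stay
// disabled) while it searches for peers to sync from.
#[derive(PartialEq)]
enum SyncState {
    SyncTransition,
    Synced,
}

impl SyncState {
    fn is_syncing(&self) -> bool {
        matches!(self, SyncState::SyncTransition)
    }
}

impl fmt::Display for SyncState {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            SyncState::SyncTransition => write!(f, "Searching syncing peers"),
            SyncState::Synced => write!(f, "Synced"),
        }
    }
}

fn main() {
    let state = SyncState::SyncTransition;
    assert!(state.is_syncing());
    assert_eq!(state.to_string(), "Searching syncing peers");
    assert!(state != SyncState::Synced);
}
```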


```diff
@@ -330,7 +330,7 @@ pub fn serve<T: BeaconChainTypes>(
                 )))
             }
         }
-        SyncState::SyncingHead { .. } => Ok(()),
+        SyncState::SyncingHead { .. } | SyncState::SyncTransition => Ok(()),
         SyncState::Synced => Ok(()),
         SyncState::Stalled => Err(warp_utils::reject::not_synced(
             "sync is stalled".to_string(),
@@ -1231,12 +1231,12 @@
         .and(network_globals.clone())
         .and_then(|network_globals: Arc<NetworkGlobals<T::EthSpec>>| {
             blocking_task(move || match *network_globals.sync_state.read() {
-                SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. } => {
-                    Ok(warp::reply::with_status(
-                        warp::reply(),
-                        warp::http::StatusCode::PARTIAL_CONTENT,
-                    ))
-                }
+                SyncState::SyncingFinalized { .. }
+                | SyncState::SyncingHead { .. }
+                | SyncState::SyncTransition => Ok(warp::reply::with_status(
+                    warp::reply(),
+                    warp::http::StatusCode::PARTIAL_CONTENT,
+                )),
                 SyncState::Synced => Ok(warp::reply::with_status(
                     warp::reply(),
                     warp::http::StatusCode::OK,
```
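The second hunk looks like the node-health endpoint; as a hedged summary of its mapping (the endpoint identity and the elided arms are assumptions on my part, only the arms shown in the hunk are taken from the diff):

```rust
// Sketch of the health mapping above: any syncing-like state,
// including the new SyncTransition, reports 206 Partial Content,
// while a synced node reports 200 OK. Other states are elided.
#[allow(dead_code)]
enum SyncState {
    SyncingFinalized,
    SyncingHead,
    SyncTransition,
    Synced,
}

fn health_status(state: &SyncState) -> u16 {
    match state {
        SyncState::SyncingFinalized
        | SyncState::SyncingHead
        | SyncState::SyncTransition => 206, // PARTIAL_CONTENT
        SyncState::Synced => 200,           // OK
    }
}

fn main() {
    assert_eq!(health_status(&SyncState::SyncTransition), 206);
    assert_eq!(health_status(&SyncState::Synced), 200);
}
```

In other words, the new `SyncTransition` state is reported to HTTP clients exactly like any other syncing state.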


```diff
@@ -691,10 +691,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
         {
             SyncState::Synced
         } else if peers.advanced_peers().next().is_some() {
-            SyncState::SyncingHead {
-                start_slot: head,
-                target_slot: current_slot,
-            }
+            SyncState::SyncTransition
         } else if peers.synced_peers().next().is_none() {
             SyncState::Stalled
         } else {
```


```diff
@@ -91,6 +91,9 @@ pub struct SyncingChain<T: BeaconChainTypes> {
     /// The current processing batch, if any.
     current_processing_batch: Option<BatchId>,

+    /// Batches validated by this chain.
+    validated_batches: u8,
+
     /// A multi-threaded, non-blocking processor for applying messages to the beacon chain.
     beacon_processor_send: Sender<BeaconWorkEvent<T::EthSpec>>,
@@ -140,6 +143,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
             attempted_optimistic_starts: HashSet::default(),
             state: ChainSyncingState::Stopped,
             current_processing_batch: None,
+            validated_batches: 0,
             beacon_processor_send,
             log: log.new(o!("chain" => id)),
         }
@@ -155,6 +159,16 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
         self.id
     }

+    /// Peers currently syncing this chain.
+    pub fn peers<'a>(&'a self) -> impl Iterator<Item = PeerId> + 'a {
+        self.peers.keys().cloned()
+    }
+
+    /// Progress in epochs made by the chain.
+    pub fn validated_epochs(&self) -> u64 {
+        self.validated_batches as u64 * EPOCHS_PER_BATCH
+    }
+
     /// Removes a peer from the chain.
     /// If the peer has active batches, those are considered failed and re-requested.
     pub fn remove_peer(
@@ -447,6 +461,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
             self.advance_chain(network, batch_id);
             // we register so that on chain switching we don't try it again
             self.attempted_optimistic_starts.insert(batch_id);
+            self.processing_target += EPOCHS_PER_BATCH;
         } else if let Some(epoch) = self.optimistic_start {
             // check if this batch corresponds to an optimistic batch. In this case, we
             // reject it as an optimistic candidate since the batch was empty
@@ -456,11 +471,11 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
                     false, /* do not re-request */
                     "batch was empty",
                 )?;
+            } else {
+                self.processing_target += EPOCHS_PER_BATCH;
             }
         }
-        self.processing_target += EPOCHS_PER_BATCH;
-
         // check if the chain has completed syncing
         if self.current_processed_slot() >= self.target_head_slot {
             // chain is completed
@@ -574,6 +589,7 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
         let removed_batches = std::mem::replace(&mut self.batches, remaining_batches);

         for (id, batch) in removed_batches.into_iter() {
+            self.validated_batches = self.validated_batches.saturating_add(1);
             // only for batches awaiting validation can we be sure the last attempt is
             // right, and thus, that any different attempt is wrong
             match batch.state() {
@@ -1024,6 +1040,7 @@ impl<T: BeaconChainTypes> slog::KV for SyncingChain<T> {
         )?;
         serializer.emit_usize("batches", self.batches.len())?;
         serializer.emit_usize("peers", self.peers.len())?;
+        serializer.emit_u8("validated_batches", self.validated_batches)?;
         slog::Result::Ok(())
     }
 }
@@ -1037,7 +1054,7 @@ impl From<WrongBatchState> for RemoveChain {
 impl std::fmt::Debug for RemoveChain {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // needed to avoid Debuggins Strings
+        // needed to avoid Debugging Strings
         match self {
             RemoveChain::ChainCompleted => f.write_str("ChainCompleted"),
             RemoveChain::EmptyPeerPool => f.write_str("EmptyPeerPool"),
@@ -1047,3 +1064,12 @@ impl std::fmt::Debug for RemoveChain {
         }
     }
 }
+
+impl RemoveChain {
+    pub fn is_critical(&self) -> bool {
+        matches!(
+            self,
+            RemoveChain::WrongBatchState(..) | RemoveChain::WrongChainState(..)
+        )
+    }
+}
```
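Two small APIs introduced above, `validated_epochs` and `RemoveChain::is_critical`, are worth illustrating. A hedged sketch of their intended use, under two assumptions: `EPOCHS_PER_BATCH` is the chain's batch size in epochs (2 in Lighthouse at the time, to my knowledge), and the `RemoveChain` variant payloads are elided:

```rust
// Assumption: Lighthouse's batch size in epochs at the time of this commit.
const EPOCHS_PER_BATCH: u64 = 2;

// Mirrors `SyncingChain::validated_epochs`: progress is tracked as a
// count of validated batches and converted to epochs when asking
// questions like "did this chain advance before it was removed?".
fn validated_epochs(validated_batches: u8) -> u64 {
    validated_batches as u64 * EPOCHS_PER_BATCH
}

// Variant names taken from the Debug impl above; payloads elided.
#[allow(dead_code)]
enum RemoveChain {
    ChainCompleted,
    EmptyPeerPool,
    WrongBatchState,
    WrongChainState,
}

impl RemoveChain {
    // Mirrors the new `is_critical`: only internal-state errors are
    // critical; a completed chain or an empty peer pool is a normal,
    // expected reason for removal.
    fn is_critical(&self) -> bool {
        matches!(
            self,
            RemoveChain::WrongBatchState | RemoveChain::WrongChainState
        )
    }
}

fn main() {
    assert_eq!(validated_epochs(3), 6);
    assert!(RemoveChain::WrongChainState.is_critical());
    assert!(!RemoveChain::ChainCompleted.is_critical());
}
```

The callers in the next two files use `is_critical` to choose between `crit!` and `debug!`/`error!` when logging a chain removal.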


```diff
@@ -11,7 +11,7 @@ use crate::sync::PeerSyncInfo;
 use beacon_chain::{BeaconChain, BeaconChainTypes};
 use eth2_libp2p::PeerId;
 use fnv::FnvHashMap;
-use slog::{debug, error};
+use slog::{crit, debug, error};
 use smallvec::SmallVec;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
@@ -300,7 +300,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
         match old_id {
             Some(Some(old_id)) => debug!(self.log, "Switching finalized chains";
                 "old_id" => old_id, &chain),
-            None => debug!(self.log, "Syncing new chain"; &chain),
+            None => debug!(self.log, "Syncing new finalized chain"; &chain),
             Some(None) => {
                 // this is the same chain. We try to advance it.
             }
@@ -311,8 +311,12 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
         if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch)
         {
-            // this happens only if sending a batch over the `network` fails a lot
-            error!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
+            if remove_reason.is_critical() {
+                crit!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
+            } else {
+                // this happens only if sending a batch over the `network` fails a lot
+                error!(self.log, "Chain removed while switching chains"; "chain" => new_id, "reason" => ?remove_reason);
+            }
             self.finalized_chains.remove(&new_id);
             self.on_chain_removed(&new_id, true);
         }
@@ -330,6 +334,7 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
     ) {
         // Include the awaiting head peers
         for (peer_id, peer_sync_info) in awaiting_head_peers.drain() {
+            debug!(self.log, "including head peer");
             self.add_peer_or_create_chain(
                 local_epoch,
                 peer_sync_info.head_root,
@@ -368,7 +373,11 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
                 chain.start_syncing(network, local_epoch, local_head_epoch)
             {
                 self.head_chains.remove(&id);
-                error!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
+                if remove_reason.is_critical() {
+                    crit!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
+                } else {
+                    error!(self.log, "Chain removed while switching head chains"; "chain" => id, "reason" => ?remove_reason);
+                }
             } else {
                 syncing_chains.push(id);
             }
@@ -482,7 +491,11 @@ impl<T: BeaconChainTypes> ChainCollection<T> {
             debug_assert_eq!(chain.target_head_root, target_head_root);
             debug_assert_eq!(chain.target_head_slot, target_head_slot);
             if let Err(remove_reason) = chain.add_peer(network, peer) {
-                debug!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
+                if remove_reason.is_critical() {
+                    error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
+                } else {
+                    error!(self.log, "Chain removed after adding peer"; "chain" => id, "reason" => ?remove_reason);
+                }
                 let chain = entry.remove();
                 self.on_chain_removed(&id, chain.is_syncing());
             }
```


```diff
@@ -19,7 +19,7 @@
 //! need to be downloaded.
 //!
 //! A few interesting notes about finalized chain syncing:
-//! - Only one finalized chain can sync at a time.
+//! - Only one finalized chain can sync at a time
 //! - The finalized chain with the largest peer pool takes priority.
 //! - As one finalized chain completes, others are checked to see if they can be continued,
 //!   otherwise they are removed.
@@ -39,7 +39,7 @@
 //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially
 //! and further batches are requested as current blocks are being processed.

-use super::chain::ChainId;
+use super::chain::{ChainId, RemoveChain, SyncingChain};
 use super::chain_collection::ChainCollection;
 use super::sync_type::RangeSyncType;
 use crate::beacon_processor::WorkEvent as BeaconWorkEvent;
@@ -49,7 +49,7 @@ use crate::sync::PeerSyncInfo;
 use crate::sync::RequestId;
 use beacon_chain::{BeaconChain, BeaconChainTypes};
 use eth2_libp2p::PeerId;
-use slog::{debug, error, trace};
+use slog::{crit, debug, error, trace};
 use std::collections::HashMap;
 use std::sync::Arc;
 use tokio::sync::mpsc;
@@ -121,7 +121,8 @@ impl<T: BeaconChainTypes> RangeSync<T> {
             .finalized_epoch
             .start_slot(T::EthSpec::slots_per_epoch());

-        // NOTE: A peer that has been re-status'd may now exist in multiple finalized chains.
+        // NOTE: A peer that has been re-status'd may now exist in multiple finalized chains. This
+        // is OK since we sync only one finalized chain at a time.

         // determine which kind of sync to perform and set up the chains
         match RangeSyncType::new(&self.beacon_chain, &local_info, &remote_info) {
@@ -208,22 +209,22 @@ impl<T: BeaconChainTypes> RangeSync<T> {
                 chain.on_block_response(network, batch_id, &peer_id, request_id, beacon_block)
             }) {
                 Ok((removed_chain, sync_type)) => {
-                    if let Some(_removed_chain) = removed_chain {
-                        debug!(self.log, "Chain removed after block response"; "sync_type" => ?sync_type, "chain_id" => chain_id);
-                        // update the state of the collection
-                        self.chains.update(
+                    if let Some((removed_chain, remove_reason)) = removed_chain {
+                        self.on_chain_removed(
+                            removed_chain,
+                            sync_type,
+                            remove_reason,
                             network,
-                            &mut self.awaiting_head_peers,
-                            &self.beacon_processor_send,
+                            "block response",
                         );
                     }
                 }
                 Err(_) => {
-                    debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+                    trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
                 }
             }
         } else {
-            debug!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
+            trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
         }
     }
@@ -241,17 +242,18 @@ impl<T: BeaconChainTypes> RangeSync<T> {
             Ok((None, _sync_type)) => {
                 // Chain was found and not removed
             }
-            Ok((Some(_removed_chain), sync_type)) => {
-                debug!(self.log, "Chain removed after processing result"; "chain" => chain_id, "sync_type" => ?sync_type);
-                self.chains.update(
+            Ok((Some((removed_chain, remove_reason)), sync_type)) => {
+                self.on_chain_removed(
+                    removed_chain,
+                    sync_type,
+                    remove_reason,
                     network,
-                    &mut self.awaiting_head_peers,
-                    &self.beacon_processor_send,
+                    "batch processing result",
                 );
             }
             Err(_) => {
-                debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+                trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
             }
         }
     }
@@ -279,14 +281,16 @@ impl<T: BeaconChainTypes> RangeSync<T> {
             .chains
             .call_all(|chain| chain.remove_peer(peer_id, network))
         {
-            debug!(self.log, "Chain removed after removing peer"; "sync_type" => ?sync_type, "chain" => removed_chain.get_id(), "reason" => ?remove_reason);
+            self.on_chain_removed(
+                removed_chain,
+                sync_type,
+                remove_reason,
+                network,
+                "peer removed",
+            );
         }
-
-        // update the state of the collection
-        self.chains.update(
-            network,
-            &mut self.awaiting_head_peers,
-            &self.beacon_processor_send,
-        );
     }

     /// An RPC error has occurred.
@@ -307,21 +311,45 @@ impl<T: BeaconChainTypes> RangeSync<T> {
             }) {
                 Ok((removed_chain, sync_type)) => {
                     if let Some((removed_chain, remove_reason)) = removed_chain {
-                        debug!(self.log, "Chain removed on rpc error"; "sync_type" => ?sync_type, "chain" => removed_chain.get_id(), "reason" => ?remove_reason);
-                        // update the state of the collection
-                        self.chains.update(
+                        self.on_chain_removed(
+                            removed_chain,
+                            sync_type,
+                            remove_reason,
                             network,
-                            &mut self.awaiting_head_peers,
-                            &self.beacon_processor_send,
+                            "RPC error",
                         );
                     }
                 }
                 Err(_) => {
-                    debug!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
+                    trace!(self.log, "BlocksByRange response for removed chain"; "chain" => chain_id)
                 }
             }
         } else {
-            debug!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
+            trace!(self.log, "Response/Error for non registered request"; "request_id" => request_id)
         }
     }
+
+    fn on_chain_removed(
+        &mut self,
+        chain: SyncingChain<T>,
+        sync_type: RangeSyncType,
+        remove_reason: RemoveChain,
+        network: &mut SyncNetworkContext<T::EthSpec>,
+        op: &'static str,
+    ) {
+        if remove_reason.is_critical() {
+            crit!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
+        } else {
+            debug!(self.log, "Chain removed"; "sync_type" => ?sync_type, &chain, "reason" => ?remove_reason, "op" => op);
+        }
+
+        network.status_peers(self.beacon_chain.clone(), chain.peers());
+
+        // update the state of the collection
+        self.chains.update(
+            network,
+            &mut self.awaiting_head_peers,
+            &self.beacon_processor_send,
+        );
+    }
 }
```