Keep track of failed head chains and prevent re-lookups (#1534)

## Overview

There are forked chains which get referenced by blocks and attestations on a network. Typically if these chains are very long, we stop looking up the chain and downvote the peer. In extreme circumstances, many peers are on many chains, the chains can be very deep and become time consuming performing lookups. 

This PR adds a cache to known failed chain lookups. This prevents us from starting a parent-lookup (or stopping one half way through) if we have attempted the chain lookup in the past.
This commit is contained in:
Age Manning 2020-08-18 03:54:09 +00:00
parent cc44a64d15
commit 3bb30754d9
9 changed files with 316 additions and 14 deletions

8
Cargo.lock generated
View File

@ -2993,6 +2993,13 @@ dependencies = [
"linked-hash-map", "linked-hash-map",
] ]
[[package]]
name = "lru_cache"
version = "0.1.0"
dependencies = [
"fnv",
]
[[package]] [[package]]
name = "lru_time_cache" name = "lru_time_cache"
version = "0.10.0" version = "0.10.0"
@ -3271,6 +3278,7 @@ dependencies = [
"itertools 0.9.0", "itertools 0.9.0",
"lazy_static", "lazy_static",
"lighthouse_metrics", "lighthouse_metrics",
"lru_cache",
"matches", "matches",
"num_cpus", "num_cpus",
"parking_lot 0.11.0", "parking_lot 0.11.0",

View File

@ -28,6 +28,7 @@ members = [
"common/lighthouse_metrics", "common/lighthouse_metrics",
"common/lighthouse_version", "common/lighthouse_version",
"common/logging", "common/logging",
"common/lru_cache",
"common/remote_beacon_node", "common/remote_beacon_node",
"common/rest_types", "common/rest_types",
"common/slot_clock", "common/slot_clock",

View File

@ -40,3 +40,4 @@ lighthouse_metrics = { path = "../../common/lighthouse_metrics" }
environment = { path = "../../lighthouse/environment" } environment = { path = "../../lighthouse/environment" }
itertools = "0.9.0" itertools = "0.9.0"
num_cpus = "1.13.0" num_cpus = "1.13.0"
lru_cache = { path = "../../common/lru_cache" }

View File

@ -7,15 +7,15 @@ use eth2_libp2p::PeerId;
use slog::{debug, error, trace, warn}; use slog::{debug, error, trace, warn};
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::mpsc; use tokio::sync::mpsc;
use types::{Epoch, EthSpec, SignedBeaconBlock}; use types::{Epoch, EthSpec, Hash256, SignedBeaconBlock};
/// Id associated to a block processing request, either a batch or a single block. /// Id associated to a block processing request, either a batch or a single block.
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
pub enum ProcessId { pub enum ProcessId {
/// Processing Id of a range syncing batch. /// Processing Id of a range syncing batch.
RangeBatchId(ChainId, Epoch), RangeBatchId(ChainId, Epoch),
/// Processing Id of the parent lookup of a block /// Processing Id of the parent lookup of a block.
ParentLookup(PeerId), ParentLookup(PeerId, Hash256),
} }
pub fn handle_chain_segment<T: BeaconChainTypes>( pub fn handle_chain_segment<T: BeaconChainTypes>(
@ -71,7 +71,7 @@ pub fn handle_chain_segment<T: BeaconChainTypes>(
}); });
} }
// this a parent lookup request from the sync manager // this a parent lookup request from the sync manager
ProcessId::ParentLookup(peer_id) => { ProcessId::ParentLookup(peer_id, chain_head) => {
debug!( debug!(
log, "Processing parent lookup"; log, "Processing parent lookup";
"last_peer_id" => format!("{}", peer_id), "last_peer_id" => format!("{}", peer_id),
@ -83,7 +83,7 @@ pub fn handle_chain_segment<T: BeaconChainTypes>(
(_, Err(e)) => { (_, Err(e)) => {
warn!(log, "Parent lookup failed"; "last_peer_id" => format!("{}", peer_id), "error" => e); warn!(log, "Parent lookup failed"; "last_peer_id" => format!("{}", peer_id), "error" => e);
sync_send sync_send
.send(SyncMessage::ParentLookupFailed(peer_id)) .send(SyncMessage::ParentLookupFailed{peer_id, chain_head})
.unwrap_or_else(|_| { .unwrap_or_else(|_| {
// on failure, inform to downvote the peer // on failure, inform to downvote the peer
debug!( debug!(

View File

@ -44,6 +44,7 @@ use eth2_libp2p::rpc::{methods::MAX_REQUEST_BLOCKS, BlocksByRootRequest, Goodbye
use eth2_libp2p::types::NetworkGlobals; use eth2_libp2p::types::NetworkGlobals;
use eth2_libp2p::{PeerAction, PeerId}; use eth2_libp2p::{PeerAction, PeerId};
use fnv::FnvHashMap; use fnv::FnvHashMap;
use lru_cache::LRUCache;
use slog::{crit, debug, error, info, trace, warn, Logger}; use slog::{crit, debug, error, info, trace, warn, Logger};
use smallvec::SmallVec; use smallvec::SmallVec;
use ssz_types::VariableList; use ssz_types::VariableList;
@ -105,8 +106,13 @@ pub enum SyncMessage<T: EthSpec> {
result: BatchProcessResult, result: BatchProcessResult,
}, },
/// A parent lookup has failed for a block given by this `peer_id`. /// A parent lookup has failed.
ParentLookupFailed(PeerId), ParentLookupFailed {
/// The head of the chain of blocks that failed to process.
chain_head: Hash256,
/// The peer that instigated the chain lookup.
peer_id: PeerId,
},
} }
/// The result of processing a multiple blocks (a chain segment). /// The result of processing a multiple blocks (a chain segment).
@ -161,6 +167,9 @@ pub struct SyncManager<T: BeaconChainTypes> {
/// A collection of parent block lookups. /// A collection of parent block lookups.
parent_queue: SmallVec<[ParentRequests<T::EthSpec>; 3]>, parent_queue: SmallVec<[ParentRequests<T::EthSpec>; 3]>,
/// A cache of failed chain lookups to prevent duplicate searches.
failed_chains: LRUCache<Hash256>,
/// A collection of block hashes being searched for and a flag indicating if a result has been /// A collection of block hashes being searched for and a flag indicating if a result has been
/// received or not. /// received or not.
/// ///
@ -222,6 +231,7 @@ pub fn spawn<T: BeaconChainTypes>(
network_globals, network_globals,
input_channel: sync_recv, input_channel: sync_recv,
parent_queue: SmallVec::new(), parent_queue: SmallVec::new(),
failed_chains: LRUCache::new(500),
single_block_lookups: FnvHashMap::default(), single_block_lookups: FnvHashMap::default(),
log: log.clone(), log: log.clone(),
beacon_processor_send, beacon_processor_send,
@ -351,6 +361,22 @@ impl<T: BeaconChainTypes> SyncManager<T> {
return; return;
} }
}; };
// check if the parent of this block isn't in our failed cache. If it is, this
// chain should be dropped and the peer downscored.
if self.failed_chains.contains(&block.message.parent_root) {
debug!(self.log, "Parent chain ignored due to past failure"; "block" => format!("{:?}", block.message.parent_root), "slot" => block.message.slot);
if !parent_request.downloaded_blocks.is_empty() {
// Add the root block to failed chains
self.failed_chains
.insert(parent_request.downloaded_blocks[0].canonical_root());
} else {
crit!(self.log, "Parent chain has no blocks");
}
self.network
.report_peer(peer_id, PeerAction::MidToleranceError);
return;
}
// add the block to response // add the block to response
parent_request.downloaded_blocks.push(block); parent_request.downloaded_blocks.push(block);
// queue for processing // queue for processing
@ -510,6 +536,15 @@ impl<T: BeaconChainTypes> SyncManager<T> {
} }
} }
let block_root = block.canonical_root();
// If this block or it's parent is part of a known failed chain, ignore it.
if self.failed_chains.contains(&block.message.parent_root)
|| self.failed_chains.contains(&block_root)
{
debug!(self.log, "Block is from a past failed chain. Dropping"; "block_root" => format!("{:?}", block_root), "block_slot" => block.message.slot);
return;
}
// Make sure this block is not already being searched for // Make sure this block is not already being searched for
// NOTE: Potentially store a hashset of blocks for O(1) lookups // NOTE: Potentially store a hashset of blocks for O(1) lookups
for parent_req in self.parent_queue.iter() { for parent_req in self.parent_queue.iter() {
@ -697,6 +732,8 @@ impl<T: BeaconChainTypes> SyncManager<T> {
// If the last block in the queue has an unknown parent, we continue the parent // If the last block in the queue has an unknown parent, we continue the parent
// lookup-search. // lookup-search.
let chain_block_hash = parent_request.downloaded_blocks[0].canonical_root();
let newest_block = parent_request let newest_block = parent_request
.downloaded_blocks .downloaded_blocks
.pop() .pop()
@ -715,8 +752,10 @@ impl<T: BeaconChainTypes> SyncManager<T> {
self.request_parent(parent_request); self.request_parent(parent_request);
} }
Ok(_) | Err(BlockError::BlockIsAlreadyKnown { .. }) => { Ok(_) | Err(BlockError::BlockIsAlreadyKnown { .. }) => {
let process_id = let process_id = ProcessId::ParentLookup(
ProcessId::ParentLookup(parent_request.last_submitted_peer.clone()); parent_request.last_submitted_peer.clone(),
chain_block_hash,
);
let blocks = parent_request.downloaded_blocks; let blocks = parent_request.downloaded_blocks;
match self match self
@ -742,6 +781,10 @@ impl<T: BeaconChainTypes> SyncManager<T> {
"outcome" => format!("{:?}", outcome), "outcome" => format!("{:?}", outcome),
"last_peer" => parent_request.last_submitted_peer.to_string(), "last_peer" => parent_request.last_submitted_peer.to_string(),
); );
// Add this chain to cache of failed chains
self.failed_chains.insert(chain_block_hash);
// This currently can be a host of errors. We permit this due to the partial // This currently can be a host of errors. We permit this due to the partial
// ambiguity. // ambiguity.
// TODO: Refine the error types and score the peer appropriately. // TODO: Refine the error types and score the peer appropriately.
@ -764,8 +807,17 @@ impl<T: BeaconChainTypes> SyncManager<T> {
|| parent_request.downloaded_blocks.len() >= PARENT_DEPTH_TOLERANCE || parent_request.downloaded_blocks.len() >= PARENT_DEPTH_TOLERANCE
{ {
let error = if parent_request.failed_attempts >= PARENT_FAIL_TOLERANCE { let error = if parent_request.failed_attempts >= PARENT_FAIL_TOLERANCE {
// This is a peer-specific error and the chain could be continued with another
// peer. We don't consider this chain a failure and prevent retries with another
// peer.
"too many failed attempts" "too many failed attempts"
} else { } else {
if !parent_request.downloaded_blocks.is_empty() {
self.failed_chains
.insert(parent_request.downloaded_blocks[0].canonical_root());
} else {
crit!(self.log, "Parent lookup has no blocks");
}
"reached maximum lookup-depth" "reached maximum lookup-depth"
}; };
@ -774,6 +826,11 @@ impl<T: BeaconChainTypes> SyncManager<T> {
"ancestors_found" => parent_request.downloaded_blocks.len(), "ancestors_found" => parent_request.downloaded_blocks.len(),
"reason" => error "reason" => error
); );
// Downscore the peer.
self.network.report_peer(
parent_request.last_submitted_peer,
PeerAction::LowToleranceError,
);
return; // drop the request return; // drop the request
} }
@ -854,12 +911,13 @@ impl<T: BeaconChainTypes> SyncManager<T> {
result, result,
); );
} }
SyncMessage::ParentLookupFailed(peer_id) => { SyncMessage::ParentLookupFailed {
chain_head,
peer_id,
} => {
// A peer sent an object (block or attestation) that referenced a parent. // A peer sent an object (block or attestation) that referenced a parent.
// On request for this parent the peer indicated it did not have this // The processing of this chain failed.
// block. self.failed_chains.insert(chain_head);
// This is not fatal. Peer's could prune old blocks so we moderately
// tolerate this behaviour.
self.network self.network
.report_peer(peer_id, PeerAction::MidToleranceError); .report_peer(peer_id, PeerAction::MidToleranceError);
} }

View File

@ -0,0 +1,8 @@
[package]
name = "lru_cache"
version = "0.1.0"
authors = ["Sigma Prime <contact@sigmaprime.io>"]
edition = "2018"
[dependencies]
fnv = "1.0.7"

View File

@ -0,0 +1,7 @@
//! A library to provide fast and efficient LRU Cache's without updating.
mod space;
mod time;
pub use space::LRUCache;
pub use time::LRUTimeCache;

View File

@ -0,0 +1,93 @@
///! This implements a time-based LRU cache for fast checking of duplicates
use fnv::FnvHashSet;
use std::collections::VecDeque;
/// Cache that stores keys until the size is used up. Does not update elements for efficiency.
pub struct LRUCache<Key>
where
Key: Eq + std::hash::Hash + Clone,
{
/// The duplicate cache.
map: FnvHashSet<Key>,
/// An ordered list of keys by order.
list: VecDeque<Key>,
// The max size of the cache,
size: usize,
}
impl<Key> LRUCache<Key>
where
Key: Eq + std::hash::Hash + Clone,
{
pub fn new(size: usize) -> Self {
LRUCache {
map: FnvHashSet::default(),
list: VecDeque::new(),
size,
}
}
/// Determines if the key is in the cache.
pub fn contains(&self, key: &Key) -> bool {
self.map.contains(key)
}
// Inserts new elements and removes any expired elements.
//
// If the key was not present this returns `true`. If the value was already present this
// returns `false`.
pub fn insert(&mut self, key: Key) -> bool {
// check the cache before removing elements
let result = self.map.insert(key.clone());
// add the new key to the list, if it doesn't already exist.
if result {
self.list.push_back(key);
}
// remove any overflow keys
self.update();
result
}
/// Removes any expired elements from the cache.
fn update(&mut self) {
// remove any expired results
for _ in 0..self.map.len().saturating_sub(self.size) {
if let Some(key) = self.list.pop_front() {
self.map.remove(&key);
}
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn cache_added_entries_exist() {
let mut cache = LRUCache::new(5);
cache.insert("t");
cache.insert("e");
// Should report that 't' and 't' already exists
assert!(!cache.insert("t"));
assert!(!cache.insert("e"));
}
#[test]
fn cache_entries_get_removed() {
let mut cache = LRUCache::new(2);
cache.insert("t");
assert!(!cache.insert("t"));
cache.insert("e");
assert!(!cache.insert("e"));
// add another element to clear the first key
cache.insert("s");
assert!(!cache.insert("s"));
// should be removed from the cache
assert!(cache.insert("t"));
}
}

View File

@ -0,0 +1,126 @@
///! This implements a time-based LRU cache for fast checking of duplicates
use fnv::FnvHashSet;
use std::collections::VecDeque;
use std::time::{Duration, Instant};
struct Element<Key> {
/// The key being inserted.
key: Key,
/// The instant the key was inserted.
inserted: Instant,
}
pub struct LRUTimeCache<Key> {
/// The duplicate cache.
map: FnvHashSet<Key>,
/// An ordered list of keys by insert time.
list: VecDeque<Element<Key>>,
/// The time elements remain in the cache.
ttl: Duration,
}
impl<Key> LRUTimeCache<Key>
where
Key: Eq + std::hash::Hash + Clone,
{
pub fn new(ttl: Duration) -> Self {
LRUTimeCache {
map: FnvHashSet::default(),
list: VecDeque::new(),
ttl,
}
}
// Inserts new elements and removes any expired elements.
//
// If the key was not present this returns `true`. If the value was already present this
// returns `false`.
pub fn insert_update(&mut self, key: Key) -> bool {
// check the cache before removing elements
let result = self.map.insert(key.clone());
let now = Instant::now();
// remove any expired results
while let Some(element) = self.list.pop_front() {
if element.inserted + self.ttl > now {
self.list.push_front(element);
break;
}
self.map.remove(&element.key);
}
// add the new key to the list, if it doesn't already exist.
if result {
self.list.push_back(Element { key, inserted: now });
}
result
}
// Inserts new element does not expire old elements.
//
// If the key was not present this returns `true`. If the value was already present this
// returns `false`.
pub fn insert(&mut self, key: Key) -> bool {
// check the cache before removing elements
let result = self.map.insert(key.clone());
// add the new key to the list, if it doesn't already exist.
if result {
self.list.push_back(Element {
key,
inserted: Instant::now(),
});
}
result
}
/// Removes any expired elements from the cache.
pub fn update(&mut self) {
let now = Instant::now();
// remove any expired results
while let Some(element) = self.list.pop_front() {
if element.inserted + self.ttl > now {
self.list.push_front(element);
break;
}
self.map.remove(&element.key);
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn cache_added_entries_exist() {
let mut cache = LRUTimeCache::new(Duration::from_secs(10));
cache.insert("t");
cache.insert("e");
// Should report that 't' and 't' already exists
assert!(!cache.insert("t"));
assert!(!cache.insert("e"));
}
#[test]
fn cache_entries_expire() {
let mut cache = LRUTimeCache::new(Duration::from_millis(100));
cache.insert_update("t");
assert!(!cache.insert_update("t"));
cache.insert_update("e");
assert!(!cache.insert_update("t"));
assert!(!cache.insert_update("e"));
// sleep until cache expiry
std::thread::sleep(Duration::from_millis(101));
// add another element to clear previous cache
cache.insert_update("s");
// should be removed from the cache
assert!(cache.insert_update("t"));
}
}