16e4d0e005
This change extends the peer metrics collection: - traces the life-cycle of the peers - meters the peer traffic separately for every peer - creates event feed for the peer events - emits the peer events
434 lines
12 KiB
Go
434 lines
12 KiB
Go
// Copyright 2015 The go-ethereum Authors
|
|
// This file is part of the go-ethereum library.
|
|
//
|
|
// The go-ethereum library is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Lesser General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// The go-ethereum library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package p2p
|
|
|
|
import (
|
|
"container/heap"
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"time"
|
|
|
|
"github.com/ethereum/go-ethereum/log"
|
|
"github.com/ethereum/go-ethereum/p2p/enode"
|
|
"github.com/ethereum/go-ethereum/p2p/netutil"
|
|
)
|
|
|
|
const (
	// dialHistoryExpiration is how long a finished dial stays in the dial
	// history. A node is not dialed again while it is still in the history.
	dialHistoryExpiration = 30 * time.Second

	// lookupInterval is the minimum spacing between two discovery
	// lookups; lookups are throttled to run once per interval.
	lookupInterval = 4 * time.Second

	// fallbackInterval is the time that must have passed since the dialer
	// was first used before the bootnodes are dialed when no peers are
	// connected.
	fallbackInterval = 20 * time.Second

	// Endpoint resolution is throttled with bounded backoff: the delay
	// starts at initialResolveDelay and never exceeds maxResolveDelay.
	initialResolveDelay = 60 * time.Second
	maxResolveDelay     = time.Hour
)
|
|
|
|
// NodeDialer is used to connect to nodes in the network, typically by using
|
|
// an underlying net.Dialer but also using net.Pipe in tests
|
|
type NodeDialer interface {
|
|
Dial(*enode.Node) (net.Conn, error)
|
|
}
|
|
|
|
// TCPDialer is the NodeDialer implementation that creates TCP connections
// to nodes through the embedded net.Dialer.
type TCPDialer struct {
	*net.Dialer
}
|
|
|
|
// Dial creates a TCP connection to the node
|
|
func (t TCPDialer) Dial(dest *enode.Node) (net.Conn, error) {
|
|
addr := &net.TCPAddr{IP: dest.IP(), Port: dest.TCP()}
|
|
return t.Dialer.Dial("tcp", addr.String())
|
|
}
|
|
|
|
// dialstate schedules dials and discovery lookups.
|
|
// it get's a chance to compute new tasks on every iteration
|
|
// of the main loop in Server.run.
|
|
type dialstate struct {
|
|
maxDynDials int
|
|
ntab discoverTable
|
|
netrestrict *netutil.Netlist
|
|
self enode.ID
|
|
|
|
lookupRunning bool
|
|
dialing map[enode.ID]connFlag
|
|
lookupBuf []*enode.Node // current discovery lookup results
|
|
randomNodes []*enode.Node // filled from Table
|
|
static map[enode.ID]*dialTask
|
|
hist *dialHistory
|
|
|
|
start time.Time // time when the dialer was first used
|
|
bootnodes []*enode.Node // default dials when there are no peers
|
|
}
|
|
|
|
type discoverTable interface {
|
|
Close()
|
|
Resolve(*enode.Node) *enode.Node
|
|
LookupRandom() []*enode.Node
|
|
ReadRandomNodes([]*enode.Node) int
|
|
}
|
|
|
|
// the dial history remembers recent dials.
|
|
type dialHistory []pastDial
|
|
|
|
// pastDial is an entry in the dial history.
|
|
type pastDial struct {
|
|
id enode.ID
|
|
exp time.Time
|
|
}
|
|
|
|
type task interface {
|
|
Do(*Server)
|
|
}
|
|
|
|
// A dialTask is generated for each node that is dialed. Its
|
|
// fields cannot be accessed while the task is running.
|
|
type dialTask struct {
|
|
flags connFlag
|
|
dest *enode.Node
|
|
lastResolved time.Time
|
|
resolveDelay time.Duration
|
|
}
|
|
|
|
// discoverTask runs discovery table operations.
|
|
// Only one discoverTask is active at any time.
|
|
// discoverTask.Do performs a random lookup.
|
|
type discoverTask struct {
|
|
results []*enode.Node
|
|
}
|
|
|
|
// A waitExpireTask is generated when there are no other tasks, so that
// the loop in Server.run keeps ticking. The embedded duration is how
// long the task sleeps.
type waitExpireTask struct {
	time.Duration
}
|
|
|
|
func newDialState(self enode.ID, static []*enode.Node, bootnodes []*enode.Node, ntab discoverTable, maxdyn int, netrestrict *netutil.Netlist) *dialstate {
|
|
s := &dialstate{
|
|
maxDynDials: maxdyn,
|
|
ntab: ntab,
|
|
self: self,
|
|
netrestrict: netrestrict,
|
|
static: make(map[enode.ID]*dialTask),
|
|
dialing: make(map[enode.ID]connFlag),
|
|
bootnodes: make([]*enode.Node, len(bootnodes)),
|
|
randomNodes: make([]*enode.Node, maxdyn/2),
|
|
hist: new(dialHistory),
|
|
}
|
|
copy(s.bootnodes, bootnodes)
|
|
for _, n := range static {
|
|
s.addStatic(n)
|
|
}
|
|
return s
|
|
}
|
|
|
|
func (s *dialstate) addStatic(n *enode.Node) {
|
|
// This overwrites the task instead of updating an existing
|
|
// entry, giving users the opportunity to force a resolve operation.
|
|
s.static[n.ID()] = &dialTask{flags: staticDialedConn, dest: n}
|
|
}
|
|
|
|
func (s *dialstate) removeStatic(n *enode.Node) {
|
|
// This removes a task so future attempts to connect will not be made.
|
|
delete(s.static, n.ID())
|
|
// This removes a previous dial timestamp so that application
|
|
// can force a server to reconnect with chosen peer immediately.
|
|
s.hist.remove(n.ID())
|
|
}
|
|
|
|
func (s *dialstate) newTasks(nRunning int, peers map[enode.ID]*Peer, now time.Time) []task {
|
|
if s.start.IsZero() {
|
|
s.start = now
|
|
}
|
|
|
|
var newtasks []task
|
|
addDial := func(flag connFlag, n *enode.Node) bool {
|
|
if err := s.checkDial(n, peers); err != nil {
|
|
log.Trace("Skipping dial candidate", "id", n.ID(), "addr", &net.TCPAddr{IP: n.IP(), Port: n.TCP()}, "err", err)
|
|
return false
|
|
}
|
|
s.dialing[n.ID()] = flag
|
|
newtasks = append(newtasks, &dialTask{flags: flag, dest: n})
|
|
return true
|
|
}
|
|
|
|
// Compute number of dynamic dials necessary at this point.
|
|
needDynDials := s.maxDynDials
|
|
for _, p := range peers {
|
|
if p.rw.is(dynDialedConn) {
|
|
needDynDials--
|
|
}
|
|
}
|
|
for _, flag := range s.dialing {
|
|
if flag&dynDialedConn != 0 {
|
|
needDynDials--
|
|
}
|
|
}
|
|
|
|
// Expire the dial history on every invocation.
|
|
s.hist.expire(now)
|
|
|
|
// Create dials for static nodes if they are not connected.
|
|
for id, t := range s.static {
|
|
err := s.checkDial(t.dest, peers)
|
|
switch err {
|
|
case errNotWhitelisted, errSelf:
|
|
log.Warn("Removing static dial candidate", "id", t.dest.ID, "addr", &net.TCPAddr{IP: t.dest.IP(), Port: t.dest.TCP()}, "err", err)
|
|
delete(s.static, t.dest.ID())
|
|
case nil:
|
|
s.dialing[id] = t.flags
|
|
newtasks = append(newtasks, t)
|
|
}
|
|
}
|
|
// If we don't have any peers whatsoever, try to dial a random bootnode. This
|
|
// scenario is useful for the testnet (and private networks) where the discovery
|
|
// table might be full of mostly bad peers, making it hard to find good ones.
|
|
if len(peers) == 0 && len(s.bootnodes) > 0 && needDynDials > 0 && now.Sub(s.start) > fallbackInterval {
|
|
bootnode := s.bootnodes[0]
|
|
s.bootnodes = append(s.bootnodes[:0], s.bootnodes[1:]...)
|
|
s.bootnodes = append(s.bootnodes, bootnode)
|
|
|
|
if addDial(dynDialedConn, bootnode) {
|
|
needDynDials--
|
|
}
|
|
}
|
|
// Use random nodes from the table for half of the necessary
|
|
// dynamic dials.
|
|
randomCandidates := needDynDials / 2
|
|
if randomCandidates > 0 {
|
|
n := s.ntab.ReadRandomNodes(s.randomNodes)
|
|
for i := 0; i < randomCandidates && i < n; i++ {
|
|
if addDial(dynDialedConn, s.randomNodes[i]) {
|
|
needDynDials--
|
|
}
|
|
}
|
|
}
|
|
// Create dynamic dials from random lookup results, removing tried
|
|
// items from the result buffer.
|
|
i := 0
|
|
for ; i < len(s.lookupBuf) && needDynDials > 0; i++ {
|
|
if addDial(dynDialedConn, s.lookupBuf[i]) {
|
|
needDynDials--
|
|
}
|
|
}
|
|
s.lookupBuf = s.lookupBuf[:copy(s.lookupBuf, s.lookupBuf[i:])]
|
|
// Launch a discovery lookup if more candidates are needed.
|
|
if len(s.lookupBuf) < needDynDials && !s.lookupRunning {
|
|
s.lookupRunning = true
|
|
newtasks = append(newtasks, &discoverTask{})
|
|
}
|
|
|
|
// Launch a timer to wait for the next node to expire if all
|
|
// candidates have been tried and no task is currently active.
|
|
// This should prevent cases where the dialer logic is not ticked
|
|
// because there are no pending events.
|
|
if nRunning == 0 && len(newtasks) == 0 && s.hist.Len() > 0 {
|
|
t := &waitExpireTask{s.hist.min().exp.Sub(now)}
|
|
newtasks = append(newtasks, t)
|
|
}
|
|
return newtasks
|
|
}
|
|
|
|
// Reasons for which checkDial rejects a dial candidate.
var (
	errSelf             = errors.New("is self")
	errAlreadyDialing   = errors.New("already dialing")
	errAlreadyConnected = errors.New("already connected")
	errRecentlyDialed   = errors.New("recently dialed")
	errNotWhitelisted   = errors.New("not contained in netrestrict whitelist")
)
|
|
|
|
func (s *dialstate) checkDial(n *enode.Node, peers map[enode.ID]*Peer) error {
|
|
_, dialing := s.dialing[n.ID()]
|
|
switch {
|
|
case dialing:
|
|
return errAlreadyDialing
|
|
case peers[n.ID()] != nil:
|
|
return errAlreadyConnected
|
|
case n.ID() == s.self:
|
|
return errSelf
|
|
case s.netrestrict != nil && !s.netrestrict.Contains(n.IP()):
|
|
return errNotWhitelisted
|
|
case s.hist.contains(n.ID()):
|
|
return errRecentlyDialed
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *dialstate) taskDone(t task, now time.Time) {
|
|
switch t := t.(type) {
|
|
case *dialTask:
|
|
s.hist.add(t.dest.ID(), now.Add(dialHistoryExpiration))
|
|
delete(s.dialing, t.dest.ID())
|
|
case *discoverTask:
|
|
s.lookupRunning = false
|
|
s.lookupBuf = append(s.lookupBuf, t.results...)
|
|
}
|
|
}
|
|
|
|
func (t *dialTask) Do(srv *Server) {
|
|
if t.dest.Incomplete() {
|
|
if !t.resolve(srv) {
|
|
return
|
|
}
|
|
}
|
|
err := t.dial(srv, t.dest)
|
|
if err != nil {
|
|
log.Trace("Dial error", "task", t, "err", err)
|
|
// Try resolving the ID of static nodes if dialing failed.
|
|
if _, ok := err.(*dialError); ok && t.flags&staticDialedConn != 0 {
|
|
if t.resolve(srv) {
|
|
t.dial(srv, t.dest)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// resolve attempts to find the current endpoint for the destination
|
|
// using discovery.
|
|
//
|
|
// Resolve operations are throttled with backoff to avoid flooding the
|
|
// discovery network with useless queries for nodes that don't exist.
|
|
// The backoff delay resets when the node is found.
|
|
func (t *dialTask) resolve(srv *Server) bool {
|
|
if srv.ntab == nil {
|
|
log.Debug("Can't resolve node", "id", t.dest.ID, "err", "discovery is disabled")
|
|
return false
|
|
}
|
|
if t.resolveDelay == 0 {
|
|
t.resolveDelay = initialResolveDelay
|
|
}
|
|
if time.Since(t.lastResolved) < t.resolveDelay {
|
|
return false
|
|
}
|
|
resolved := srv.ntab.Resolve(t.dest)
|
|
t.lastResolved = time.Now()
|
|
if resolved == nil {
|
|
t.resolveDelay *= 2
|
|
if t.resolveDelay > maxResolveDelay {
|
|
t.resolveDelay = maxResolveDelay
|
|
}
|
|
log.Debug("Resolving node failed", "id", t.dest.ID, "newdelay", t.resolveDelay)
|
|
return false
|
|
}
|
|
// The node was found.
|
|
t.resolveDelay = initialResolveDelay
|
|
t.dest = resolved
|
|
log.Debug("Resolved node", "id", t.dest.ID, "addr", &net.TCPAddr{IP: t.dest.IP(), Port: t.dest.TCP()})
|
|
return true
|
|
}
|
|
|
|
// dialError wraps an error returned by the dialer, so that dialTask.Do can
// tell a failed connection attempt apart from an error returned by
// SetupConn.
type dialError struct {
	error
}
|
|
|
|
// dial performs the actual connection attempt.
|
|
func (t *dialTask) dial(srv *Server, dest *enode.Node) error {
|
|
fd, err := srv.Dialer.Dial(dest)
|
|
if err != nil {
|
|
return &dialError{err}
|
|
}
|
|
mfd := newMeteredConn(fd, false, dest.IP())
|
|
return srv.SetupConn(mfd, t.flags, dest)
|
|
}
|
|
|
|
func (t *dialTask) String() string {
|
|
id := t.dest.ID()
|
|
return fmt.Sprintf("%v %x %v:%d", t.flags, id[:8], t.dest.IP(), t.dest.TCP())
|
|
}
|
|
|
|
func (t *discoverTask) Do(srv *Server) {
|
|
// newTasks generates a lookup task whenever dynamic dials are
|
|
// necessary. Lookups need to take some time, otherwise the
|
|
// event loop spins too fast.
|
|
next := srv.lastLookup.Add(lookupInterval)
|
|
if now := time.Now(); now.Before(next) {
|
|
time.Sleep(next.Sub(now))
|
|
}
|
|
srv.lastLookup = time.Now()
|
|
t.results = srv.ntab.LookupRandom()
|
|
}
|
|
|
|
func (t *discoverTask) String() string {
|
|
s := "discovery lookup"
|
|
if len(t.results) > 0 {
|
|
s += fmt.Sprintf(" (%d results)", len(t.results))
|
|
}
|
|
return s
|
|
}
|
|
|
|
func (t waitExpireTask) Do(*Server) {
|
|
time.Sleep(t.Duration)
|
|
}
|
|
func (t waitExpireTask) String() string {
|
|
return fmt.Sprintf("wait for dial hist expire (%v)", t.Duration)
|
|
}
|
|
|
|
// Use only these methods to access or modify dialHistory.
|
|
func (h dialHistory) min() pastDial {
|
|
return h[0]
|
|
}
|
|
func (h *dialHistory) add(id enode.ID, exp time.Time) {
|
|
heap.Push(h, pastDial{id, exp})
|
|
|
|
}
|
|
func (h *dialHistory) remove(id enode.ID) bool {
|
|
for i, v := range *h {
|
|
if v.id == id {
|
|
heap.Remove(h, i)
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
func (h dialHistory) contains(id enode.ID) bool {
|
|
for _, v := range h {
|
|
if v.id == id {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
func (h *dialHistory) expire(now time.Time) {
|
|
for h.Len() > 0 && h.min().exp.Before(now) {
|
|
heap.Pop(h)
|
|
}
|
|
}
|
|
|
|
// heap.Interface boilerplate
|
|
func (h dialHistory) Len() int { return len(h) }
|
|
// Less orders entries by expiration time, earliest first.
func (h dialHistory) Less(i, j int) bool {
	return h[j].exp.After(h[i].exp)
}
|
|
// Swap exchanges the entries at indexes i and j.
func (h dialHistory) Swap(i, j int) {
	tmp := h[i]
	h[i] = h[j]
	h[j] = tmp
}
|
|
func (h *dialHistory) Push(x interface{}) {
|
|
*h = append(*h, x.(pastDial))
|
|
}
|
|
func (h *dialHistory) Pop() interface{} {
|
|
old := *h
|
|
n := len(old)
|
|
x := old[n-1]
|
|
*h = old[0 : n-1]
|
|
return x
|
|
}
|