p2p, p2p/discover: add dial metrics (#27621)

This PR adds metrics for p2p dialing, which gives us visibility into the quality of the dial
candidates returned by our discovery methods.
This commit is contained in:
lightclient 2023-07-06 08:20:31 -06:00 committed by GitHub
parent ea782809f7
commit cbf2579691
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 277 additions and 51 deletions

View File

@ -38,6 +38,7 @@ import (
"github.com/ethereum/go-ethereum/ethdb"
"github.com/ethereum/go-ethereum/event"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
"github.com/ethereum/go-ethereum/p2p"
)
@ -424,6 +425,13 @@ func (h *handler) runSnapExtension(peer *snap.Peer, handler snap.Handler) error
defer h.peerWG.Done()
if err := h.peers.registerSnapExtension(peer); err != nil {
if metrics.Enabled {
if peer.Inbound() {
snap.IngressRegistrationErrorMeter.Mark(1)
} else {
snap.EgressRegistrationErrorMeter.Mark(1)
}
}
peer.Log().Warn("Snapshot extension registration failed", "err", err)
return err
}

View File

@ -17,12 +17,14 @@
package eth
import (
"errors"
"fmt"
"math/big"
"time"
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/core/forkid"
"github.com/ethereum/go-ethereum/metrics"
"github.com/ethereum/go-ethereum/p2p"
)
@ -59,9 +61,11 @@ func (p *Peer) Handshake(network uint64, td *big.Int, head common.Hash, genesis
select {
case err := <-errc:
if err != nil {
markError(p, err)
return err
}
case <-timeout.C:
markError(p, p2p.DiscReadTimeout)
return p2p.DiscReadTimeout
}
}
@ -105,3 +109,25 @@ func (p *Peer) readStatus(network uint64, status *StatusPacket, genesis common.H
}
return nil
}
// markError registers the error with the corresponding handshake metric for
// the peer's connection direction. It is a no-op when metrics are disabled.
//
// Errors are classified with errors.Is rather than by comparing the result of
// errors.Unwrap: errors.Unwrap returns nil for an error that does not wrap
// another one, so a bare sentinel such as the p2p.DiscReadTimeout handed in by
// Handshake on timeout would never match its case and would be miscounted as
// a generic peer error. errors.Is matches both bare and wrapped sentinels.
func markError(p *Peer, err error) {
	if !metrics.Enabled {
		return
	}
	m := meters.get(p.Inbound())
	switch {
	case errors.Is(err, errNetworkIDMismatch):
		m.networkIDMismatch.Mark(1)
	case errors.Is(err, errProtocolVersionMismatch):
		m.protocolVersionMismatch.Mark(1)
	case errors.Is(err, errGenesisMismatch):
		m.genesisMismatch.Mark(1)
	case errors.Is(err, errForkIDRejected):
		m.forkidRejected.Mark(1)
	case errors.Is(err, p2p.DiscReadTimeout):
		m.timeoutError.Mark(1)
	default:
		// Anything else is attributed to incorrect peer behaviour.
		m.peerError.Mark(1)
	}
}

View File

@ -0,0 +1,81 @@
// Copyright 2023 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package eth
import "github.com/ethereum/go-ethereum/metrics"
// meters stores ingress and egress handshake meters. It is populated by the
// package init function and read through its get method.
var meters bidirectionalMeters
// bidirectionalMeters stores ingress and egress handshake meters, one
// hsMeters set per connection direction.
type bidirectionalMeters struct {
// ingress holds the meters returned by get(true).
ingress *hsMeters
// egress holds the meters returned by get(false).
egress *hsMeters
}
// get selects the meter set for one connection direction: the ingress set
// when ingress is true, the egress set otherwise.
func (h *bidirectionalMeters) get(ingress bool) *hsMeters {
	if !ingress {
		return h.egress
	}
	return h.ingress
}
// hsMeters is a collection of meters which track metrics related to the
// eth subprotocol handshake. Each field counts one class of handshake
// failure, as classified by markError.
type hsMeters struct {
// peerError measures the number of errors related to incorrect peer
// behaviour, such as invalid message code, size, encoding, etc. It is also
// the fallback for any handshake error markError does not recognise.
peerError metrics.Meter
// timeoutError measures the number of handshake timeouts.
timeoutError metrics.Meter
// networkIDMismatch measures the number of network id mismatch errors.
networkIDMismatch metrics.Meter
// protocolVersionMismatch measures the number of protocol version
// mismatch errors.
protocolVersionMismatch metrics.Meter
// genesisMismatch measures the number of genesis mismatch errors.
genesisMismatch metrics.Meter
// forkidRejected measures the number of handshakes that failed because
// the fork ID was rejected.
forkidRejected metrics.Meter
}
// newHandshakeMeters builds a full set of handshake meters, registering each
// one under the metric name formed by appending its suffix to base.
func newHandshakeMeters(base string) *hsMeters {
	// register creates and registers a single meter below base.
	register := func(suffix string) metrics.Meter {
		return metrics.NewRegisteredMeter(base+suffix, nil)
	}
	hs := new(hsMeters)
	hs.peerError = register("error/peer")
	hs.timeoutError = register("error/timeout")
	hs.networkIDMismatch = register("error/network")
	hs.protocolVersionMismatch = register("error/version")
	hs.genesisMismatch = register("error/genesis")
	hs.forkidRejected = register("error/forkid")
	return hs
}
func init() {
meters = bidirectionalMeters{
ingress: newHandshakeMeters("eth/protocols/eth/ingress/handshake/"),
egress: newHandshakeMeters("eth/protocols/eth/egress/handshake/"),
}
}

View File

@ -0,0 +1,29 @@
// Copyright 2023 The go-ethereum Authors
// This file is part of the go-ethereum library.
//
// The go-ethereum library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The go-ethereum library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
package snap
import (
metrics "github.com/ethereum/go-ethereum/metrics"
)
var (
// Metric names under which the snap registration error meters are registered.
ingressRegistrationErrorName = "eth/protocols/snap/ingress/registration/error"
egressRegistrationErrorName = "eth/protocols/snap/egress/registration/error"
// IngressRegistrationErrorMeter is the meter registered under
// ingressRegistrationErrorName; it is meant to count snap extension
// registration failures for inbound peers.
IngressRegistrationErrorMeter = metrics.NewRegisteredMeter(ingressRegistrationErrorName, nil)
// EgressRegistrationErrorMeter is the meter registered under
// egressRegistrationErrorName; it is meant to count snap extension
// registration failures for outbound peers.
EgressRegistrationErrorMeter = metrics.NewRegisteredMeter(egressRegistrationErrorName, nil)
)

View File

@ -521,13 +521,14 @@ func (t *dialTask) resolve(d *dialScheduler) bool {
// dial performs the actual connection attempt.
func (t *dialTask) dial(d *dialScheduler, dest *enode.Node) error {
dialMeter.Mark(1)
fd, err := d.dialer.Dial(d.ctx, t.dest)
if err != nil {
d.log.Trace("Dial error", "id", t.dest.ID(), "addr", nodeAddr(t.dest), "conn", t.flags, "err", cleanupDialErr(err))
dialConnectionError.Mark(1)
return &dialError{err}
}
mfd := newMeteredConn(fd, false, &net.TCPAddr{IP: dest.IP(), Port: dest.TCP()})
return d.setupFunc(mfd, t.flags, dest)
return d.setupFunc(newMeteredConn(fd), t.flags, dest)
}
func (t *dialTask) String() string {

View File

@ -17,6 +17,7 @@
package discover
import (
"fmt"
"net"
"github.com/ethereum/go-ethereum/metrics"
@ -32,10 +33,17 @@ const (
)
var (
// bucketsCounter holds one counter per table bucket, indexed by bucket
// index. It is filled in by init.
bucketsCounter []metrics.Counter
// ingressTrafficMeter is the meter registered under ingressMeterName.
ingressTrafficMeter = metrics.NewRegisteredMeter(ingressMeterName, nil)
// egressTrafficMeter is the meter registered under egressMeterName.
egressTrafficMeter = metrics.NewRegisteredMeter(egressMeterName, nil)
)
func init() {
for i := 0; i < nBuckets; i++ {
bucketsCounter = append(bucketsCounter, metrics.NewRegisteredCounter(fmt.Sprintf("%s/bucket/%d/count", moduleName, i), nil))
}
}
// meteredUdpConn is a wrapper around a net.UDPConn that meters both the
// inbound and outbound network traffic.
type meteredUdpConn struct {

View File

@ -34,6 +34,7 @@ import (
"github.com/ethereum/go-ethereum/common"
"github.com/ethereum/go-ethereum/log"
"github.com/ethereum/go-ethereum/metrics"
"github.com/ethereum/go-ethereum/p2p/enode"
"github.com/ethereum/go-ethereum/p2p/netutil"
)
@ -80,7 +81,8 @@ type Table struct {
closeReq chan struct{}
closed chan struct{}
nodeAddedHook func(*node) // for testing
nodeAddedHook func(*bucket, *node)
nodeRemovedHook func(*bucket, *node)
}
// transport is implemented by the UDP transports.
@ -98,6 +100,7 @@ type bucket struct {
entries []*node // live entries, sorted by time of last contact
replacements []*node // recently seen nodes to be used if revalidation fails
ips netutil.DistinctNetSet
index int
}
func newTable(t transport, db *enode.DB, cfg Config) (*Table, error) {
@ -119,6 +122,7 @@ func newTable(t transport, db *enode.DB, cfg Config) (*Table, error) {
}
for i := range tab.buckets {
tab.buckets[i] = &bucket{
index: i,
ips: netutil.DistinctNetSet{Subnet: bucketSubnet, Limit: bucketIPLimit},
}
}
@ -128,6 +132,22 @@ func newTable(t transport, db *enode.DB, cfg Config) (*Table, error) {
return tab, nil
}
// newMeteredTable creates a table via newTable and, when metrics are enabled,
// installs node added/removed hooks that keep the per-bucket counters in
// bucketsCounter in step with the bucket contents.
func newMeteredTable(t transport, db *enode.DB, cfg Config) (*Table, error) {
	tab, err := newTable(t, db, cfg)
	if err != nil {
		return nil, err
	}
	if !metrics.Enabled {
		return tab, nil
	}
	tab.nodeAddedHook = func(b *bucket, _ *node) {
		bucketsCounter[b.index].Inc(1)
	}
	tab.nodeRemovedHook = func(b *bucket, _ *node) {
		bucketsCounter[b.index].Dec(1)
	}
	return tab, nil
}
// Nodes returns all nodes contained in the table.
func (tab *Table) Nodes() []*enode.Node {
if !tab.isInitDone() {
@ -495,7 +515,7 @@ func (tab *Table) addSeenNode(n *node) {
n.addedAt = time.Now()
if tab.nodeAddedHook != nil {
tab.nodeAddedHook(n)
tab.nodeAddedHook(b, n)
}
}
@ -539,7 +559,7 @@ func (tab *Table) addVerifiedNode(n *node) {
n.addedAt = time.Now()
if tab.nodeAddedHook != nil {
tab.nodeAddedHook(n)
tab.nodeAddedHook(b, n)
}
}
@ -638,8 +658,16 @@ func (tab *Table) bumpInBucket(b *bucket, n *node) bool {
}
// deleteInBucket removes n from the live entries of b, releases its IP and
// notifies the node removed hook, if one is set.
func (tab *Table) deleteInBucket(b *bucket, n *node) {
	// Bail out when the node is not in the bucket, so the removed hook
	// fires at most once per node.
	if present := contains(b.entries, n.ID()); !present {
		return
	}
	b.entries = deleteNode(b.entries, n)
	tab.removeIP(b, n.IP())
	if hook := tab.nodeRemovedHook; hook != nil {
		hook(b, n)
	}
}
func contains(ns []*node, id enode.ID) bool {

View File

@ -142,7 +142,7 @@ func ListenV4(c UDPConn, ln *enode.LocalNode, cfg Config) (*UDPv4, error) {
log: cfg.Log,
}
tab, err := newTable(t, ln.Database(), cfg)
tab, err := newMeteredTable(t, ln.Database(), cfg)
if err != nil {
return nil, err
}

View File

@ -394,7 +394,7 @@ func TestUDPv4_pingMatchIP(t *testing.T) {
func TestUDPv4_successfulPing(t *testing.T) {
test := newUDPTest(t)
added := make(chan *node, 1)
test.table.nodeAddedHook = func(n *node) { added <- n }
test.table.nodeAddedHook = func(b *bucket, n *node) { added <- n }
defer test.close()
// The remote side sends a ping packet to initiate the exchange.

View File

@ -174,7 +174,7 @@ func newUDPv5(conn UDPConn, ln *enode.LocalNode, cfg Config) (*UDPv5, error) {
cancelCloseCtx: cancelCloseCtx,
}
t.talk = newTalkSystem(t)
tab, err := newTable(t, t.db, cfg)
tab, err := newMeteredTable(t, t.db, cfg)
if err != nil {
return nil, err
}

View File

@ -19,30 +19,86 @@
package p2p
import (
"errors"
"net"
"github.com/ethereum/go-ethereum/metrics"
)
const (
// HandleHistName is the prefix of the per-packet serving time histograms.
HandleHistName = "p2p/handle"
// ingressMeterName is the prefix of the per-packet inbound metrics.
ingressMeterName = "p2p/ingress"
// egressMeterName is the prefix of the per-packet outbound metrics.
egressMeterName = "p2p/egress"
// HandleHistName is the prefix of the per-packet serving time histograms.
HandleHistName = "p2p/handle"
)
var (
ingressConnectMeter = metrics.NewRegisteredMeter("p2p/serves", nil)
ingressTrafficMeter = metrics.NewRegisteredMeter(ingressMeterName, nil)
egressConnectMeter = metrics.NewRegisteredMeter("p2p/dials", nil)
egressTrafficMeter = metrics.NewRegisteredMeter(egressMeterName, nil)
activePeerGauge = metrics.NewRegisteredGauge("p2p/peers", nil)
activePeerGauge metrics.Gauge = metrics.NilGauge{}
ingressTrafficMeter = metrics.NewRegisteredMeter("p2p/ingress", nil)
egressTrafficMeter = metrics.NewRegisteredMeter("p2p/egress", nil)
// general ingress/egress connection meters
serveMeter metrics.Meter = metrics.NilMeter{}
serveSuccessMeter metrics.Meter = metrics.NilMeter{}
dialMeter metrics.Meter = metrics.NilMeter{}
dialSuccessMeter metrics.Meter = metrics.NilMeter{}
dialConnectionError metrics.Meter = metrics.NilMeter{}
// handshake error meters
dialTooManyPeers = metrics.NewRegisteredMeter("p2p/dials/error/saturated", nil)
dialAlreadyConnected = metrics.NewRegisteredMeter("p2p/dials/error/known", nil)
dialSelf = metrics.NewRegisteredMeter("p2p/dials/error/self", nil)
dialUselessPeer = metrics.NewRegisteredMeter("p2p/dials/error/useless", nil)
dialUnexpectedIdentity = metrics.NewRegisteredMeter("p2p/dials/error/id/unexpected", nil)
dialEncHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/enc", nil)
dialProtoHandshakeError = metrics.NewRegisteredMeter("p2p/dials/error/rlpx/proto", nil)
)
// init swaps the nil placeholder gauge and meters for registered ones when
// metrics are enabled; otherwise the placeholders are left in place.
func init() {
	if metrics.Enabled {
		activePeerGauge = metrics.NewRegisteredGauge("p2p/peers", nil)
		serveMeter = metrics.NewRegisteredMeter("p2p/serves", nil)
		serveSuccessMeter = metrics.NewRegisteredMeter("p2p/serves/success", nil)
		dialMeter = metrics.NewRegisteredMeter("p2p/dials", nil)
		dialSuccessMeter = metrics.NewRegisteredMeter("p2p/dials/success", nil)
		dialConnectionError = metrics.NewRegisteredMeter("p2p/dials/error/connection", nil)
	}
}
// markDialError bumps the dial error meter that corresponds to an error
// returned while setting up a dialed connection. Errors that match none of
// the known causes are not counted. It is a no-op when metrics are disabled.
func markDialError(err error) {
	if !metrics.Enabled {
		return
	}
	// Classify by the wrapped error when there is one, otherwise by the
	// error itself.
	cause := err
	if inner := errors.Unwrap(err); inner != nil {
		cause = inner
	}
	switch cause {
	case DiscTooManyPeers:
		dialTooManyPeers.Mark(1)
	case DiscAlreadyConnected:
		dialAlreadyConnected.Mark(1)
	case DiscSelf:
		dialSelf.Mark(1)
	case DiscUselessPeer:
		dialUselessPeer.Mark(1)
	case DiscUnexpectedIdentity:
		dialUnexpectedIdentity.Mark(1)
	case errEncHandshakeError:
		dialEncHandshakeError.Mark(1)
	case errProtoHandshakeError:
		dialProtoHandshakeError.Mark(1)
	}
}
// meteredConn is a wrapper around a net.Conn that meters both the
// inbound and outbound network traffic.
type meteredConn struct {
@ -52,18 +108,10 @@ type meteredConn struct {
// newMeteredConn wraps conn so that its inbound and outbound traffic is
// metered. If the metrics system is disabled, the function returns the
// original connection.
func newMeteredConn(conn net.Conn, ingress bool, addr *net.TCPAddr) net.Conn {
// Short circuit if metrics are disabled
func newMeteredConn(conn net.Conn) net.Conn {
if !metrics.Enabled {
return conn
}
// Bump the connection counters and wrap the connection
if ingress {
ingressConnectMeter.Mark(1)
} else {
egressConnectMeter.Mark(1)
}
activePeerGauge.Inc(1)
return &meteredConn{Conn: conn}
}
@ -82,13 +130,3 @@ func (c *meteredConn) Write(b []byte) (n int, err error) {
egressTrafficMeter.Mark(int64(n))
return n, err
}
// Close delegates a close operation to the underlying connection, unregisters
// the peer from the traffic registries and emits close event.
func (c *meteredConn) Close() error {
err := c.Conn.Close()
if err == nil {
activePeerGauge.Dec(1)
}
return err
}

View File

@ -64,7 +64,11 @@ const (
frameWriteTimeout = 20 * time.Second
)
var errServerStopped = errors.New("server stopped")
var (
// errServerStopped is a sentinel error; presumably reported for
// connections handled after the server has stopped — confirm against callers.
errServerStopped = errors.New("server stopped")
// errEncHandshakeError is the sentinel that setupConn wraps around
// failures of the RLPx encryption handshake, so that markDialError can
// classify them.
errEncHandshakeError = errors.New("rlpx enc error")
// errProtoHandshakeError is the sentinel that setupConn wraps around
// failures of the protocol handshake, so that markDialError can classify
// them.
errProtoHandshakeError = errors.New("rlpx proto error")
)
// Config holds Server options.
type Config struct {
@ -772,7 +776,11 @@ running:
srv.dialsched.peerAdded(c)
if p.Inbound() {
inboundCount++
serveSuccessMeter.Mark(1)
} else {
dialSuccessMeter.Mark(1)
}
activePeerGauge.Inc(1)
}
c.cont <- err
@ -785,6 +793,7 @@ running:
if pd.Inbound() {
inboundCount--
}
activePeerGauge.Dec(1)
}
}
@ -894,11 +903,8 @@ func (srv *Server) listenLoop() {
continue
}
if remoteIP != nil {
var addr *net.TCPAddr
if tcp, ok := fd.RemoteAddr().(*net.TCPAddr); ok {
addr = tcp
}
fd = newMeteredConn(fd, true, addr)
fd = newMeteredConn(fd)
serveMeter.Mark(1)
srv.log.Trace("Accepted connection", "addr", fd.RemoteAddr())
}
go func() {
@ -939,6 +945,9 @@ func (srv *Server) SetupConn(fd net.Conn, flags connFlag, dialDest *enode.Node)
err := srv.setupConn(c, flags, dialDest)
if err != nil {
if !c.is(inboundConn) {
markDialError(err)
}
c.close(err)
}
return err
@ -957,7 +966,7 @@ func (srv *Server) setupConn(c *conn, flags connFlag, dialDest *enode.Node) erro
if dialDest != nil {
dialPubkey := new(ecdsa.PublicKey)
if err := dialDest.Load((*enode.Secp256k1)(dialPubkey)); err != nil {
err = errors.New("dial destination doesn't have a secp256k1 public key")
err = fmt.Errorf("%w: dial destination doesn't have a secp256k1 public key", errEncHandshakeError)
srv.log.Trace("Setting up connection failed", "addr", c.fd.RemoteAddr(), "conn", c.flags, "err", err)
return err
}
@ -967,7 +976,7 @@ func (srv *Server) setupConn(c *conn, flags connFlag, dialDest *enode.Node) erro
remotePubkey, err := c.doEncHandshake(srv.PrivateKey)
if err != nil {
srv.log.Trace("Failed RLPx handshake", "addr", c.fd.RemoteAddr(), "conn", c.flags, "err", err)
return err
return fmt.Errorf("%w: %v", errEncHandshakeError, err)
}
if dialDest != nil {
c.node = dialDest
@ -985,7 +994,7 @@ func (srv *Server) setupConn(c *conn, flags connFlag, dialDest *enode.Node) erro
phs, err := c.doProtoHandshake(srv.ourHandshake)
if err != nil {
clog.Trace("Failed p2p handshake", "err", err)
return err
return fmt.Errorf("%w: %v", errProtoHandshakeError, err)
}
if id := c.node.ID(); !bytes.Equal(crypto.Keccak256(phs.ID), id[:]) {
clog.Trace("Wrong devp2p handshake identity", "phsid", hex.EncodeToString(phs.ID))

View File

@ -370,8 +370,6 @@ func TestServerSetupConn(t *testing.T) {
clientkey, srvkey = newkey(), newkey()
clientpub = &clientkey.PublicKey
srvpub = &srvkey.PublicKey
fooErr = errors.New("foo")
readErr = errors.New("read error")
)
tests := []struct {
dontstart bool
@ -389,10 +387,10 @@ func TestServerSetupConn(t *testing.T) {
wantCloseErr: errServerStopped,
},
{
tt: &setupTransport{pubkey: clientpub, encHandshakeErr: readErr},
tt: &setupTransport{pubkey: clientpub, encHandshakeErr: errEncHandshakeError},
flags: inboundConn,
wantCalls: "doEncHandshake,close,",
wantCloseErr: readErr,
wantCloseErr: errEncHandshakeError,
},
{
tt: &setupTransport{pubkey: clientpub, phs: protoHandshake{ID: randomID().Bytes()}},
@ -402,11 +400,11 @@ func TestServerSetupConn(t *testing.T) {
wantCloseErr: DiscUnexpectedIdentity,
},
{
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: fooErr},
tt: &setupTransport{pubkey: clientpub, protoHandshakeErr: errProtoHandshakeError},
dialDest: enode.NewV4(clientpub, nil, 0, 0),
flags: dynDialedConn,
wantCalls: "doEncHandshake,doProtoHandshake,close,",
wantCloseErr: fooErr,
wantCloseErr: errProtoHandshakeError,
},
{
tt: &setupTransport{pubkey: srvpub, phs: protoHandshake{ID: crypto.FromECDSAPub(srvpub)[1:]}},