package lp2p import ( "context" "encoding/json" "net" "os" "time" host "github.com/libp2p/go-libp2p-core/host" peer "github.com/libp2p/go-libp2p-core/peer" pubsub "github.com/libp2p/go-libp2p-pubsub" pubsub_pb "github.com/libp2p/go-libp2p-pubsub/pb" blake2b "github.com/minio/blake2b-simd" ma "github.com/multiformats/go-multiaddr" "go.opencensus.io/stats" "go.uber.org/fx" "golang.org/x/xerrors" "github.com/filecoin-project/lotus/build" "github.com/filecoin-project/lotus/metrics" "github.com/filecoin-project/lotus/node/config" "github.com/filecoin-project/lotus/node/modules/dtypes" "github.com/filecoin-project/lotus/node/modules/helpers" "github.com/filecoin-project/lotus/node/modules/tracer" ) func init() { // configure larger overlay parameters pubsub.GossipSubD = 8 pubsub.GossipSubDscore = 6 pubsub.GossipSubDout = 3 pubsub.GossipSubDlo = 6 pubsub.GossipSubDhi = 12 pubsub.GossipSubDlazy = 12 pubsub.GossipSubDirectConnectInitialDelay = 30 * time.Second pubsub.GossipSubIWantFollowupTime = 5 * time.Second pubsub.GossipSubHistoryLength = 10 pubsub.GossipSubGossipFactor = 0.1 } const ( GossipScoreThreshold = -500 PublishScoreThreshold = -1000 GraylistScoreThreshold = -2500 AcceptPXScoreThreshold = 1000 OpportunisticGraftScoreThreshold = 3.5 ) func ScoreKeeper() *dtypes.ScoreKeeper { return new(dtypes.ScoreKeeper) } type PeerScoreTracker interface { UpdatePeerScore(scores map[peer.ID]*pubsub.PeerScoreSnapshot) } type peerScoreTracker struct { sk *dtypes.ScoreKeeper lt tracer.LotusTracer } func newPeerScoreTracker(lt tracer.LotusTracer, sk *dtypes.ScoreKeeper) PeerScoreTracker { return &peerScoreTracker{ sk: sk, lt: lt, } } func (pst *peerScoreTracker) UpdatePeerScore(scores map[peer.ID]*pubsub.PeerScoreSnapshot) { if pst.lt != nil { pst.lt.PeerScores(scores) } pst.sk.Update(scores) } type GossipIn struct { fx.In Mctx helpers.MetricsCtx Lc fx.Lifecycle Host host.Host Nn dtypes.NetworkName Bp dtypes.BootstrapPeers Db dtypes.DrandBootstrap Cfg *config.Pubsub Sk *dtypes.ScoreKeeper Dr dtypes.DrandSchedule } func getDrandTopic(chainInfoJSON string) (string, error) { var drandInfo = struct { Hash string `json:"hash"` }{} err := json.Unmarshal([]byte(chainInfoJSON), &drandInfo) if err != nil { return "", xerrors.Errorf("could not unmarshal drand chain info: %w", err) } return "/drand/pubsub/v0.0.0/" + drandInfo.Hash, nil } func GossipSub(in GossipIn) (service *pubsub.PubSub, err error) { bootstrappers := make(map[peer.ID]struct{}) for _, pi := range in.Bp { bootstrappers[pi.ID] = struct{}{} } drandBootstrappers := make(map[peer.ID]struct{}) for _, pi := range in.Db { drandBootstrappers[pi.ID] = struct{}{} } isBootstrapNode := in.Cfg.Bootstrapper drandTopicParams := &pubsub.TopicScoreParams{ // expected 2 beaconsn/min TopicWeight: 0.5, // 5x block topic; max cap is 62.5 // 1 tick per second, maxes at 1 after 1 hour TimeInMeshWeight: 0.00027, // ~1/3600 TimeInMeshQuantum: time.Second, TimeInMeshCap: 1, // deliveries decay after 1 hour, cap at 25 beacons FirstMessageDeliveriesWeight: 5, // max value is 125 FirstMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Hour), FirstMessageDeliveriesCap: 25, // the maximum expected in an hour is ~26, including the decay // Mesh Delivery Failure is currently turned off for beacons // This is on purpose as // - the traffic is very low for meaningful distribution of incoming edges. // - the reaction time needs to be very slow -- in the order of 10 min at least // so we might as well let opportunistic grafting repair the mesh on its own // pace. // - the network is too small, so large asymmetries can be expected between mesh // edges. // We should revisit this once the network grows. // invalid messages decay after 1 hour InvalidMessageDeliveriesWeight: -1000, InvalidMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Hour), } topicParams := map[string]*pubsub.TopicScoreParams{ build.BlocksTopic(in.Nn): { // expected 10 blocks/min TopicWeight: 0.1, // max cap is 50, max mesh penalty is -10, single invalid message is -100 // 1 tick per second, maxes at 1 after 1 hour TimeInMeshWeight: 0.00027, // ~1/3600 TimeInMeshQuantum: time.Second, TimeInMeshCap: 1, // deliveries decay after 1 hour, cap at 100 blocks FirstMessageDeliveriesWeight: 5, // max value is 500 FirstMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Hour), FirstMessageDeliveriesCap: 100, // 100 blocks in an hour // Mesh Delivery Failure is currently turned off for blocks // This is on purpose as // - the traffic is very low for meaningful distribution of incoming edges. // - the reaction time needs to be very slow -- in the order of 10 min at least // so we might as well let opportunistic grafting repair the mesh on its own // pace. // - the network is too small, so large asymmetries can be expected between mesh // edges. // We should revisit this once the network grows. // // // tracks deliveries in the last minute // // penalty activates at 1 minute and expects ~0.4 blocks // MeshMessageDeliveriesWeight: -576, // max penalty is -100 // MeshMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Minute), // MeshMessageDeliveriesCap: 10, // 10 blocks in a minute // MeshMessageDeliveriesThreshold: 0.41666, // 10/12/2 blocks/min // MeshMessageDeliveriesWindow: 10 * time.Millisecond, // MeshMessageDeliveriesActivation: time.Minute, // // // decays after 15 min // MeshFailurePenaltyWeight: -576, // MeshFailurePenaltyDecay: pubsub.ScoreParameterDecay(15 * time.Minute), // invalid messages decay after 1 hour InvalidMessageDeliveriesWeight: -1000, InvalidMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Hour), }, build.MessagesTopic(in.Nn): { // expected > 1 tx/second TopicWeight: 0.1, // max cap is 5, single invalid message is -100 // 1 tick per second, maxes at 1 hour TimeInMeshWeight: 0.0002778, // ~1/3600 TimeInMeshQuantum: time.Second, TimeInMeshCap: 1, // deliveries decay after 10min, cap at 100 tx FirstMessageDeliveriesWeight: 0.5, // max value is 50 FirstMessageDeliveriesDecay: pubsub.ScoreParameterDecay(10 * time.Minute), FirstMessageDeliveriesCap: 100, // 100 messages in 10 minutes // Mesh Delivery Failure is currently turned off for messages // This is on purpose as the network is still too small, which results in // asymmetries and potential unmeshing from negative scores. // // tracks deliveries in the last minute // // penalty activates at 1 min and expects 2.5 txs // MeshMessageDeliveriesWeight: -16, // max penalty is -100 // MeshMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Minute), // MeshMessageDeliveriesCap: 100, // 100 txs in a minute // MeshMessageDeliveriesThreshold: 2.5, // 60/12/2 txs/minute // MeshMessageDeliveriesWindow: 10 * time.Millisecond, // MeshMessageDeliveriesActivation: time.Minute, // // decays after 5min // MeshFailurePenaltyWeight: -16, // MeshFailurePenaltyDecay: pubsub.ScoreParameterDecay(5 * time.Minute), // invalid messages decay after 1 hour InvalidMessageDeliveriesWeight: -1000, InvalidMessageDeliveriesDecay: pubsub.ScoreParameterDecay(time.Hour), }, } pgTopicWeights := map[string]float64{ build.BlocksTopic(in.Nn): 10, build.MessagesTopic(in.Nn): 1, } var drandTopics []string for _, d := range in.Dr { topic, err := getDrandTopic(d.Config.ChainInfoJSON) if err != nil { return nil, err } topicParams[topic] = drandTopicParams pgTopicWeights[topic] = 5 drandTopics = append(drandTopics, topic) } // IP colocation whitelist var ipcoloWhitelist []*net.IPNet for _, cidr := range in.Cfg.IPColocationWhitelist { _, ipnet, err := net.ParseCIDR(cidr) if err != nil { return nil, xerrors.Errorf("error parsing IPColocation subnet %s: %w", cidr, err) } ipcoloWhitelist = append(ipcoloWhitelist, ipnet) } options := []pubsub.Option{ // Gossipsubv1.1 configuration pubsub.WithFloodPublish(true), pubsub.WithMessageIdFn(HashMsgId), pubsub.WithPeerScore( &pubsub.PeerScoreParams{ AppSpecificScore: func(p peer.ID) float64 { // return a heavy positive score for bootstrappers so that we don't unilaterally prune // them and accept PX from them. // we don't do that in the bootstrappers themselves to avoid creating a closed mesh // between them (however we might want to consider doing just that) _, ok := bootstrappers[p] if ok && !isBootstrapNode { return 2500 } _, ok = drandBootstrappers[p] if ok && !isBootstrapNode { return 1500 } // TODO: we want to plug the application specific score to the node itself in order // to provide feedback to the pubsub system based on observed behaviour return 0 }, AppSpecificWeight: 1, // This sets the IP colocation threshold to 5 peers before we apply penalties IPColocationFactorThreshold: 5, IPColocationFactorWeight: -100, IPColocationFactorWhitelist: ipcoloWhitelist, // P7: behavioural penalties, decay after 1hr BehaviourPenaltyThreshold: 6, BehaviourPenaltyWeight: -10, BehaviourPenaltyDecay: pubsub.ScoreParameterDecay(time.Hour), DecayInterval: pubsub.DefaultDecayInterval, DecayToZero: pubsub.DefaultDecayToZero, // this retains non-positive scores for 6 hours RetainScore: 6 * time.Hour, // topic parameters Topics: topicParams, }, &pubsub.PeerScoreThresholds{ GossipThreshold: GossipScoreThreshold, PublishThreshold: PublishScoreThreshold, GraylistThreshold: GraylistScoreThreshold, AcceptPXThreshold: AcceptPXScoreThreshold, OpportunisticGraftThreshold: OpportunisticGraftScoreThreshold, }, ), } // enable Peer eXchange on bootstrappers if isBootstrapNode { // turn off the mesh in bootstrappers -- only do gossip and PX pubsub.GossipSubD = 0 pubsub.GossipSubDscore = 0 pubsub.GossipSubDlo = 0 pubsub.GossipSubDhi = 0 pubsub.GossipSubDout = 0 pubsub.GossipSubDlazy = 64 pubsub.GossipSubGossipFactor = 0.25 pubsub.GossipSubPruneBackoff = 5 * time.Minute // turn on PX options = append(options, pubsub.WithPeerExchange(true)) } // direct peers if in.Cfg.DirectPeers != nil { var directPeerInfo []peer.AddrInfo for _, addr := range in.Cfg.DirectPeers { a, err := ma.NewMultiaddr(addr) if err != nil { return nil, err } pi, err := peer.AddrInfoFromP2pAddr(a) if err != nil { return nil, err } directPeerInfo = append(directPeerInfo, *pi) } options = append(options, pubsub.WithDirectPeers(directPeerInfo)) } // validation queue RED var pgParams *pubsub.PeerGaterParams if isBootstrapNode { pgParams = pubsub.NewPeerGaterParams( 0.33, pubsub.ScoreParameterDecay(2*time.Minute), pubsub.ScoreParameterDecay(10*time.Minute), ).WithTopicDeliveryWeights(pgTopicWeights) } else { pgParams = pubsub.NewPeerGaterParams( 0.33, pubsub.ScoreParameterDecay(2*time.Minute), pubsub.ScoreParameterDecay(time.Hour), ).WithTopicDeliveryWeights(pgTopicWeights) } options = append(options, pubsub.WithPeerGater(pgParams)) allowTopics := []string{ build.BlocksTopic(in.Nn), build.MessagesTopic(in.Nn), } allowTopics = append(allowTopics, drandTopics...) options = append(options, pubsub.WithSubscriptionFilter( pubsub.WrapLimitSubscriptionFilter( pubsub.NewAllowlistSubscriptionFilter(allowTopics...), 100))) var lt tracer.LotusTracer if in.Cfg.JsonTracerFile != "" { out, err := os.OpenFile(in.Cfg.JsonTracerFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0660) if err != nil { return nil, err } jsonTransport := tracer.NewJsonTracerTransport(out) lt = tracer.NewLotusTracer(jsonTransport, in.Host.ID()) } // tracer if in.Cfg.RemoteTracer != "" { a, err := ma.NewMultiaddr(in.Cfg.RemoteTracer) if err != nil { return nil, err } pi, err := peer.AddrInfoFromP2pAddr(a) if err != nil { return nil, err } tr, err := pubsub.NewRemoteTracer(context.TODO(), in.Host, *pi) if err != nil { return nil, err } pst := newPeerScoreTracker(lt, in.Sk) trw := newTracerWrapper(tr, lt, build.BlocksTopic(in.Nn)) options = append(options, pubsub.WithEventTracer(trw)) options = append(options, pubsub.WithPeerScoreInspect(pst.UpdatePeerScore, 10*time.Second)) } else { // still instantiate a tracer for collecting metrics trw := newTracerWrapper(nil, nil) options = append(options, pubsub.WithEventTracer(trw)) pst := newPeerScoreTracker(nil, in.Sk) options = append(options, pubsub.WithPeerScoreInspect(pst.UpdatePeerScore, 10*time.Second)) } return pubsub.NewGossipSub(helpers.LifecycleCtx(in.Mctx, in.Lc), in.Host, options...) } func HashMsgId(m *pubsub_pb.Message) string { hash := blake2b.Sum256(m.Data) return string(hash[:]) } func newTracerWrapper( lp2pTracer pubsub.EventTracer, lotusTracer pubsub.EventTracer, topics ...string, ) pubsub.EventTracer { var topicsMap map[string]struct{} if len(topics) > 0 { topicsMap = make(map[string]struct{}) for _, topic := range topics { topicsMap[topic] = struct{}{} } } return &tracerWrapper{lp2pTracer: lp2pTracer, lotusTracer: lotusTracer, topics: topicsMap} } type tracerWrapper struct { lp2pTracer pubsub.EventTracer lotusTracer pubsub.EventTracer topics map[string]struct{} } func (trw *tracerWrapper) traceMessage(topic string) bool { _, ok := trw.topics[topic] return ok } func (trw *tracerWrapper) Trace(evt *pubsub_pb.TraceEvent) { // this filters the trace events reported to the remote tracer to include only // JOIN/LEAVE/GRAFT/PRUNE/PUBLISH/DELIVER. This significantly reduces bandwidth usage and still // collects enough data to recover the state of the mesh and compute message delivery latency // distributions. // Furthermore, we only trace message publication and deliveries for specified topics // (here just the blocks topic). switch evt.GetType() { case pubsub_pb.TraceEvent_PUBLISH_MESSAGE: stats.Record(context.TODO(), metrics.PubsubPublishMessage.M(1)) if trw.traceMessage(evt.GetPublishMessage().GetTopic()) { if trw.lp2pTracer != nil { trw.lp2pTracer.Trace(evt) } if trw.lotusTracer != nil { trw.lotusTracer.Trace(evt) } } case pubsub_pb.TraceEvent_DELIVER_MESSAGE: stats.Record(context.TODO(), metrics.PubsubDeliverMessage.M(1)) if trw.traceMessage(evt.GetDeliverMessage().GetTopic()) { if trw.lp2pTracer != nil { trw.lp2pTracer.Trace(evt) } if trw.lotusTracer != nil { trw.lotusTracer.Trace(evt) } } case pubsub_pb.TraceEvent_REJECT_MESSAGE: stats.Record(context.TODO(), metrics.PubsubRejectMessage.M(1)) case pubsub_pb.TraceEvent_DUPLICATE_MESSAGE: stats.Record(context.TODO(), metrics.PubsubDuplicateMessage.M(1)) case pubsub_pb.TraceEvent_JOIN: if trw.lp2pTracer != nil { trw.lp2pTracer.Trace(evt) } if trw.lotusTracer != nil { trw.lotusTracer.Trace(evt) } case pubsub_pb.TraceEvent_LEAVE: if trw.lp2pTracer != nil { trw.lp2pTracer.Trace(evt) } if trw.lotusTracer != nil { trw.lotusTracer.Trace(evt) } case pubsub_pb.TraceEvent_GRAFT: if trw.lp2pTracer != nil { trw.lp2pTracer.Trace(evt) } if trw.lotusTracer != nil { trw.lotusTracer.Trace(evt) } case pubsub_pb.TraceEvent_PRUNE: if trw.lp2pTracer != nil { trw.lp2pTracer.Trace(evt) } if trw.lotusTracer != nil { trw.lotusTracer.Trace(evt) } case pubsub_pb.TraceEvent_RECV_RPC: stats.Record(context.TODO(), metrics.PubsubRecvRPC.M(1)) case pubsub_pb.TraceEvent_SEND_RPC: stats.Record(context.TODO(), metrics.PubsubSendRPC.M(1)) case pubsub_pb.TraceEvent_DROP_RPC: stats.Record(context.TODO(), metrics.PubsubDropRPC.M(1)) } }