From d36276d85e39f7a0071d3f5d948785e008ca1519 Mon Sep 17 00:00:00 2001 From: Felix Lange Date: Fri, 19 Feb 2021 09:54:46 +0100 Subject: [PATCH] p2p/dnsdisc: fix hot-spin when all trees are empty (#22313) In the random sync algorithm used by the DNS node iterator, we first pick a random tree and then perform one sync action on that tree. This happens in a loop until any node is found. If no trees contain any nodes, the iterator will enter a hot loop spinning at 100% CPU. The fix is complicated. The iterator now checks if a meaningful sync action can be performed on any tree. If there is nothing to do, it waits for the next root record recheck time to arrive and then tries again. Fixes #22306 --- p2p/dnsdisc/client.go | 86 +++++++++++++++++++++++++++++++------- p2p/dnsdisc/client_test.go | 47 +++++++++++++++++++++ p2p/dnsdisc/sync.go | 28 ++++++++++--- 3 files changed, 140 insertions(+), 21 deletions(-) diff --git a/p2p/dnsdisc/client.go b/p2p/dnsdisc/client.go index b87278482..d3e8111ab 100644 --- a/p2p/dnsdisc/client.go +++ b/p2p/dnsdisc/client.go @@ -217,8 +217,11 @@ type randomIterator struct { c *Client mu sync.Mutex - trees map[string]*clientTree // all trees lc linkCache // tracks tree dependencies + trees map[string]*clientTree // all trees + // buffers for syncableTrees + syncableList []*clientTree + disabledList []*clientTree } func (c *Client) newRandomIterator() *randomIterator { @@ -238,10 +241,10 @@ func (it *randomIterator) Node() *enode.Node { // Close closes the iterator. func (it *randomIterator) Close() { + it.cancelFn() + it.mu.Lock() defer it.mu.Unlock() - - it.cancelFn() it.trees = nil } @@ -264,7 +267,7 @@ func (it *randomIterator) addTree(url string) error { // nextNode syncs random tree entries until it finds a node. func (it *randomIterator) nextNode() *enode.Node { for { - ct := it.nextTree() + ct := it.pickTree() if ct == nil { return nil } @@ -282,26 +285,79 @@ func (it *randomIterator) nextNode() *enode.Node { } } -// nextTree returns a random tree. -func (it *randomIterator) nextTree() *clientTree { +// pickTree returns a random tree to sync from. +func (it *randomIterator) pickTree() *clientTree { it.mu.Lock() defer it.mu.Unlock() + // Rebuild the trees map if any links have changed. if it.lc.changed { it.rebuildTrees() it.lc.changed = false } - if len(it.trees) == 0 { - return nil - } - limit := rand.Intn(len(it.trees)) - for _, ct := range it.trees { - if limit == 0 { - return ct + + for { + canSync, trees := it.syncableTrees() + switch { + case canSync: + // Pick a random tree. + return trees[rand.Intn(len(trees))] + case len(trees) > 0: + // No sync action can be performed on any tree right now. The only meaningful + // thing to do is waiting for any root record to get updated. + if !it.waitForRootUpdates(trees) { + // Iterator was closed while waiting. + return nil + } + default: + // There are no trees left, the iterator was closed. + return nil } - limit-- } - return nil +} + +// syncableTrees finds trees on which any meaningful sync action can be performed. +func (it *randomIterator) syncableTrees() (canSync bool, trees []*clientTree) { + // Resize tree lists. + it.syncableList = it.syncableList[:0] + it.disabledList = it.disabledList[:0] + + // Partition them into the two lists. + for _, ct := range it.trees { + if ct.canSyncRandom() { + it.syncableList = append(it.syncableList, ct) + } else { + it.disabledList = append(it.disabledList, ct) + } + } + if len(it.syncableList) > 0 { + return true, it.syncableList + } + return false, it.disabledList +} + +// waitForRootUpdates waits for the closest scheduled root check time on the given trees. +func (it *randomIterator) waitForRootUpdates(trees []*clientTree) bool { + var minTree *clientTree + var nextCheck mclock.AbsTime + for _, ct := range trees { + check := ct.nextScheduledRootCheck() + if minTree == nil || check < nextCheck { + minTree = ct + nextCheck = check + } + } + + sleep := nextCheck.Sub(it.c.clock.Now()) + it.c.cfg.Logger.Debug("DNS iterator waiting for root updates", "sleep", sleep, "tree", minTree.loc.domain) + timeout := it.c.clock.NewTimer(sleep) + defer timeout.Stop() + select { + case <-timeout.C(): + return true + case <-it.ctx.Done(): + return false // Iterator was closed. + } } // rebuildTrees rebuilds the 'trees' map. diff --git a/p2p/dnsdisc/client_test.go b/p2p/dnsdisc/client_test.go index 6a6705abf..741bee423 100644 --- a/p2p/dnsdisc/client_test.go +++ b/p2p/dnsdisc/client_test.go @@ -231,6 +231,53 @@ func TestIteratorRootRecheckOnFail(t *testing.T) { checkIterator(t, it, nodes) } +// This test checks that the iterator works correctly when the tree is initially empty. +func TestIteratorEmptyTree(t *testing.T) { + var ( + clock = new(mclock.Simulated) + nodes = testNodes(nodesSeed1, 1) + resolver = newMapResolver() + c = NewClient(Config{ + Resolver: resolver, + Logger: testlog.Logger(t, log.LvlTrace), + RecheckInterval: 20 * time.Minute, + RateLimit: 500, + }) + ) + c.clock = clock + tree1, url := makeTestTree("n", nil, nil) + tree2, _ := makeTestTree("n", nodes, nil) + resolver.add(tree1.ToTXT("n")) + + // Start the iterator. + node := make(chan *enode.Node) + it, err := c.NewIterator(url) + if err != nil { + t.Fatal(err) + } + go func() { + it.Next() + node <- it.Node() + }() + + // Wait for the client to get stuck in waitForRootUpdates. + clock.WaitForTimers(1) + + // Now update the root. + resolver.add(tree2.ToTXT("n")) + + // Wait for it to pick up the root change. + clock.Run(c.cfg.RecheckInterval) + select { + case n := <-node: + if n.ID() != nodes[0].ID() { + t.Fatalf("wrong node returned") + } + case <-time.After(5 * time.Second): + t.Fatal("it.Next() did not unblock within 5s of real time") + } +} + // updateSomeNodes applies ENR updates to some of the given nodes. func updateSomeNodes(keySeed int64, nodes []*enode.Node) { keys := testKeys(nodesSeed1, len(nodes)) diff --git a/p2p/dnsdisc/sync.go b/p2p/dnsdisc/sync.go index 36f02acba..073547c90 100644 --- a/p2p/dnsdisc/sync.go +++ b/p2p/dnsdisc/sync.go @@ -25,9 +25,9 @@ import ( "github.com/ethereum/go-ethereum/p2p/enode" ) -const ( - rootRecheckFailCount = 5 // update root if this many leaf requests fail -) +// This is the number of consecutive leaf requests that may fail before +// we consider re-resolving the tree root. +const rootRecheckFailCount = 5 // clientTree is a full tree being synced. type clientTree struct { @@ -89,13 +89,22 @@ func (ct *clientTree) syncRandom(ctx context.Context) (n *enode.Node, err error) ct.gcLinks() // Sync next random entry in ENR tree. Once every node has been visited, we simply - // start over. This is fine because entries are cached. + // start over. This is fine because entries are cached internally by the client LRU + // also by DNS resolvers. if ct.enrs.done() { ct.enrs = newSubtreeSync(ct.c, ct.loc, ct.root.eroot, false) } return ct.syncNextRandomENR(ctx) } +// canSyncRandom checks if any meaningful action can be performed by syncRandom. +func (ct *clientTree) canSyncRandom() bool { + // Note: the check for non-zero leaf count is very important here. + // If we're done syncing all nodes, and no leaves were found, the tree + // is empty and we can't use it for sync. + return ct.rootUpdateDue() || !ct.links.done() || !ct.enrs.done() || ct.enrs.leaves != 0 +} + // gcLinks removes outdated links from the global link cache. GC runs once // when the link sync finishes. func (ct *clientTree) gcLinks() { @@ -184,10 +193,14 @@ func (ct *clientTree) updateRoot(ctx context.Context) error { // rootUpdateDue returns true when a root update is needed. func (ct *clientTree) rootUpdateDue() bool { tooManyFailures := ct.leafFailCount > rootRecheckFailCount - scheduledCheck := ct.c.clock.Now().Sub(ct.lastRootCheck) > ct.c.cfg.RecheckInterval + scheduledCheck := ct.c.clock.Now() >= ct.nextScheduledRootCheck() return ct.root == nil || tooManyFailures || scheduledCheck } +func (ct *clientTree) nextScheduledRootCheck() mclock.AbsTime { + return ct.lastRootCheck.Add(ct.c.cfg.RecheckInterval) +} + // slowdownRootUpdate applies a delay to root resolution if is tried // too frequently. This avoids busy polling when the client is offline. // Returns true if the timeout passed, false if sync was canceled. @@ -218,10 +231,11 @@ type subtreeSync struct { root string missing []string // missing tree node hashes link bool // true if this sync is for the link tree + leaves int // counter of synced leaves } func newSubtreeSync(c *Client, loc *linkEntry, root string, link bool) *subtreeSync { - return &subtreeSync{c, loc, root, []string{root}, link} + return &subtreeSync{c, loc, root, []string{root}, link, 0} } func (ts *subtreeSync) done() bool { @@ -253,10 +267,12 @@ func (ts *subtreeSync) resolveNext(ctx context.Context, hash string) (entry, err if ts.link { return nil, errENRInLinkTree } + ts.leaves++ case *linkEntry: if !ts.link { return nil, errLinkInENRTree } + ts.leaves++ case *branchEntry: ts.missing = append(ts.missing, e.children...) }