diff --git a/src/net_processing.h b/src/net_processing.h --- a/src/net_processing.h +++ b/src/net_processing.h @@ -33,6 +33,17 @@ static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_BASE = 15 * 60 * 1000000; // 1ms/header static constexpr int64_t HEADERS_DOWNLOAD_TIMEOUT_PER_HEADER = 1000; +/** + * Protect at least this many outbound peers from disconnection due to + * slow/behind headers chain. + */ +static constexpr int32_t MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT = 4; +/** + * Timeout for (unprotected) outbound peers to sync to our chainwork, in + * seconds. + */ +// 20 minutes +static constexpr int64_t CHAIN_SYNC_TIMEOUT = 20 * 60; class PeerLogicValidation final : public CValidationInterface, public NetEventsInterface { @@ -71,6 +82,8 @@ */ bool SendMessages(const Config &config, CNode *pto, std::atomic &interrupt) override; + + void ConsiderEviction(CNode *pto, int64_t time_in_seconds); }; struct CNodeStateStats { diff --git a/src/net_processing.cpp b/src/net_processing.cpp --- a/src/net_processing.cpp +++ b/src/net_processing.cpp @@ -126,11 +126,16 @@ /** Number of peers from which we're downloading blocks. */ int nPeersWithValidatedDownloads = 0; +/** Number of outbound peers with m_chain_sync.m_protect. */ +int g_outbound_peers_with_protect_from_disconnect = 0; + /** Relay map, protected by cs_main. */ typedef std::map MapRelay; MapRelay mapRelay; -/** Expiration-time ordered list of (expire time, relay map entry) pairs, - * protected by cs_main). */ +/** + * Expiration-time ordered list of (expire time, relay map entry) pairs, + * protected by cs_main). + */ std::deque> vRelayExpiration; } // namespace @@ -208,6 +213,35 @@ */ bool fSupportsDesiredCmpctVersion; + /** + * State used to enforce CHAIN_SYNC_TIMEOUT + * Only in effect for outbound, non-manual connections, + * with m_protect == false + * Algorithm: if a peer's best known block has less work than our tip, set a + * timeout CHAIN_SYNC_TIMEOUT seconds in the future: + * - If at timeout their best known block now has more work than our tip + * when the timeout was set, then either reset the timeout or clear it + * (after comparing against our current tip's work) + * - If at timeout their best known block still has less work than our tip + * did when the timeout was set, then send a getheaders message, and set a + * shorter timeout, HEADERS_RESPONSE_TIME seconds in future. If their best + * known block is still behind when that new timeout is reached, disconnect. + */ + struct ChainSyncTimeoutState { + //! A timeout used for checking whether our peer has sufficiently + //! synced. + int64_t m_timeout; + //! A header with the work we require on our peer's chain. + const CBlockIndex *m_work_header; + //! After timeout is reached, set to true after sending getheaders. + bool m_sent_getheaders; + //! Whether this peer is protected from disconnection due to a bad/slow + //! chain. + bool m_protect; + }; + + ChainSyncTimeoutState m_chain_sync; + CNodeState(CAddress addrIn, std::string addrNameIn) : address(addrIn), name(addrNameIn) { fCurrentlyConnected = false; @@ -229,6 +263,7 @@ fPreferHeaderAndIDs = false; fProvidesHeaderAndIDs = false; fSupportsDesiredCmpctVersion = false; + m_chain_sync = {0, nullptr, false, false}; } }; @@ -587,6 +622,13 @@ } // namespace +// Returns true for outbound peers, excluding manual connections, feelers, and +// one-shots. +bool IsOutboundDisconnectionCandidate(const CNode *node) { + return !(node->fInbound || node->m_manual_connection || node->fFeeler || + node->fOneShot); +} + void PeerLogicValidation::InitializeNode(const Config &config, CNode *pnode) { CAddress addr = pnode->addr; std::string addrName = pnode->GetAddrName(); @@ -625,6 +667,9 @@ nPreferredDownload -= state->fPreferredDownload; nPeersWithValidatedDownloads -= (state->nBlocksInFlightValidHeaders != 0); assert(nPeersWithValidatedDownloads >= 0); + g_outbound_peers_with_protect_from_disconnect -= + state->m_chain_sync.m_protect; + assert(g_outbound_peers_with_protect_from_disconnect >= 0); mapNodeState.erase(nodeid); @@ -633,6 +678,7 @@ assert(mapBlocksInFlight.empty()); assert(nPreferredDownload == 0); assert(nPeersWithValidatedDownloads == 0); + assert(g_outbound_peers_with_protect_from_disconnect == 0); } LogPrint(BCLog::NET, "Cleared nodestate for peer=%d\n", nodeid); } @@ -2650,6 +2696,11 @@ assert(pindexLast); UpdateBlockAvailability(pfrom->GetId(), pindexLast->GetBlockHash()); + // From here, pindexBestKnownBlock should be guaranteed to be + // non-null, because it is set in UpdateBlockAvailability. Some + // nullptr checks are still present, however, as + // belt-and-suspenders. + if (nCount == MAX_HEADERS_RESULTS) { // Headers message had its maximum size; the peer may have more // headers. @@ -2737,12 +2788,50 @@ } } } + // If we're in IBD, we want outbound peers that will serve us a + // useful chain. Disconnect peers that are on chains with + // insufficient work. + if (IsInitialBlockDownload() && nCount != MAX_HEADERS_RESULTS) { + // When nCount < MAX_HEADERS_RESULTS, we know we have no more + // headers to fetch from this peer. + if (nodestate->pindexBestKnownBlock && + nodestate->pindexBestKnownBlock->nChainWork < + nMinimumChainWork) { + // This peer has too little work on their headers chain to + // help us sync -- disconnect if using an outbound slot + // (unless whitelisted or addnode). + // Note: We compare their tip to nMinimumChainWork (rather + // than chainActive.Tip()) because we won't start block + // download until we have a headers chain that has at least + // nMinimumChainWork, even if a peer has a chain past our + // tip, as an anti-DoS measure. + if (IsOutboundDisconnectionCandidate(pfrom)) { + LogPrintf("Disconnecting outbound peer %d -- headers " + "chain has insufficient work\n", + pfrom->GetId()); + pfrom->fDisconnect = true; + } + } + } + if (!pfrom->fDisconnect && + IsOutboundDisconnectionCandidate(pfrom) && + nodestate->pindexBestKnownBlock != nullptr) { + // If this is an outbound peer, check to see if we should + // protect it from the bad/lagging chain logic. + if (g_outbound_peers_with_protect_from_disconnect < + MAX_OUTBOUND_PEERS_TO_PROTECT_FROM_DISCONNECT && + nodestate->pindexBestKnownBlock->nChainWork >= + chainActive.Tip()->nChainWork && + !nodestate->m_chain_sync.m_protect) { + nodestate->m_chain_sync.m_protect = true; + ++g_outbound_peers_with_protect_from_disconnect; + } + } } } - else if (strCommand == NetMsgType::BLOCK && !fImporting && - !fReindex) // Ignore blocks received while importing - { + else if (strCommand == NetMsgType::BLOCK && !fImporting && !fReindex) { + // Ignore blocks received while importing. std::shared_ptr pblock = std::make_shared(); vRecv >> *pblock; @@ -3158,6 +3247,90 @@ return fMoreWork; } +void PeerLogicValidation::ConsiderEviction(CNode *pto, + int64_t time_in_seconds) { + AssertLockHeld(cs_main); + + CNodeState &state = *State(pto->GetId()); + const CNetMsgMaker msgMaker(pto->GetSendVersion()); + + if (!state.m_chain_sync.m_protect && + IsOutboundDisconnectionCandidate(pto) && state.fSyncStarted) { + // This is an outbound peer subject to disconnection if they don't + // announce a block with as much work as the current tip within + // CHAIN_SYNC_TIMEOUT + HEADERS_RESPONSE_TIME seconds (note: if their + // chain has more work than ours, we should sync to it, unless it's + // invalid, in which case we should find that out and disconnect from + // them elsewhere). + if (state.pindexBestKnownBlock != nullptr && + state.pindexBestKnownBlock->nChainWork >= + chainActive.Tip()->nChainWork) { + if (state.m_chain_sync.m_timeout != 0) { + state.m_chain_sync.m_timeout = 0; + state.m_chain_sync.m_work_header = nullptr; + state.m_chain_sync.m_sent_getheaders = false; + } + } else if (state.m_chain_sync.m_timeout == 0 || + (state.m_chain_sync.m_work_header != nullptr && + state.pindexBestKnownBlock != nullptr && + state.pindexBestKnownBlock->nChainWork >= + state.m_chain_sync.m_work_header->nChainWork)) { + // Our best block known by this peer is behind our tip, and we're + // either noticing that for the first time, OR this peer was able to + // catch up to some earlier point where we checked against our tip. + // Either way, set a new timeout based on current tip. + state.m_chain_sync.m_timeout = time_in_seconds + CHAIN_SYNC_TIMEOUT; + state.m_chain_sync.m_work_header = chainActive.Tip(); + state.m_chain_sync.m_sent_getheaders = false; + } else if (state.m_chain_sync.m_timeout > 0 && + time_in_seconds > state.m_chain_sync.m_timeout) { + // No evidence yet that our peer has synced to a chain with work + // equal to that of our tip, when we first detected it was behind. + // Send a single getheaders message to give the peer a chance to + // update us. + if (state.m_chain_sync.m_sent_getheaders) { + // They've run out of time to catch up! + LogPrintf( + "Disconnecting outbound peer %d for old chain, best known " + "block = %s\n", + pto->GetId(), + state.pindexBestKnownBlock != nullptr + ? state.pindexBestKnownBlock->GetBlockHash().ToString() + : ""); + pto->fDisconnect = true; + } else { + LogPrint( + BCLog::NET, "sending getheaders to outbound peer=%d to " + "verify chain work (current best known " + "block:%s, benchmark blockhash: %s)\n", + pto->GetId(), + state.pindexBestKnownBlock != nullptr + ? state.pindexBestKnownBlock->GetBlockHash().ToString() + : "", + state.m_chain_sync.m_work_header->GetBlockHash() + .ToString()); + connman->PushMessage( + pto, + msgMaker.Make(NetMsgType::GETHEADERS, + chainActive.GetLocator( + state.m_chain_sync.m_work_header->pprev), + uint256())); + state.m_chain_sync.m_sent_getheaders = true; + // 2 minutes + constexpr int64_t HEADERS_RESPONSE_TIME = 120; + // Bump the timeout to allow a response, which could clear the + // timeout (if the response shows the peer has synced), reset + // the timeout (if the peer syncs to the required work but not + // to our tip), or result in disconnect (if we advance to the + // timeout and pindexBestKnownBlock has not sufficiently + // progressed) + state.m_chain_sync.m_timeout = + time_in_seconds + HEADERS_RESPONSE_TIME; + } + } + } +} + class CompareInvMempoolOrder { CTxMemPool *mp; @@ -3710,6 +3883,10 @@ } } + // Check that outbound peers have reasonable chains GetTime() is used by + // this anti-DoS logic so we can test this using mocktime. + ConsiderEviction(pto, GetTime()); + // // Message: getdata (blocks) // diff --git a/src/test/DoS_tests.cpp b/src/test/DoS_tests.cpp --- a/src/test/DoS_tests.cpp +++ b/src/test/DoS_tests.cpp @@ -43,6 +43,55 @@ BOOST_FIXTURE_TEST_SUITE(DoS_tests, TestingSetup) +// Test eviction of an outbound peer whose chain never advances +// Mock a node connection, and use mocktime to simulate a peer which never sends +// any headers messages. PeerLogic should decide to evict that outbound peer, +// after the appropriate timeouts. +// Note that we protect 4 outbound nodes from being subject to this logic; this +// test takes advantage of that protection only being applied to nodes which +// send headers with sufficient work. +BOOST_AUTO_TEST_CASE(outbound_slow_chain_eviction) { + const Config &config = GetConfig(); + std::atomic interruptDummy(false); + + // Mock an outbound peer + CAddress addr1(ip(0xa0b0c001), NODE_NONE); + CNode dummyNode1(id++, ServiceFlags(NODE_NETWORK), 0, INVALID_SOCKET, addr1, + 0, 0, CAddress(), "", + /*fInboundIn=*/false); + dummyNode1.SetSendVersion(PROTOCOL_VERSION); + + peerLogic->InitializeNode(config, &dummyNode1); + dummyNode1.nVersion = 1; + dummyNode1.fSuccessfullyConnected = true; + + // This test requires that we have a chain with non-zero work. + BOOST_CHECK(chainActive.Tip() != nullptr); + BOOST_CHECK(chainActive.Tip()->nChainWork > 0); + + // Test starts here + // should result in getheaders + peerLogic->SendMessages(config, &dummyNode1, interruptDummy); + BOOST_CHECK(dummyNode1.vSendMsg.size() > 0); + dummyNode1.vSendMsg.clear(); + + int64_t nStartTime = GetTime(); + // Wait 21 minutes + SetMockTime(nStartTime + 21 * 60); + // should result in getheaders + peerLogic->SendMessages(config, &dummyNode1, interruptDummy); + BOOST_CHECK(dummyNode1.vSendMsg.size() > 0); + // Wait 3 more minutes + SetMockTime(nStartTime + 24 * 60); + // should result in disconnect + peerLogic->SendMessages(config, &dummyNode1, interruptDummy); + BOOST_CHECK(dummyNode1.fDisconnect == true); + SetMockTime(0); + + bool dummy; + peerLogic->FinalizeNode(config, dummyNode1.GetId(), dummy); +} + BOOST_AUTO_TEST_CASE(DoS_banning) { const Config &config = GetConfig(); std::atomic interruptDummy(false); @@ -78,6 +127,10 @@ Misbehaving(dummyNode2.GetId(), 50, ""); peerLogic->SendMessages(config, &dummyNode2, interruptDummy); BOOST_CHECK(connman->IsBanned(addr2)); + + bool dummy; + peerLogic->FinalizeNode(config, dummyNode1.GetId(), dummy); + peerLogic->FinalizeNode(config, dummyNode2.GetId(), dummy); } BOOST_AUTO_TEST_CASE(DoS_banscore) { @@ -104,6 +157,9 @@ peerLogic->SendMessages(config, &dummyNode1, interruptDummy); BOOST_CHECK(connman->IsBanned(addr1)); gArgs.ForceSetArg("-banscore", std::to_string(DEFAULT_BANSCORE_THRESHOLD)); + + bool dummy; + peerLogic->FinalizeNode(config, dummyNode1.GetId(), dummy); } BOOST_AUTO_TEST_CASE(DoS_bantime) { @@ -132,6 +188,9 @@ SetMockTime(nStartTime + 60 * 60 * 24 + 1); BOOST_CHECK(!connman->IsBanned(addr)); + + bool dummy; + peerLogic->FinalizeNode(config, dummyNode.GetId(), dummy); } CTransactionRef RandomOrphan() { diff --git a/test/functional/minchainwork.py b/test/functional/minchainwork.py --- a/test/functional/minchainwork.py +++ b/test/functional/minchainwork.py @@ -81,6 +81,13 @@ self.nodes[0].generate(1) self.log.info("Verifying nodes are all synced") + + # Because nodes in regtest are all manual connections (eg using + # addnode), node1 should not have disconnected node0. If not for that, + # we'd expect node1 to have disconnected node0 for serving an + # insufficient work chain, in which case we'd need to reconnect them to + # continue the test. + self.sync_all() self.log.info("Blockcounts: %s", [ n.getblockcount() for n in self.nodes])