diff --git a/test/functional/dbcrash.py b/test/functional/dbcrash.py index fb930dd7b..f2b23b4fe 100755 --- a/test/functional/dbcrash.py +++ b/test/functional/dbcrash.py @@ -1,305 +1,305 @@ #!/usr/bin/env python3 # Copyright (c) 2017 The Bitcoin Core developers # Distributed under the MIT software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. """Test recovery from a crash during chainstate writing. - 4 nodes * node0, node1, and node2 will have different dbcrash ratios, and different dbcache sizes * node3 will be a regular node, with no crashing. * The nodes will not connect to each other. - use default test framework starting chain. initialize starting_tip_height to tip height. - Main loop: * generate lots of transactions on node3, enough to fill up a block. * uniformly randomly pick a tip height from starting_tip_height to tip_height; with probability 1/(height_difference+4), invalidate this block. * mine enough blocks to overtake tip_height at start of loop. * for each node in [node0,node1,node2]: - for each mined block: * submit block to node * if node crashed on/after submitting: - restart until recovery succeeds - check that utxo matches node3 using gettxoutsetinfo""" import errno import http.client import random import sys import time from test_framework.mininode import * from test_framework.script import * from test_framework.test_framework import BitcoinTestFramework from test_framework.util import * from test_framework.blocktools import create_confirmed_utxos HTTP_DISCONNECT_ERRORS = [http.client.CannotSendRequest] try: HTTP_DISCONNECT_ERRORS.append(http.client.RemoteDisconnected) except AttributeError: pass class ChainstateWriteCrashTest(BitcoinTestFramework): def set_test_params(self): self.num_nodes = 4 self.setup_clean_chain = False # Set -maxmempool=0 to turn off mempool memory sharing with dbcache # Set -rpcservertimeout=900 to reduce socket disconnects in this # long-running test self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"] # Set different crash ratios and cache sizes. Note that not all of # -dbcache goes to pcoinsTip. self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args # Node3 is a normal node with default args, except will mine full blocks - self.node3_args = ["-blockmaxweight=4000000"] + self.node3_args = ["-blockmaxsize=32000000"] self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args] def setup_network(self): # Need a bit of extra time for the nodes to start up for this test self.add_nodes(self.num_nodes, extra_args=self.extra_args, timewait=90) self.start_nodes() # Leave them unconnected, we'll use submitblock directly in this test def restart_node(self, node_index, expected_tip): """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash. Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up after 60 seconds. Returns the utxo hash of the given node.""" time_start = time.time() while time.time() - time_start < 120: try: # Any of these RPC calls could throw due to node crash self.start_node(node_index) self.nodes[node_index].waitforblock(expected_tip) utxo_hash = self.nodes[node_index].gettxoutsetinfo()[ 'hash_serialized'] return utxo_hash except: # An exception here should mean the node is about to crash. # If bitcoind exits, then try again. wait_for_node_exit() # should raise an exception if bitcoind doesn't exit. self.wait_for_node_exit(node_index, timeout=10) self.crashed_on_restart += 1 time.sleep(1) # If we got here, bitcoind isn't coming back up on restart. Could be a # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio -- # perhaps we generated a test case that blew up our cache? # TODO: If this happens a lot, we should try to restart without -dbcrashratio # and make sure that recovery happens. raise AssertionError( "Unable to successfully restart node %d in allotted time", node_index) def submit_block_catch_error(self, node_index, block): """Try submitting a block to the given node. Catch any exceptions that indicate the node has crashed. Returns true if the block was submitted successfully; false otherwise.""" try: self.nodes[node_index].submitblock(block) return True except http.client.BadStatusLine as e: # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error. if sys.version_info[0] == 3 and sys.version_info[1] < 5 and e.line == "''": self.log.debug( "node %d submitblock raised exception: %s", node_index, e) return False else: raise except tuple(HTTP_DISCONNECT_ERRORS) as e: self.log.debug( "node %d submitblock raised exception: %s", node_index, e) return False except OSError as e: self.log.debug( "node %d submitblock raised OSError exception: errno=%s", node_index, e.errno) if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]: # The node has likely crashed return False else: # Unexpected exception, raise raise def sync_node3blocks(self, block_hashes): """Use submitblock to sync node3's chain with the other nodes If submitblock fails, restart the node and get the new utxo hash. If any nodes crash while updating, we'll compare utxo hashes to ensure recovery was successful.""" node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized'] # Retrieve all the blocks from node3 blocks = [] for block_hash in block_hashes: blocks.append( [block_hash, self.nodes[3].getblock(block_hash, False)]) # Deliver each block to each other node for i in range(3): nodei_utxo_hash = None self.log.debug("Syncing blocks to node %d", i) for (block_hash, block) in blocks: # Get the block from node3, and submit to node_i self.log.debug("submitting block %s", block_hash) if not self.submit_block_catch_error(i, block): # TODO: more carefully check that the crash is due to -dbcrashratio # (change the exit code perhaps, and check that here?) self.wait_for_node_exit(i, timeout=30) self.log.debug( "Restarting node %d after block hash %s", i, block_hash) nodei_utxo_hash = self.restart_node(i, block_hash) assert nodei_utxo_hash is not None self.restart_counts[i] += 1 else: # Clear it out after successful submitblock calls -- the cached # utxo hash will no longer be correct nodei_utxo_hash = None # Check that the utxo hash matches node3's utxo set # NOTE: we only check the utxo set if we had to restart the node # after the last block submitted: # - checking the utxo hash causes a cache flush, which we don't # want to do every time; so # - we only update the utxo cache after a node restart, since flushing # the cache is a no-op at that point if nodei_utxo_hash is not None: self.log.debug("Checking txoutsetinfo matches for node %d", i) assert_equal(nodei_utxo_hash, node3_utxo_hash) def verify_utxo_hash(self): """Verify that the utxo hash of each node matches node3. Restart any nodes that crash while querying.""" node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized'] self.log.info("Verifying utxo hash matches for all nodes") for i in range(3): try: nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()[ 'hash_serialized'] except OSError: # probably a crash on db flushing nodei_utxo_hash = self.restart_node( i, self.nodes[3].getbestblockhash()) assert_equal(nodei_utxo_hash, node3_utxo_hash) def generate_small_transactions(self, node, count, utxo_list): FEE = 1000 # TODO: replace this with node relay fee based calculation num_transactions = 0 random.shuffle(utxo_list) while len(utxo_list) >= 2 and num_transactions < count: tx = CTransaction() input_amount = 0 for i in range(2): utxo = utxo_list.pop() tx.vin.append( CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout']))) input_amount += int(utxo['amount'] * COIN) output_amount = (input_amount - FEE) // 3 if output_amount <= 0: # Sanity check -- if we chose inputs that are too small, skip continue for i in range(3): tx.vout.append( CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey']))) # Sign and send the transaction to get into the mempool tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex'] node.sendrawtransaction(tx_signed_hex) num_transactions += 1 def run_test(self): # Track test coverage statistics self.restart_counts = [0, 0, 0] # Track the restarts for nodes 0-2 self.crashed_on_restart = 0 # Track count of crashes during recovery # Start by creating a lot of utxos on node3 initial_height = self.nodes[3].getblockcount() utxo_list = create_confirmed_utxos(self.nodes[3], 5000) self.log.info("Prepped %d utxo entries", len(utxo_list)) # Sync these blocks with the other nodes block_hashes_to_sync = [] for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1): block_hashes_to_sync.append(self.nodes[3].getblockhash(height)) self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync)) # Syncing the blocks could cause nodes to crash, so the test begins here. self.sync_node3blocks(block_hashes_to_sync) starting_tip_height = self.nodes[3].getblockcount() # Main test loop: # each time through the loop, generate a bunch of transactions, # and then either mine a single new block on the tip, or some-sized reorg. for i in range(40): self.log.info( "Iteration %d, generating 2500 transactions %s", i, self.restart_counts) # Generate a bunch of small-ish transactions self.generate_small_transactions(self.nodes[3], 2500, utxo_list) # Pick a random block between current tip, and starting tip current_height = self.nodes[3].getblockcount() random_height = random.randint(starting_tip_height, current_height) self.log.debug("At height %d, considering height %d", current_height, random_height) if random_height > starting_tip_height: # Randomly reorg from this point with some probability (1/4 for # tip, 1/5 for tip-1, ...) if random.random() < 1.0 / (current_height + 4 - random_height): self.log.debug( "Invalidating block at height %d", random_height) self.nodes[3].invalidateblock( self.nodes[3].getblockhash(random_height)) # Now generate new blocks until we pass the old tip height self.log.debug("Mining longer tip") block_hashes = [] while current_height + 1 > self.nodes[3].getblockcount(): block_hashes.extend(self.nodes[3].generate( min(10, current_height + 1 - self.nodes[3].getblockcount()))) self.log.debug("Syncing %d new blocks...", len(block_hashes)) self.sync_node3blocks(block_hashes) utxo_list = self.nodes[3].listunspent() self.log.debug("Node3 utxo count: %d", len(utxo_list)) # Check that the utxo hashes agree with node3 # Useful side effect: each utxo cache gets flushed here, so that we # won't get crashes on shutdown at the end of the test. self.verify_utxo_hash() # Check the test coverage self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart) # If no nodes were restarted, we didn't test anything. assert self.restart_counts != [0, 0, 0] # Make sure we tested the case of crash-during-recovery. assert self.crashed_on_restart > 0 # Warn if any of the nodes escaped restart. for i in range(3): if self.restart_counts[i] == 0: self.log.warn("Node %d never crashed during utxo flush!", i) if __name__ == "__main__": ChainstateWriteCrashTest().main() diff --git a/test/functional/test_framework/test_node.py b/test/functional/test_framework/test_node.py index 16099af55..47b2017eb 100755 --- a/test/functional/test_framework/test_node.py +++ b/test/functional/test_framework/test_node.py @@ -1,219 +1,225 @@ #!/usr/bin/env python3 # Copyright (c) 2017 The Bitcoin Core developers # Distributed under the MIT software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. """Class for bitcoind node under test""" import decimal import errno import http.client import json import logging import os import subprocess import time from .mininode import COIN, ToHex, FromHex, CTransaction from .util import ( assert_equal, get_rpc_proxy, rpc_url, wait_until, ) from .authproxy import JSONRPCException BITCOIND_PROC_WAIT_TIMEOUT = 60 class TestNode(): """A class for representing a bitcoind node under test. This class contains: - state about the node (whether it's running, etc) - a Python subprocess.Popen object representing the running process - an RPC connection to the node To make things easier for the test writer, a bit of magic is happening under the covers. Any unrecognised messages will be dispatched to the RPC connection.""" def __init__(self, i, dirname, extra_args, rpchost, timewait, binary, stderr, mocktime, coverage_dir): self.index = i self.datadir = os.path.join(dirname, "node" + str(i)) self.rpchost = rpchost if timewait: self.rpc_timeout = timewait else: # Wait for up to 60 seconds for the RPC server to respond self.rpc_timeout = 60 if binary is None: self.binary = os.getenv("BITCOIND", "bitcoind") else: self.binary = binary self.stderr = stderr self.coverage_dir = coverage_dir # Most callers will just need to add extra args to the standard list below. For those callers that need more flexibity, they can just set the args property directly. self.extra_args = extra_args self.args = [self.binary, "-datadir=" + self.datadir, "-server", "-keypool=1", "-discover=0", "-rest", "-logtimemicros", "-debug", "-debugexclude=libevent", "-debugexclude=leveldb", "-mocktime=" + str(mocktime), "-uacomment=testnode%d" % i] self.cli = TestNodeCLI( os.getenv("BITCOINCLI", "bitcoin-cli"), self.datadir) self.running = False self.process = None self.rpc_connected = False self.rpc = None self.url = None self.relay_fee_cache = None self.log = logging.getLogger('TestFramework.node%d' % i) def __getattr__(self, *args, **kwargs): """Dispatches any unrecognised messages to the RPC connection.""" assert self.rpc is not None, "Error: RPC not initialized" assert self.rpc_connected, "Error: No RPC connection" return self.rpc.__getattr__(*args, **kwargs) def start(self, extra_args=None, stderr=None): """Start the node.""" if extra_args is None: extra_args = self.extra_args if stderr is None: stderr = self.stderr self.process = subprocess.Popen(self.args + extra_args, stderr=stderr) self.running = True self.log.debug("bitcoind started, waiting for RPC to come up") def wait_for_rpc_connection(self): """Sets up an RPC connection to the bitcoind process. Returns False if unable to connect.""" # Poll at a rate of four times per second poll_per_s = 4 for _ in range(poll_per_s * self.rpc_timeout): assert self.process.poll( ) is None, "bitcoind exited with status %i during initialization" % self.process.returncode try: self.rpc = get_rpc_proxy(rpc_url(self.datadir, self.index, self.rpchost), self.index, timeout=self.rpc_timeout, coveragedir=self.coverage_dir) self.rpc.getblockcount() # If the call to getblockcount() succeeds then the RPC connection is up self.rpc_connected = True self.url = self.rpc.url self.log.debug("RPC successfully started") return except IOError as e: if e.errno != errno.ECONNREFUSED: # Port not yet open? raise # unknown IO error except JSONRPCException as e: # Initialization phase if e.error['code'] != -28: # RPC in warmup? raise # unknown JSON RPC exception except ValueError as e: # cookie file not found and no rpcuser or rpcassword. bitcoind still starting if "No RPC credentials" not in str(e): raise time.sleep(1.0 / poll_per_s) raise AssertionError("Unable to connect to bitcoind") def get_wallet_rpc(self, wallet_name): assert self.rpc_connected assert self.rpc wallet_path = "wallet/%s" % wallet_name return self.rpc / wallet_path def stop_node(self): """Stop the node.""" if not self.running: return self.log.debug("Stopping node") try: self.stop() except http.client.CannotSendRequest: self.log.exception("Unable to stop node.") def is_node_stopped(self): """Checks whether the node has stopped. Returns True if the node has stopped. False otherwise. This method is responsible for freeing resources (self.process).""" if not self.running: return True return_code = self.process.poll() if return_code is None: return False # process has stopped. Assert that it didn't return an error code. assert_equal(return_code, 0) self.running = False self.process = None self.rpc_connected = False self.rpc = None self.log.debug("Node stopped") return True def wait_until_stopped(self, timeout=BITCOIND_PROC_WAIT_TIMEOUT): wait_until(self.is_node_stopped, timeout=timeout) def node_encrypt_wallet(self, passphrase): """"Encrypts the wallet. This causes bitcoind to shutdown, so this method takes care of cleaning up resources.""" self.encryptwallet(passphrase) self.wait_until_stopped() def relay_fee(self, cached=True): if not self.relay_fee_cache or not cached: self.relay_fee_cache = self.getnetworkinfo()["relayfee"] return self.relay_fee_cache def calculate_fee(self, tx): # Relay fee is in satoshis per KB. Thus the 1000, and the COIN added # to get back to an amount of satoshis. - return int(self.relay_fee() / 1000 * tx.billable_size() * COIN) + billable_size_estimate = tx.billable_size() + # Add some padding for signatures + # NOTE: Fees must be calculated before signatures are added, + # so they will never be included in the billable_size above. + billable_size_estimate += len(tx.vin) * 81 + + return int(self.relay_fee() / 1000 * billable_size_estimate * COIN) def calculate_fee_from_txid(self, txid): ctx = FromHex(CTransaction(), self.getrawtransaction(txid)) return self.calculate_fee(ctx) class TestNodeCLI(): """Interface to bitcoin-cli for an individual node""" def __init__(self, binary, datadir): self.args = [] self.binary = binary self.datadir = datadir self.input = None def __call__(self, *args, input=None): # TestNodeCLI is callable with bitcoin-cli command-line args self.args = [str(arg) for arg in args] self.input = input return self def __getattr__(self, command): def dispatcher(*args, **kwargs): return self.send_cli(command, *args, **kwargs) return dispatcher def send_cli(self, command, *args, **kwargs): """Run bitcoin-cli command. Deserializes returned string as python object.""" pos_args = [str(arg) for arg in args] named_args = [str(key) + "=" + str(value) for (key, value) in kwargs.items()] assert not ( pos_args and named_args), "Cannot use positional arguments and named arguments in the same bitcoin-cli call" p_args = [self.binary, "-datadir=" + self.datadir] + self.args if named_args: p_args += ["-named"] p_args += [command] + pos_args + named_args process = subprocess.Popen(p_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) cli_stdout, cli_stderr = process.communicate(input=self.input) returncode = process.poll() if returncode: # Ignore cli_stdout, raise with cli_stderr raise subprocess.CalledProcessError( returncode, self.binary, output=cli_stderr) return json.loads(cli_stdout, parse_float=decimal.Decimal)