Changeset View
Changeset View
Standalone View
Standalone View
contrib/linearize/linearize-data.py
Show All 19 Lines | |||||
from binascii import unhexlify
from collections import namedtuple
from typing import Any, Dict

# Key/value settings parsed from the config file named on the command line
# (see the __main__ block below, which also fills in defaults and coerces
# the numeric values to int).
settings: Dict[str, Any] = {}
def hex_switchEndian(s):
    """Reverse a hex string in two-character (byte) groups.

    E.g. "f9beb4d9" -> "d9b4bef9".  A trailing odd character forms its own
    (short) group, matching the pairwise split.
    """
    pairs = [s[i : i + 2] for i in range(0, len(s), 2)]
    return "".join(reversed(pairs))
def uint32(x):
    """Truncate the integer *x* to an unsigned 32-bit value."""
    return x & ((1 << 32) - 1)
def bytereverse(x):
    """Return the 32-bit word *x* with its four bytes in opposite order."""
    # Move each byte to its mirrored position, then truncate to 32 bits.
    swapped = (
        (x << 24)
        | ((x << 8) & 0x00FF0000)
        | ((x >> 8) & 0x0000FF00)
        | (x >> 24)
    )
    return uint32(swapped)
def bufreverse(in_buf):
    """Byte-swap every native 32-bit word of *in_buf*; word order is kept."""
    words = (
        struct.unpack("@I", in_buf[off : off + 4])[0]
        for off in range(0, len(in_buf), 4)
    )
    return b"".join(struct.pack("@I", bytereverse(w)) for w in words)
def wordreverse(in_buf):
    """Reverse the order of the 4-byte words in *in_buf*.

    Bytes inside each word keep their relative order.
    """
    words = [in_buf[pos : pos + 4] for pos in range(0, len(in_buf), 4)]
    return b"".join(words[::-1])
def calc_hdr_hash(blk_hdr):
    """Return the double-SHA256 digest (32 raw bytes) of *blk_hdr*."""
    return hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
def calc_hash_str(blk_hdr):
    """Return the block hash of *blk_hdr* as a hex string.

    Double-SHA256 the header, then reverse bytes within each 32-bit word and
    reverse the word order; combined, this reverses the whole 32-byte digest,
    giving the conventional display byte order.
    """
    return wordreverse(bufreverse(calc_hdr_hash(blk_hdr))).hex()
def get_blk_dt(blk_hdr):
    """Extract the timestamp from an 80-byte serialized block header.

    Returns (dt_ym, nTime): the first day of the block's month as a naive
    local-time datetime, and the raw little-endian nTime field (bytes 68-71).
    """
    (nTime,) = struct.unpack_from("<I", blk_hdr, 68)
    dt = datetime.datetime.fromtimestamp(nTime)
    return (datetime.datetime(dt.year, dt.month, 1), nTime)
# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read the ordered block-hash list from settings['hashlist'].

    One hash per line, returned in file order.  When
    settings['rev_hash_bytes'] is 'true', each line is byte-reversed via
    hex_switchEndian() so the returned hashes are in canonical byte order.
    """
    blkindex = []
    # Use a context manager so the hashlist file is always closed (the
    # previous version leaked the handle on exceptions).
    with open(settings["hashlist"], "r", encoding="utf8") as f:
        for line in f:
            line = line.rstrip()
            if settings["rev_hash_bytes"] == "true":
                line = hex_switchEndian(line)
            blkindex.append(line)
    print("Read " + str(len(blkindex)) + " hashes")
    return blkindex
# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Map each block hash to its height (its index in the ordered list)."""
    return {blockhash: height for height, blockhash in enumerate(blkindex)}
# Block header and extent on disk | # Block header and extent on disk | ||||
BlockExtent = namedtuple( | BlockExtent = namedtuple("BlockExtent", ["fn", "offset", "inhdr", "blkhdr", "size"]) | ||||
'BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) | |||||
class BlockDataCopier:
    """Copies raw blocks from blk*.dat input files into linearized output.

    Blocks are written in height order (as given by blkindex/blkmap) to a
    single output file or to a series of size- or month-split files.
    Out-of-order blocks are remembered as BlockExtents and, cache space
    permitting, buffered in memory until their height comes up.

    Fixes relative to the previous version:
      * run() now uses self.blkmap / self.settings instead of silently
        relying on the module-level globals of the same names.
      * The zero-padding check compared a bytes element (an int) against the
        str "\\0", which is never equal in Python 3; it now compares to 0.
    """

    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        self.inFn = 0        # current input blk file number
        self.inF = None      # open input file handle (or None)
        self.outFn = 0       # current output file number (directory mode)
        self.outsz = 0       # bytes written to the current output file
        self.outF = None     # open output file handle (or None)
        self.outFname = None
        self.blkCountIn = 0  # blocks scanned from input
        self.blkCountOut = 0  # blocks written to output == next wanted height

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings["max_out_sz"]
        if "output" in settings:
            self.fileOutput = False
        if settings["file_timestamp"] != 0:
            self.setFileTime = True
        if settings["split_timestamp"] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        """Append one block record to the output, rotating files as needed."""
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        # Size-based rotation (directory-output mode only).
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        # Month-based rotation when split_timestamp is enabled.
        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings["output_file"]
            else:
                self.outFname = os.path.join(
                    self.settings["output"], f"blk{self.outFn:05d}.dat"
                )
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print(
                "{} blocks scanned, {} blocks written (of {}, {:.1f}% complete)".format(
                    self.blkCountIn,
                    self.blkCountOut,
                    len(self.blkindex),
                    100.0 * self.blkCountOut / len(self.blkindex),
                )
            )

    def inFileName(self, fn):
        """Return the path of input file number *fn* (blkNNNNN.dat)."""
        return os.path.join(self.settings["input"], f"blk{fn:05d}.dat")

    def fetchBlock(self, extent):
        """Fetch block contents from disk given extents"""
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        """Find the next block to be written in the input, and copy it to the output."""
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the
            # cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)
        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        """Scan the input blk files and emit all known blocks in height order."""
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # End of data, or zero padding at the end of the file: move on to
            # the next input file.  (inhdr[0] is an int in Python 3; the old
            # comparison against the str "\0" could never be true.)
            if not inhdr or (inhdr[0] == 0):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if inMagic != self.settings["netmagic"]:
                print("Invalid magic: " + inMagic.hex())
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of
                # 0.10, the script may encounter blocks it doesn't know
                # about.  Treat as debug output.
                if self.settings["debug_output"] == "true":
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()
            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings["out_of_order_cache_sz"]:
                    # If there is space in the cache, read the data.  Reading
                    # the data in file sequence instead of seeking and
                    # fetching it later is preferred, but we don't want to
                    # fill up memory.
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print(f"Done ({self.blkCountOut} blocks written)")
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    # Parse simple "key = value" config lines; '#'-prefixed lines are
    # comments.  'with' closes the config file even on a parse exception.
    with open(sys.argv[1], encoding="utf8") as f:
        for line in f:
            # skip comment lines
            if re.search(r"^\s*#", line):
                continue
            # parse key=value lines
            m = re.search(r"^(\w+)\s*=\s*(\S.*)$", line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons
    # easier.  Also place upfront in case any settings need to know about it.
    if "rev_hash_bytes" not in settings:
        settings["rev_hash_bytes"] = "false"
    settings["rev_hash_bytes"] = settings["rev_hash_bytes"].lower()

    # Fill in defaults (Bitcoin mainnet network magic and genesis hash).
    if "netmagic" not in settings:
        settings["netmagic"] = "f9beb4d9"
    if "genesis" not in settings:
        settings["genesis"] = (
            "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f"
        )
    if "input" not in settings:
        settings["input"] = "input"
    if "hashlist" not in settings:
        settings["hashlist"] = "hashlist.txt"
    if "file_timestamp" not in settings:
        settings["file_timestamp"] = 0
    if "split_timestamp" not in settings:
        settings["split_timestamp"] = 0
    if "max_out_sz" not in settings:
        settings["max_out_sz"] = 1000 * 1000 * 1000
    if "out_of_order_cache_sz" not in settings:
        settings["out_of_order_cache_sz"] = 100 * 1000 * 1000
    if "debug_output" not in settings:
        settings["debug_output"] = "false"

    # Normalize setting types (config values arrive as strings).
    settings["max_out_sz"] = int(settings["max_out_sz"])
    settings["split_timestamp"] = int(settings["split_timestamp"])
    settings["file_timestamp"] = int(settings["file_timestamp"])
    settings["netmagic"] = unhexlify(settings["netmagic"].encode("utf-8"))
    settings["out_of_order_cache_sz"] = int(settings["out_of_order_cache_sz"])
    settings["debug_output"] = settings["debug_output"].lower()

    if "output_file" not in settings and "output" not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings["genesis"] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()