Changeset View
Changeset View
Standalone View
Standalone View
contrib/linearize/linearize-data.py
Show All 19 Lines | |||||
from binascii import unhexlify
from collections import namedtuple
from typing import Any, Dict

# Key/value settings parsed from the config file named on the command line
# (see the __main__ block below, which also fills in defaults and coerces
# the numeric values to int).
settings: Dict[str, Any] = {}
def hex_switchEndian(s):
    """Reverse a hex string in two-character (byte) groups.

    E.g. "f9beb4d9" -> "d9b4bef9".  A trailing odd character forms its own
    (short) group, matching the pairwise split.
    """
    pairs = [s[i : i + 2] for i in range(0, len(s), 2)]
    return "".join(reversed(pairs))
def uint32(x):
    """Truncate the integer *x* to an unsigned 32-bit value."""
    return x & ((1 << 32) - 1)
def bytereverse(x):
    """Return the 32-bit word *x* with its four bytes in opposite order."""
    # Move each byte to its mirrored position, then truncate to 32 bits.
    swapped = (
        (x << 24)
        | ((x << 8) & 0x00FF0000)
        | ((x >> 8) & 0x0000FF00)
        | (x >> 24)
    )
    return uint32(swapped)
def bufreverse(in_buf):
    """Byte-swap every native 32-bit word of *in_buf*; word order is kept."""
    words = (
        struct.unpack("@I", in_buf[off : off + 4])[0]
        for off in range(0, len(in_buf), 4)
    )
    return b"".join(struct.pack("@I", bytereverse(w)) for w in words)
def wordreverse(in_buf):
    """Reverse the order of the 4-byte words in *in_buf*.

    Bytes inside each word keep their relative order.
    """
    words = [in_buf[pos : pos + 4] for pos in range(0, len(in_buf), 4)]
    return b"".join(words[::-1])
def calc_hdr_hash(blk_hdr):
    """Return the double-SHA256 digest (32 raw bytes) of *blk_hdr*."""
    return hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
def calc_hash_str(blk_hdr):
    """Return the block hash of *blk_hdr* as a hex string.

    Double-SHA256 the header, then reverse bytes within each 32-bit word and
    reverse the word order; combined, this reverses the whole 32-byte digest,
    giving the conventional display byte order.
    """
    return wordreverse(bufreverse(calc_hdr_hash(blk_hdr))).hex()
def get_blk_dt(blk_hdr):
    """Extract the timestamp from an 80-byte serialized block header.

    Returns (dt_ym, nTime): the first day of the block's month as a naive
    local-time datetime, and the raw little-endian nTime field (bytes 68-71).
    """
    (nTime,) = struct.unpack_from("<I", blk_hdr, 68)
    dt = datetime.datetime.fromtimestamp(nTime)
    return (datetime.datetime(dt.year, dt.month, 1), nTime)
# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read the ordered block-hash list from settings['hashlist'].

    One hash per line, returned in file order.  When
    settings['rev_hash_bytes'] is 'true', each line is byte-reversed via
    hex_switchEndian() so the returned hashes are in canonical byte order.
    """
    blkindex = []
    # Use a context manager so the hashlist file is always closed (the
    # previous version leaked the handle on exceptions).
    with open(settings["hashlist"], "r", encoding="utf8") as f:
        for line in f:
            line = line.rstrip()
            if settings["rev_hash_bytes"] == "true":
                line = hex_switchEndian(line)
            blkindex.append(line)
    print("Read " + str(len(blkindex)) + " hashes")
    return blkindex
# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Map each block hash to its height (its index in the ordered list)."""
    return {blockhash: height for height, blockhash in enumerate(blkindex)}
# Block header and extent on disk | # Block header and extent on disk | ||||
BlockExtent = namedtuple( | BlockExtent = namedtuple("BlockExtent", ["fn", "offset", "inhdr", "blkhdr", "size"]) | ||||
'BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size']) | |||||
class BlockDataCopier:
    """Copies raw blocks from blk*.dat input files into linearized output.

    Blocks are written in height order (as given by blkindex/blkmap) to a
    single output file or to a series of size- or month-split files.
    Out-of-order blocks are remembered as BlockExtents and, cache space
    permitting, buffered in memory until their height comes up.

    Fixes relative to the previous version:
      * run() now uses self.blkmap / self.settings instead of silently
        relying on the module-level globals of the same names.
      * The zero-padding check compared a bytes element (an int) against the
        str "\\0", which is never equal in Python 3; it now compares to 0.
    """

    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        self.inFn = 0        # current input blk file number
        self.inF = None      # open input file handle (or None)
        self.outFn = 0       # current output file number (directory mode)
        self.outsz = 0       # bytes written to the current output file
        self.outF = None     # open output file handle (or None)
        self.outFname = None
        self.blkCountIn = 0  # blocks scanned from input
        self.blkCountOut = 0  # blocks written to output == next wanted height

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings["max_out_sz"]
        if "output" in settings:
            self.fileOutput = False
        if settings["file_timestamp"] != 0:
            self.setFileTime = True
        if settings["split_timestamp"] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        """Append one block record to the output, rotating files as needed."""
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        # Size-based rotation (directory-output mode only).
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        # Month-based rotation when split_timestamp is enabled.
        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings["output_file"]
            else:
                self.outFname = os.path.join(
                    self.settings["output"], f"blk{self.outFn:05d}.dat"
                )
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print(
                "{} blocks scanned, {} blocks written (of {}, {:.1f}% complete)".format(
                    self.blkCountIn,
                    self.blkCountOut,
                    len(self.blkindex),
                    100.0 * self.blkCountOut / len(self.blkindex),
                )
            )

    def inFileName(self, fn):
        """Return the path of input file number *fn* (blkNNNNN.dat)."""
        return os.path.join(self.settings["input"], f"blk{fn:05d}.dat")

    def fetchBlock(self, extent):
        """Fetch block contents from disk given extents"""
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        """Find the next block to be written in the input, and copy it to the output."""
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the
            # cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)
        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        """Scan the input blk files and emit all known blocks in height order."""
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # End of data, or zero padding at the end of the file: move on to
            # the next input file.  (inhdr[0] is an int in Python 3; the old
            # comparison against the str "\0" could never be true.)
            if not inhdr or (inhdr[0] == 0):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if inMagic != self.settings["netmagic"]:
                print("Invalid magic: " + inMagic.hex())
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of
                # 0.10, the script may encounter blocks it doesn't know
                # about.  Treat as debug output.
                if self.settings["debug_output"] == "true":
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()
            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings["out_of_order_cache_sz"]:
                    # If there is space in the cache, read the data.  Reading
                    # the data in file sequence instead of seeking and
                    # fetching it later is preferred, but we don't want to
                    # fill up memory.
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print(f"Done ({self.blkCountOut} blocks written)")
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    # Parse simple "key = value" config lines; '#'-prefixed lines are
    # comments.  'with' closes the config file even on a parse exception.
    with open(sys.argv[1], encoding="utf8") as f:
        for line in f:
            # skip comment lines
            if re.search(r"^\s*#", line):
                continue
            # parse key=value lines
            m = re.search(r"^(\w+)\s*=\s*(\S.*)$", line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons
    # easier.  Also place upfront in case any settings need to know about it.
    if "rev_hash_bytes" not in settings:
        settings["rev_hash_bytes"] = "false"
    settings["rev_hash_bytes"] = settings["rev_hash_bytes"].lower()

    # Fill in defaults (Bitcoin mainnet network magic and genesis hash).
    if "netmagic" not in settings:
        settings["netmagic"] = "f9beb4d9"
    if "genesis" not in settings:
        settings["genesis"] = (
            "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f"
        )
    if "input" not in settings:
        settings["input"] = "input"
    if "hashlist" not in settings:
        settings["hashlist"] = "hashlist.txt"
    if "file_timestamp" not in settings:
        settings["file_timestamp"] = 0
    if "split_timestamp" not in settings:
        settings["split_timestamp"] = 0
    if "max_out_sz" not in settings:
        settings["max_out_sz"] = 1000 * 1000 * 1000
    if "out_of_order_cache_sz" not in settings:
        settings["out_of_order_cache_sz"] = 100 * 1000 * 1000
    if "debug_output" not in settings:
        settings["debug_output"] = "false"

    # Normalize setting types (config values arrive as strings).
    settings["max_out_sz"] = int(settings["max_out_sz"])
    settings["split_timestamp"] = int(settings["split_timestamp"])
    settings["file_timestamp"] = int(settings["file_timestamp"])
    settings["netmagic"] = unhexlify(settings["netmagic"].encode("utf-8"))
    settings["out_of_order_cache_sz"] = int(settings["out_of_order_cache_sz"])
    settings["debug_output"] = settings["debug_output"].lower()

    if "output_file" not in settings and "output" not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings["genesis"] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()