#!/usr/bin/python3 import base64 import collections import hashlib import re import sys DEBUG = False RE_HEADER = r'_ibdmp:[0-9]+/[0-9]+[|]chunks=[0-9]+,md5=[0-9a-f]+[|]' RE_CHUNK = r'_ibdmp:[0-9]+/[0-9]+[|][0-9]+:[A-Za-z0-9+/=]+[|]' def USAGE(): self = sys.argv[0].split('/')[-1] lines = [ "usage: %s path/to/console.log path/to/target.tar.xz" % self, "", "Decode debug tarball emitted by leapp's initramfs in-band", "console debugger, ibdmp().", ] sys.stderr.writelines('%s\n' % l for l in lines) sys.exit(2) def LOG_DEBUG(msg): if DEBUG: sys.stderr.write('DEBUG: %s\n' % msg) def LOG_WARN(msg): sys.stderr.write('%s\n' % msg) class IbdmpDecodeError(ValueError): pass class UsageError(ValueError): pass class _Chunk: @classmethod def from_raw1(cls, raw_chunk): if not raw_chunk.startswith('_ibdmp:'): LOG_WARN("invalid chunk payload (no '_ibdmp:'?): %s" % raw_chunk) raise IbdmpDecodeError(raw_chunk) areas = raw_chunk.split('|') parts = areas[1].split(':') return cls( ordinal=int(parts[0]), payload=str(parts[1]), ) def __init__(self, ordinal, payload): self.ordinal = ordinal self.payload = payload class Header: """ Chunk set header """ @classmethod def from_rawN(cls, raw_headers): """ Initialize chunk header from header chunk candidates raw_headers is a list of strings that contain encoded chunk parameters for the whole chunk set, ie. number of chunks, number of iterations, and MD5 hash of the content encoded in the chunk set. Raw header chunks can be corrupted so this factory will choose winner based on prevalence. For chunk set example in ChunkCounter.__init__ corresponding raw headers could look similar to this: _ibdmp:1/3|chunks=2,md5=281cc34e13cb4a502abd340fd07c4020| _ibdmp:2/3|chunks=2,md5=281cc34e13cb4a502abd340fd07c4020| _ibdmp:3/3|chun?s=2,md5=281cc34e13cb4a502abd340fd07c4020| In this case, the winner is the first and second one. """ cntr = collections.Counter([ Header._from_raw1(rh) for rh in raw_headers ]) if not cntr: LOG_WARN("no dumps found in this console log") raise IbdmpDecodeError() winner = cntr.most_common()[0][0] LOG_DEBUG("header winner: %s" % winner) return winner @classmethod def _from_raw1(cls, raw_header): parts = raw_header.split('|') _, stats = parts[0].split(':') pairs = parts[1].split(',') if not pairs[0].startswith('chunks='): LOG_WARN("invalid header chunk payload (no chunks=?): %s" % raw_header) raise IbdmpDecodeError(raw_header) if not pairs[1].startswith('md5='): LOG_WARN("invalid header chunk payload (no md5=?): %s" % raw_header) raise IbdmpDecodeError(raw_header) return cls( chunks=int(pairs[0].split('=')[1]), md5=str(pairs[1].split('=')[1]), csets=int(stats.split('/')[1]), ) def __init__(self, chunks, md5, csets): self.chunks = chunks self.md5 = md5 self.csets = csets def __eq__(self, othr): return (self.chunks, self.md5) == (othr.chunks, othr.md5) def __hash__(self): return hash((self.chunks, self.md5)) def __neq__(self, othr): return not self.__eq__(othr) def __str__(self): return ("Header(csets=%r,chunks=%r,md5=%r)" % (self.csets, self.chunks, self.md5)) class ChunkCounter: """ Chunk collector Initialize with Header that you have some condfidence in (see Header.from_rawN), and set of raw chunks. Chunks could be corrupted but they should come in N replicated sets, so for every position in the chunk set, the initializer will select most prevalent variant of the given chunk. Eg. if chunk set was: _ibdmp:1/3|1:A/sl1cEofBASe64/| _ibdmp:1/3|2:paDD3d==========| _ibdmp:2/3|1:A/sl1cEofBASe64/| _ibdmp:2/3|2:paDD3d========!=| _ibdmp:3/3|1:A/sl1cEofBASe64/| _ibdmp:3/3|2:paDD3d==========| on position 2, the corrupted chunk will be removed. Use decode() to get the encoded tarball bytes, or decode_to() to write it to a file. """ def __init__(self, header, raw_chunks): self.header = header self._bagset = collections.defaultdict(collections.Counter) LOG_DEBUG('header.chunks=%r' % header.chunks) for cr in raw_chunks: c = _Chunk.from_raw1(cr) LOG_DEBUG('c.ordinal=%r' % c.ordinal) self._bagset[c.ordinal].update([c.payload]) @property def chunks(self): """ Selected chunks from all known """ out = [] for idx in range(1, self.header.chunks + 1): cbag = self._bagset.get(idx) if not cbag: sys.stderr.write('Missing chunk id: %d/%d\n' % (idx, self.header.chunks)) continue winner, score = cbag.most_common()[0] confidence = 100 * (score / self.header.csets) LOG_DEBUG("chunk position winner: %d: %s (%d%%)" % (idx, winner, confidence)) out.append(winner) return out def decode(self): """ Decode tarball from valid chunk data """ tarball = base64.b64decode(''.join(self.chunks)) tarball_md5 = hashlib.md5(tarball).hexdigest() if not tarball_md5 == self.header.md5: LOG_WARN("MD5 mismatch: %s != %s" % (tarball_md5, self.header.md5)) return tarball def decode_to(self, tarpath): """ Decode and write tarball to *path*. """ with open(tarpath, 'w') as f: f.buffer.write(self.decode()) def readwin2(fh): """ From filehandle *fh*, yield joined lines 1+2, then 2+3, etc. Whitespace is stripped before joining. """ a = fh.readline() if not a: return while True: b = fh.readline() if not b: return out = a.rstrip() + b.rstrip() a, b = b, None yield out def main(args): LOG_DEBUG(args) try: source, target = args except ValueError: raise UsageError() raw_headers = set() raw_chunks = set() with open(source) as f: for jline in readwin2(f): for m in re.findall(RE_HEADER, jline): raw_headers.add(m) for m in re.findall(RE_CHUNK, jline): raw_chunks.add(m) if not raw_headers: LOG_WARN("no headers found") raise IbdmpDecodeError() LOG_DEBUG("raw headers found: %d" % len(raw_headers)) if not raw_chunks: LOG_WARN("no chunks found") raise IbdmpDecodeError() LOG_DEBUG("raw chunks found: %d" % len(raw_chunks)) header = Header.from_rawN(raw_headers) ccounter = ChunkCounter(header, raw_chunks) ccounter.decode_to(target) if __name__ == '__main__': try: main(sys.argv[1:]) except UsageError: USAGE() except IbdmpDecodeError: sys.exit(3)