You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

266 lines
7.3 KiB

11 months ago
#!/usr/bin/python3
import base64
import collections
import hashlib
import re
import sys
DEBUG = False
RE_HEADER = r'_ibdmp:[0-9]+/[0-9]+[|]chunks=[0-9]+,md5=[0-9a-f]+[|]'
RE_CHUNK = r'_ibdmp:[0-9]+/[0-9]+[|][0-9]+:[A-Za-z0-9+/=]+[|]'
def USAGE():
self = sys.argv[0].split('/')[-1]
lines = [
"usage: %s path/to/console.log path/to/target.tar.xz" % self,
"",
"Decode debug tarball emitted by leapp's initramfs in-band",
"console debugger, ibdmp().",
]
sys.stderr.writelines('%s\n' % l for l in lines)
sys.exit(2)
def LOG_DEBUG(msg):
if DEBUG:
sys.stderr.write('DEBUG: %s\n' % msg)
def LOG_WARN(msg):
sys.stderr.write('%s\n' % msg)
class IbdmpDecodeError(ValueError):
pass
class UsageError(ValueError):
pass
class _Chunk:
@classmethod
def from_raw1(cls, raw_chunk):
if not raw_chunk.startswith('_ibdmp:'):
LOG_WARN("invalid chunk payload (no '_ibdmp:'?): %s"
% raw_chunk)
raise IbdmpDecodeError(raw_chunk)
areas = raw_chunk.split('|')
parts = areas[1].split(':')
return cls(
ordinal=int(parts[0]),
payload=str(parts[1]),
)
def __init__(self, ordinal, payload):
self.ordinal = ordinal
self.payload = payload
class Header:
"""
Chunk set header
"""
@classmethod
def from_rawN(cls, raw_headers):
"""
Initialize chunk header from header chunk candidates
raw_headers is a list of strings that contain encoded chunk
parameters for the whole chunk set, ie. number of chunks, number
of iterations, and MD5 hash of the content encoded in the chunk set.
Raw header chunks can be corrupted so this factory will choose
winner based on prevalence.
For chunk set example in ChunkCounter.__init__ corresponding
raw headers could look similar to this:
_ibdmp:1/3|chunks=2,md5=281cc34e13cb4a502abd340fd07c4020|
_ibdmp:2/3|chunks=2,md5=281cc34e13cb4a502abd340fd07c4020|
_ibdmp:3/3|chun?s=2,md5=281cc34e13cb4a502abd340fd07c4020|
In this case, the winner is the first and second one.
"""
cntr = collections.Counter([
Header._from_raw1(rh) for rh in raw_headers
])
if not cntr:
LOG_WARN("no dumps found in this console log")
raise IbdmpDecodeError()
winner = cntr.most_common()[0][0]
LOG_DEBUG("header winner: %s" % winner)
return winner
@classmethod
def _from_raw1(cls, raw_header):
parts = raw_header.split('|')
_, stats = parts[0].split(':')
pairs = parts[1].split(',')
if not pairs[0].startswith('chunks='):
LOG_WARN("invalid header chunk payload (no chunks=?): %s"
% raw_header)
raise IbdmpDecodeError(raw_header)
if not pairs[1].startswith('md5='):
LOG_WARN("invalid header chunk payload (no md5=?): %s"
% raw_header)
raise IbdmpDecodeError(raw_header)
return cls(
chunks=int(pairs[0].split('=')[1]),
md5=str(pairs[1].split('=')[1]),
csets=int(stats.split('/')[1]),
)
def __init__(self, chunks, md5, csets):
self.chunks = chunks
self.md5 = md5
self.csets = csets
def __eq__(self, othr):
return (self.chunks, self.md5) == (othr.chunks, othr.md5)
def __hash__(self):
return hash((self.chunks, self.md5))
def __neq__(self, othr):
return not self.__eq__(othr)
def __str__(self):
return ("Header(csets=%r,chunks=%r,md5=%r)"
% (self.csets, self.chunks, self.md5))
class ChunkCounter:
"""
Chunk collector
Initialize with Header that you have some condfidence in
(see Header.from_rawN), and set of raw chunks.
Chunks could be corrupted but they should come in N replicated
sets, so for every position in the chunk set, the initializer
will select most prevalent variant of the given chunk.
Eg. if chunk set was:
_ibdmp:1/3|1:A/sl1cEofBASe64/|
_ibdmp:1/3|2:paDD3d==========|
_ibdmp:2/3|1:A/sl1cEofBASe64/|
_ibdmp:2/3|2:paDD3d========!=|
_ibdmp:3/3|1:A/sl1cEofBASe64/|
_ibdmp:3/3|2:paDD3d==========|
on position 2, the corrupted chunk will be removed.
Use decode() to get the encoded tarball bytes, or decode_to()
to write it to a file.
"""
def __init__(self, header, raw_chunks):
self.header = header
self._bagset = collections.defaultdict(collections.Counter)
LOG_DEBUG('header.chunks=%r' % header.chunks)
for cr in raw_chunks:
c = _Chunk.from_raw1(cr)
LOG_DEBUG('c.ordinal=%r' % c.ordinal)
self._bagset[c.ordinal].update([c.payload])
@property
def chunks(self):
"""
Selected chunks from all known
"""
out = []
for idx in range(1, self.header.chunks + 1):
cbag = self._bagset.get(idx)
if not cbag:
sys.stderr.write('Missing chunk id: %d/%d\n'
% (idx, self.header.chunks))
continue
winner, score = cbag.most_common()[0]
confidence = 100 * (score / self.header.csets)
LOG_DEBUG("chunk position winner: %d: %s (%d%%)"
% (idx, winner, confidence))
out.append(winner)
return out
def decode(self):
"""
Decode tarball from valid chunk data
"""
tarball = base64.b64decode(''.join(self.chunks))
tarball_md5 = hashlib.md5(tarball).hexdigest()
if not tarball_md5 == self.header.md5:
LOG_WARN("MD5 mismatch: %s != %s" % (tarball_md5, self.header.md5))
return tarball
def decode_to(self, tarpath):
"""
Decode and write tarball to *path*.
"""
with open(tarpath, 'w') as f:
f.buffer.write(self.decode())
def readwin2(fh):
"""
From filehandle *fh*, yield joined lines 1+2, then 2+3,
etc. Whitespace is stripped before joining.
"""
a = fh.readline()
if not a:
return
while True:
b = fh.readline()
if not b:
return
out = a.rstrip() + b.rstrip()
a, b = b, None
yield out
def main(args):
LOG_DEBUG(args)
try:
source, target = args
except ValueError:
raise UsageError()
raw_headers = set()
raw_chunks = set()
with open(source) as f:
for jline in readwin2(f):
for m in re.findall(RE_HEADER, jline):
raw_headers.add(m)
for m in re.findall(RE_CHUNK, jline):
raw_chunks.add(m)
if not raw_headers:
LOG_WARN("no headers found")
raise IbdmpDecodeError()
LOG_DEBUG("raw headers found: %d" % len(raw_headers))
if not raw_chunks:
LOG_WARN("no chunks found")
raise IbdmpDecodeError()
LOG_DEBUG("raw chunks found: %d" % len(raw_chunks))
header = Header.from_rawN(raw_headers)
ccounter = ChunkCounter(header, raw_chunks)
ccounter.decode_to(target)
if __name__ == '__main__':
try:
main(sys.argv[1:])
except UsageError:
USAGE()
except IbdmpDecodeError:
sys.exit(3)