From 066df4fd454d6ff9be66e80b2a65995b10af174f Mon Sep 17 00:00:00 2001 From: John Jolly Date: Tue, 30 Jan 2018 01:51:35 -0700 Subject: [PATCH] bpo-22908: Add seek and tell functionality to ZipExtFile (GH-4966) This allows for nested zip files, tar files within zip files, zip files within tar files, etc. Contributed by: John Jolly --- Doc/library/zipfile.rst | 6 +- Lib/test/test_zipfile.py | 34 ++++++++ Lib/zipfile.py | 82 +++++++++++++++++++ .../2017-12-21-22-00-11.bpo-22908.cVm89I.rst | 2 + 4 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index d58efe0b417516..7c9a8c80225491 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -246,9 +246,9 @@ ZipFile Objects With *mode* ``'r'`` the file-like object (``ZipExtFile``) is read-only and provides the following methods: :meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`, - :meth:`~io.IOBase.readlines`, :meth:`__iter__`, - :meth:`~iterator.__next__`. These objects can operate independently of - the ZipFile. + :meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`, + :meth:`~io.IOBase.tell`, :meth:`__iter__`, :meth:`~iterator.__next__`. + These objects can operate independently of the ZipFile. With ``mode='w'``, a writable file handle is returned, which supports the :meth:`~io.BufferedIOBase.write` method. While a writable file handle is open, diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 94db858a1517c4..61c3e349a69ef4 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1628,6 +1628,40 @@ def test_open_conflicting_handles(self): self.assertEqual(zipf.read('baz'), msg3) self.assertEqual(zipf.namelist(), ['foo', 'bar', 'baz']) + def test_seek_tell(self): + # Test seek functionality + txt = b"Where's Bruce?" + bloc = txt.find(b"Bruce") + # Check seek on a file + with zipfile.ZipFile(TESTFN, "w") as zipf: + zipf.writestr("foo.txt", txt) + with zipfile.ZipFile(TESTFN, "r") as zipf: + with zipf.open("foo.txt", "r") as fp: + fp.seek(bloc, os.SEEK_SET) + self.assertEqual(fp.tell(), bloc) + fp.seek(-bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), 0) + fp.seek(bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), bloc) + self.assertEqual(fp.read(5), txt[bloc:bloc+5]) + fp.seek(0, os.SEEK_END) + self.assertEqual(fp.tell(), len(txt)) + # Check seek on memory file + data = io.BytesIO() + with zipfile.ZipFile(data, mode="w") as zipf: + zipf.writestr("foo.txt", txt) + with zipfile.ZipFile(data, mode="r") as zipf: + with zipf.open("foo.txt", "r") as fp: + fp.seek(bloc, os.SEEK_SET) + self.assertEqual(fp.tell(), bloc) + fp.seek(-bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), 0) + fp.seek(bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), bloc) + self.assertEqual(fp.read(5), txt[bloc:bloc+5]) + fp.seek(0, os.SEEK_END) + self.assertEqual(fp.tell(), len(txt)) + def tearDown(self): unlink(TESTFN) unlink(TESTFN2) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index f9db45f58a2bde..5df7b1bf75b9d9 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -696,6 +696,18 @@ def __init__(self, file, pos, close, lock, writing): self._close = close self._lock = lock self._writing = writing + self.seekable = file.seekable + self.tell = file.tell + + def seek(self, offset, whence=0): + with self._lock: + if self.writing(): + raise ValueError("Can't reposition in the ZIP file while " + "there is an open writing handle on it. " + "Close the writing handle before trying to read.") + self._file.seek(self._pos) + self._pos = self._file.tell() + return self._pos def read(self, n=-1): with self._lock: @@ -746,6 +758,9 @@ class ZipExtFile(io.BufferedIOBase): # Read from compressed files in 4k blocks. MIN_READ_SIZE = 4096 + # Chunk size to read during seek + MAX_SEEK_READ = 1 << 24 + def __init__(self, fileobj, mode, zipinfo, decrypter=None, close_fileobj=False): self._fileobj = fileobj @@ -778,6 +793,17 @@ def __init__(self, fileobj, mode, zipinfo, decrypter=None, else: self._expected_crc = None + self._seekable = False + try: + if fileobj.seekable(): + self._orig_compress_start = fileobj.tell() + self._orig_compress_size = zipinfo.compress_size + self._orig_file_size = zipinfo.file_size + self._orig_start_crc = self._running_crc + self._seekable = True + except AttributeError: + pass + def __repr__(self): result = ['<%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)] @@ -963,6 +989,62 @@ def close(self): finally: super().close() + def seekable(self): + return self._seekable + + def seek(self, offset, whence=0): + if not self._seekable: + raise io.UnsupportedOperation("underlying stream is not seekable") + curr_pos = self.tell() + if whence == 0: # Seek from start of file + new_pos = offset + elif whence == 1: # Seek from current position + new_pos = curr_pos + offset + elif whence == 2: # Seek from EOF + new_pos = self._orig_file_size + offset + else: + raise ValueError("whence must be os.SEEK_SET (0), " + "os.SEEK_CUR (1), or os.SEEK_END (2)") + + if new_pos > self._orig_file_size: + new_pos = self._orig_file_size + + if new_pos < 0: + new_pos = 0 + + read_offset = new_pos - curr_pos + buff_offset = read_offset + self._offset + + if buff_offset >= 0 and buff_offset < len(self._readbuffer): + # Just move the _offset index if the new position is in the _readbuffer + self._offset = buff_offset + read_offset = 0 + elif read_offset < 0: + # Position is before the current position. Reset the ZipExtFile + + self._fileobj.seek(self._orig_compress_start) + self._running_crc = self._orig_start_crc + self._compress_left = self._orig_compress_size + self._left = self._orig_file_size + self._readbuffer = b'' + self._offset = 0 + self._decompressor = zipfile._get_decompressor(self._compress_type) + self._eof = False + read_offset = new_pos + + while read_offset > 0: + read_len = min(self.MAX_SEEK_READ, read_offset) + self.read(read_len) + read_offset -= read_len + + return self.tell() + + def tell(self): + if not self._seekable: + raise io.UnsupportedOperation("underlying stream is not seekable") + filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset + return filepos + class _ZipWriteFile(io.BufferedIOBase): def __init__(self, zf, zinfo, zip64): diff --git a/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst new file mode 100644 index 00000000000000..4f3cc0166019f1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst @@ -0,0 +1,2 @@ +Added seek and tell to the ZipExtFile class. This only works if the file +object used to open the zipfile is seekable. From 55beb125db2942b5362454e05542e9661e964a65 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 23 Apr 2024 14:29:31 +0200 Subject: [PATCH] gh-109858: Protect zipfile from "quoted-overlap" zipbomb (GH-110016) (GH-113916) Raise BadZipFile when try to read an entry that overlaps with other entry or central directory. (cherry picked from commit 66363b9a7b9fe7c99eba3a185b74c5fdbf842eba) --- Lib/test/test_zipfile.py | 60 +++++++++++++++++++ Lib/zipfile.py | 12 ++++ ...-09-28-13-15-51.gh-issue-109858.43e2dg.rst | 3 + 3 files changed, 75 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-09-28-13-15-51.gh-issue-109858.43e2dg.rst diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 7f82586..0379909 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1644,6 +1644,66 @@ class OtherTests(unittest.TestCase): fp.seek(0, os.SEEK_END) self.assertEqual(fp.tell(), len(txt)) + @requires_zlib + def test_full_overlap(self): + data = ( + b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e' + b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00a\xed' + b'\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\d\x0b`P' + b'K\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2' + b'\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00aPK' + b'\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e' + b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00bPK\x05' + b'\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00\x00/\x00\x00' + b'\x00\x00\x00' + ) + with zipfile.ZipFile(io.BytesIO(data), 'r') as zipf: + self.assertEqual(zipf.namelist(), ['a', 'b']) + zi = zipf.getinfo('a') + self.assertEqual(zi.header_offset, 0) + self.assertEqual(zi.compress_size, 16) + self.assertEqual(zi.file_size, 1033) + zi = zipf.getinfo('b') + self.assertEqual(zi.header_offset, 0) + self.assertEqual(zi.compress_size, 16) + self.assertEqual(zi.file_size, 1033) + self.assertEqual(len(zipf.read('a')), 1033) + with self.assertRaisesRegex(zipfile.BadZipFile, 'File name.*differ'): + zipf.read('b') + + @requires_zlib + def test_quoted_overlap(self): + data = ( + b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05Y\xfc' + b'8\x044\x00\x00\x00(\x04\x00\x00\x01\x00\x00\x00a\x00' + b'\x1f\x00\xe0\xffPK\x03\x04\x14\x00\x00\x00\x08\x00\xa0l' + b'H\x05\xe2\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00' + b'\x00\x00b\xed\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\' + b'd\x0b`PK\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0' + b'lH\x05Y\xfc8\x044\x00\x00\x00(\x04\x00\x00\x01' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' + b'\x00aPK\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0l' + b'H\x05\xe2\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00' + b'bPK\x05\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00' + b'\x00S\x00\x00\x00\x00\x00' + ) + with zipfile.ZipFile(io.BytesIO(data), 'r') as zipf: + self.assertEqual(zipf.namelist(), ['a', 'b']) + zi = zipf.getinfo('a') + self.assertEqual(zi.header_offset, 0) + self.assertEqual(zi.compress_size, 52) + self.assertEqual(zi.file_size, 1064) + zi = zipf.getinfo('b') + self.assertEqual(zi.header_offset, 36) + self.assertEqual(zi.compress_size, 16) + self.assertEqual(zi.file_size, 1033) + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zipf.read('a') + self.assertEqual(len(zipf.read('b')), 1033) + def tearDown(self): unlink(TESTFN) unlink(TESTFN2) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 0ab9fac..e6d7676 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -338,6 +338,7 @@ class ZipInfo (object): 'compress_size', 'file_size', '_raw_time', + '_end_offset', ) def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): @@ -376,6 +377,7 @@ class ZipInfo (object): self.volume = 0 # Volume number of file header self.internal_attr = 0 # Internal attributes self.external_attr = 0 # External file attributes + self._end_offset = None # Start of the next local header or central directory # Other attributes are set by class ZipFile: # header_offset Byte offset to the file header # CRC CRC-32 of the uncompressed file @@ -1346,6 +1348,12 @@ class ZipFile: if self.debug > 2: print("total", total) + end_offset = self.start_dir + for zinfo in sorted(self.filelist, + key=lambda zinfo: zinfo.header_offset, + reverse=True): + zinfo._end_offset = end_offset + end_offset = zinfo.header_offset def namelist(self): """Return a list of file names in the archive.""" @@ -1500,6 +1508,10 @@ class ZipFile: 'File name in directory %r and header %r differ.' % (zinfo.orig_filename, fname)) + if (zinfo._end_offset is not None and + zef_file.tell() + zinfo.compress_size > zinfo._end_offset): + raise BadZipFile(f"Overlapped entries: {zinfo.orig_filename!r} (possible zip bomb)") + # check for encrypted flag & handle password is_encrypted = zinfo.flag_bits & 0x1 zd = None diff --git a/Misc/NEWS.d/next/Library/2023-09-28-13-15-51.gh-issue-109858.43e2dg.rst b/Misc/NEWS.d/next/Library/2023-09-28-13-15-51.gh-issue-109858.43e2dg.rst new file mode 100644 index 0000000..be279ca --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-09-28-13-15-51.gh-issue-109858.43e2dg.rst @@ -0,0 +1,3 @@ +Protect :mod:`zipfile` from "quoted-overlap" zipbomb. It now raises +BadZipFile when try to read an entry that overlaps with other entry or +central directory. -- 2.44.0