Skip to content

Commit 066df4f

Browse files
jjolly, gpshead
authored and committed
bpo-22908: Add seek and tell functionality to ZipExtFile (GH-4966)
This allows for nested zip files, tar files within zip files, zip files within tar files, etc. Contributed by: John Jolly
1 parent 2e0ecde commit 066df4f

File tree

4 files changed

+121
-3
lines changed

4 files changed

+121
-3
lines changed

Doc/library/zipfile.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,9 @@ ZipFile Objects
246246
With *mode* ``'r'`` the file-like object
247247
(``ZipExtFile``) is read-only and provides the following methods:
248248
:meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`,
249-
:meth:`~io.IOBase.readlines`, :meth:`__iter__`,
250-
:meth:`~iterator.__next__`. These objects can operate independently of
251-
the ZipFile.
249+
:meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`,
250+
:meth:`~io.IOBase.tell`, :meth:`__iter__`, :meth:`~iterator.__next__`.
251+
These objects can operate independently of the ZipFile.
252252

253253
With ``mode='w'``, a writable file handle is returned, which supports the
254254
:meth:`~io.BufferedIOBase.write` method. While a writable file handle is open,

Lib/test/test_zipfile.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,6 +1628,40 @@ def test_open_conflicting_handles(self):
16281628
self.assertEqual(zipf.read('baz'), msg3)
16291629
self.assertEqual(zipf.namelist(), ['foo', 'bar', 'baz'])
16301630

1631+
def test_seek_tell(self):
    """Check seek() and tell() on a member opened from disk and from memory."""
    payload = b"Where's Bruce?"
    name_pos = payload.find(b"Bruce")

    def exercise(archive):
        # Store one member, then reopen the same archive for reading.
        with zipfile.ZipFile(archive, mode="w") as zipf:
            zipf.writestr("foo.txt", payload)
        with zipfile.ZipFile(archive, mode="r") as zipf:
            with zipf.open("foo.txt", "r") as fp:
                # Absolute seek forward.
                fp.seek(name_pos, os.SEEK_SET)
                self.assertEqual(fp.tell(), name_pos)
                # Relative seek back to the start.
                fp.seek(-name_pos, os.SEEK_CUR)
                self.assertEqual(fp.tell(), 0)
                # Relative seek forward again, then read from there.
                fp.seek(name_pos, os.SEEK_CUR)
                self.assertEqual(fp.tell(), name_pos)
                self.assertEqual(fp.read(5),
                                 payload[name_pos:name_pos + 5])
                # Seek to the end of the member.
                fp.seek(0, os.SEEK_END)
                self.assertEqual(fp.tell(), len(payload))

    # Check seek on a file
    exercise(TESTFN)
    # Check seek on memory file
    exercise(io.BytesIO())
1664+
16311665
def tearDown(self):
16321666
unlink(TESTFN)
16331667
unlink(TESTFN2)

Lib/zipfile.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,18 @@ def __init__(self, file, pos, close, lock, writing):
696696
self._close = close
697697
self._lock = lock
698698
self._writing = writing
699+
self.seekable = file.seekable
700+
self.tell = file.tell
701+
702+
def seek(self, offset, whence=0):
    """Reposition the shared underlying file and remember the new position.

    offset and whence have the same meaning as for the wrapped file's
    own seek().  A relative seek (whence == 1) is resolved against the
    position recorded in self._pos rather than the underlying file's
    current position, which may differ from self._pos.

    Returns the new absolute position, which is also stored in
    self._pos.

    Raises ValueError if self._writing() reports an open writing handle.
    """
    with self._lock:
        # The attribute set in __init__ is ``_writing``; there is no
        # ``writing`` attribute to call.
        if self._writing():
            raise ValueError("Can't reposition in the ZIP file while "
                             "there is an open writing handle on it. "
                             "Close the writing handle before trying to read.")
        if whence == 1:
            # Relative to *our* position, not wherever the shared file is.
            self._file.seek(self._pos + offset)
        else:
            self._file.seek(offset, whence)
        self._pos = self._file.tell()
        return self._pos
699711

700712
def read(self, n=-1):
701713
with self._lock:
@@ -746,6 +758,9 @@ class ZipExtFile(io.BufferedIOBase):
746758
# Read from compressed files in 4k blocks.
747759
MIN_READ_SIZE = 4096
748760

761+
# Chunk size to read during seek
762+
MAX_SEEK_READ = 1 << 24
763+
749764
def __init__(self, fileobj, mode, zipinfo, decrypter=None,
750765
close_fileobj=False):
751766
self._fileobj = fileobj
@@ -778,6 +793,17 @@ def __init__(self, fileobj, mode, zipinfo, decrypter=None,
778793
else:
779794
self._expected_crc = None
780795

796+
self._seekable = False
797+
try:
798+
if fileobj.seekable():
799+
self._orig_compress_start = fileobj.tell()
800+
self._orig_compress_size = zipinfo.compress_size
801+
self._orig_file_size = zipinfo.file_size
802+
self._orig_start_crc = self._running_crc
803+
self._seekable = True
804+
except AttributeError:
805+
pass
806+
781807
def __repr__(self):
782808
result = ['<%s.%s' % (self.__class__.__module__,
783809
self.__class__.__qualname__)]
@@ -963,6 +989,62 @@ def close(self):
963989
finally:
964990
super().close()
965991

992+
def seekable(self):
    """Return the seekability flag stored on this object as ``_seekable``."""
    can_seek = self._seekable
    return can_seek
994+
995+
def seek(self, offset, whence=0):
    """Move to a new position in the uncompressed data and return it.

    whence is 0 (from the start), 1 (from the current position) or
    2 (from the end of the uncompressed data).  The target position is
    clamped to the range [0, uncompressed size].

    If the target lies inside the current read buffer only the buffer
    index moves.  A backward seek outside the buffer rewinds the
    underlying file to the start of the compressed data, resets the
    decompression state and reads forward to the target.  Forward seeks
    read and discard data in chunks of at most MAX_SEEK_READ bytes.

    Raises io.UnsupportedOperation if the underlying stream is not
    seekable, and ValueError for an unknown whence.
    """
    if not self._seekable:
        raise io.UnsupportedOperation("underlying stream is not seekable")
    curr_pos = self.tell()
    if whence == 0:  # Seek from start of file
        new_pos = offset
    elif whence == 1:  # Seek from current position
        new_pos = curr_pos + offset
    elif whence == 2:  # Seek from EOF
        new_pos = self._orig_file_size + offset
    else:
        raise ValueError("whence must be os.SEEK_SET (0), "
                         "os.SEEK_CUR (1), or os.SEEK_END (2)")

    # Clamp the target to the bounds of the uncompressed data.
    if new_pos > self._orig_file_size:
        new_pos = self._orig_file_size

    if new_pos < 0:
        new_pos = 0

    read_offset = new_pos - curr_pos
    buff_offset = read_offset + self._offset

    if buff_offset >= 0 and buff_offset < len(self._readbuffer):
        # Just move the _offset index if the new position is in the
        # _readbuffer
        self._offset = buff_offset
        read_offset = 0
    elif read_offset < 0:
        # Position is before the current position. Reset the ZipExtFile
        # to its initial state and read forward from the beginning.
        # NOTE(review): no decrypter state is reset here -- confirm the
        # behaviour for encrypted members.
        self._fileobj.seek(self._orig_compress_start)
        self._running_crc = self._orig_start_crc
        self._compress_left = self._orig_compress_size
        self._left = self._orig_file_size
        self._readbuffer = b''
        self._offset = 0
        # _get_decompressor is referenced directly: this code runs inside
        # the zipfile module, where the name ``zipfile`` is not bound.
        self._decompressor = _get_decompressor(self._compress_type)
        self._eof = False
        read_offset = new_pos

    # Skip forward by reading and discarding bounded chunks.
    while read_offset > 0:
        read_len = min(self.MAX_SEEK_READ, read_offset)
        self.read(read_len)
        read_offset -= read_len

    return self.tell()
1041+
1042+
def tell(self):
    """Return the current position within the uncompressed data.

    Raises io.UnsupportedOperation if the underlying stream is not
    seekable.
    """
    if self._seekable:
        # Bytes already pulled out of the member, minus the part of the
        # read buffer that has not been handed to the caller yet.
        produced = self._orig_file_size - self._left
        pending = len(self._readbuffer) - self._offset
        return produced - pending
    raise io.UnsupportedOperation("underlying stream is not seekable")
1047+
9661048

9671049
class _ZipWriteFile(io.BufferedIOBase):
9681050
def __init__(self, zf, zinfo, zip64):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Added seek and tell to the ZipExtFile class. This only works if the file
2+
object used to open the zipfile is seekable.

0 commit comments

Comments
 (0)