Skip to content

Commit 8f70b16

Browse files
authored
gh-86094: Add support for Unicode Path Extra Field in ZipFile (gh-102566)
1 parent a28d4ed commit 8f70b16

File tree

4 files changed

+67
-13
lines changed

4 files changed

+67
-13
lines changed

Lib/test/test_zipfile/test_core.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1616,6 +1616,33 @@ def test_write_unicode_filenames(self):
16161616
self.assertEqual(zf.filelist[0].filename, "foo.txt")
16171617
self.assertEqual(zf.filelist[1].filename, "\xf6.txt")
16181618

1619+
@requires_zlib()
def test_read_zipfile_containing_unicode_path_extra_field(self):
    """Write a member carrying a Unicode Path extra field and read it back.

    The extra field (tag 0x7075) is assembled by hand, attached to a
    ZipInfo with a non-ASCII name, and the archive is then reopened to
    check that the member is listed under that name.
    """
    import zlib

    # A non-ASCII member name and its UTF-8 encoding.
    unicode_name = '이름.txt'
    utf8_name = unicode_name.encode('utf-8')

    # Payload layout: 1-byte version, little-endian CRC-32 of the
    # encoded name, then the UTF-8 encoded name itself.
    name_crc = struct.pack('<L', zlib.crc32(utf8_name))
    payload = b'\x01' + name_crc + utf8_name

    # Field header: 2-byte tag followed by the 2-byte little-endian
    # payload size.
    field_header = b'\x75\x70' + len(payload).to_bytes(2, 'little')

    member = zipfile.ZipInfo(unicode_name)
    member.extra = field_header + payload

    # Store the member in a fresh archive ...
    with zipfile.ZipFile(TESTFN, mode='w') as zf:
        zf.writestr(member, b'Hello World!')

    # ... and confirm the name is intact when the archive is read back.
    with zipfile.ZipFile(TESTFN, "r") as zf:
        self.assertEqual(zf.filelist[0].filename, "이름.txt")
1645+
16191646
def test_read_after_write_unicode_filenames(self):
16201647
with zipfile.ZipFile(TESTFN2, 'w') as zipfp:
16211648
zipfp.writestr('приклад', b'sample')

Lib/zipfile/__init__.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,22 @@ def _EndRecData(fpin):
338338
# Unable to find a valid end of central directory structure
339339
return None
340340

341+
def _sanitize_filename(filename):
    """Return *filename* truncated at its first null byte, with the
    platform directory separator replaced by forward slashes."""

    # Null bytes in file names are used as tricks by viruses in archives,
    # so drop the first null byte and everything that follows it.
    cut = filename.find(chr(0))
    if cut >= 0:
        filename = filename[:cut]

    # The ZIP format specification requires forward slashes as the
    # directory separator; nothing to do when the platform already uses
    # them or the name contains no separator at all.
    sep = os.sep
    if sep == "/" or sep not in filename:
        return filename
    return filename.replace(sep, "/")
356+
341357

342358
class ZipInfo (object):
343359
"""Class with attributes describing each file in the ZIP archive."""
@@ -368,16 +384,9 @@ class ZipInfo (object):
368384
def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
369385
self.orig_filename = filename # Original file name in archive
370386

371-
# Terminate the file name at the first null byte. Null bytes in file
372-
# names are used as tricks by viruses in archives.
373-
null_byte = filename.find(chr(0))
374-
if null_byte >= 0:
375-
filename = filename[0:null_byte]
376-
# This is used to ensure paths in generated ZIP files always use
377-
# forward slashes as the directory separator, as required by the
378-
# ZIP format specification.
379-
if os.sep != "/" and os.sep in filename:
380-
filename = filename.replace(os.sep, "/")
387+
# Terminate the file name at the first null byte and
388+
# ensure paths always use forward slashes as the directory separator.
389+
filename = _sanitize_filename(filename)
381390

382391
self.filename = filename # Normalized file name
383392
self.date_time = date_time # year, month, day, hour, min, sec
@@ -482,7 +491,7 @@ def _encodeFilenameFlags(self):
482491
except UnicodeEncodeError:
483492
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
484493

485-
def _decodeExtra(self):
494+
def _decodeExtra(self, filename_crc):
486495
# Try to decode the extra field.
487496
extra = self.extra
488497
unpack = struct.unpack
@@ -508,6 +517,21 @@ def _decodeExtra(self):
508517
except struct.error:
509518
raise BadZipFile(f"Corrupt zip64 extra field. "
510519
f"{field} not found.") from None
520+
elif tp == 0x7075:
521+
data = extra[4:ln+4]
522+
# Unicode Path Extra Field
523+
try:
524+
up_version, up_name_crc = unpack('<BL', data[:5])
525+
if up_version == 1 and up_name_crc == filename_crc:
526+
up_unicode_name = data[5:].decode('utf-8')
527+
if up_unicode_name:
528+
self.filename = _sanitize_filename(up_unicode_name)
529+
else:
530+
warnings.warn("Empty unicode path extra field (0x7075)", stacklevel=2)
531+
except struct.error as e:
532+
raise BadZipFile("Corrupt unicode path extra field (0x7075)") from e
533+
except UnicodeDecodeError as e:
534+
raise BadZipFile('Corrupt unicode path extra field (0x7075): invalid utf-8 bytes') from e
511535

512536
extra = extra[ln+4:]
513537

@@ -1409,6 +1433,7 @@ def _RealGetContents(self):
14091433
if self.debug > 2:
14101434
print(centdir)
14111435
filename = fp.read(centdir[_CD_FILENAME_LENGTH])
1436+
orig_filename_crc = crc32(filename)
14121437
flags = centdir[_CD_FLAG_BITS]
14131438
if flags & _MASK_UTF_FILENAME:
14141439
# UTF-8 file names extension
@@ -1432,8 +1457,7 @@ def _RealGetContents(self):
14321457
x._raw_time = t
14331458
x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
14341459
t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
1435-
1436-
x._decodeExtra()
1460+
x._decodeExtra(orig_filename_crc)
14371461
x.header_offset = x.header_offset + concat
14381462
self.filelist.append(x)
14391463
self.NameToInfo[x.filename] = x

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,7 @@ Julian Gindi
627627
Yannick Gingras
628628
Neil Girdhar
629629
Matt Giuca
630+
Andrea Giudiceandrea
630631
Franz Glasner
631632
Wim Glenn
632633
Michael Goderbauer
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Add support for Unicode Path Extra Field in ZipFile. Patch by Yeojin Kim
2+
and Andrea Giudiceandrea.

0 commit comments

Comments
 (0)