Skip to content

bpo-32248 - Implement ResourceReader and get_resource_reader() for zipimport #5248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Jan 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c1d7720
Checkpointing
warsaw Jan 5, 2018
d645574
Implement get_resource_reader() API
warsaw Jan 5, 2018
fce8017
Checkpointing
warsaw Jan 8, 2018
ef4f773
Merge branch 'master' into resource-reader
warsaw Jan 11, 2018
6d347b8
Merge branch 'master' into resource-reader
warsaw Jan 12, 2018
29cc5f7
Revert changes to zipimport; we'll do those separately
warsaw Jan 12, 2018
bec7821
Change the sense to a positive test
warsaw Jan 12, 2018
d5d2446
Merge branch 'master' into resource-reader
warsaw Jan 13, 2018
547f078
ResourceReader needs to be an ABC.
warsaw Jan 13, 2018
c165144
Merge branch 'master' into resource-reader
warsaw Jan 15, 2018
3ea98cf
WIP: restore zip importer work
warsaw Jan 15, 2018
5ca6bc5
Merge branch 'master' into zipresources
warsaw Jan 16, 2018
9f2a284
Merge branch 'master' into zipresources
warsaw Jan 17, 2018
a70f4e4
Merge branch 'master' into zipresources
warsaw Jan 19, 2018
2459adb
Hook in ZipImporter.get_resource_reader()
warsaw Jan 19, 2018
a40d6e7
Flesh out the trampoline
warsaw Jan 19, 2018
d1af1f4
open_resource(), resource_path(), is_resource()
warsaw Jan 19, 2018
ba359e1
Complete the implementation for contents()
warsaw Jan 20, 2018
0d4ea2a
Merge branch 'master' into zipresources
warsaw Jan 20, 2018
3101ef8
Add news
warsaw Jan 20, 2018
f97bd09
Merge branch 'master' into zipresources
warsaw Jan 20, 2018
b34d15f
Merge branch 'master' into zipresources
warsaw Jan 24, 2018
e5e3879
Shorten the NEWS file entry and expand a bit on the What's New entry.
warsaw Jan 24, 2018
e2ea845
PyObject_CallMethod() simplifies things
warsaw Jan 24, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Doc/whatsnew/3.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,8 +328,12 @@ importlib.resources
This module provides several new APIs and one new ABC for access to, opening,
and reading *resources* inside packages. Resources are roughly akin to files
inside of packages, but they needn't be actual files on the physical file
system. Module loaders can provide :class:`importlib.abc.ResourceReader`
implementations to support this new module's API.
system. Module loaders can provide a :meth:`get_resource_reader()` function
which returns a :class:`importlib.abc.ResourceReader` instance to support this
new API. Built-in file path loaders and zip file loaders both support this.
(see the PyPI package
`importlib_resources <http://importlib-resources.readthedocs.io/en/latest/>`_
as a compatible back port for older Python versions).


Improved Modules
Expand Down
150 changes: 83 additions & 67 deletions Lib/importlib/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Iterator, Optional, Set, Union # noqa: F401
from typing import cast
from typing.io import BinaryIO, TextIO
from zipfile import ZipFile
from zipimport import ZipImportError


Package = Union[str, ModuleType]
Expand Down Expand Up @@ -216,38 +216,7 @@ def is_resource(package: Package, name: str) -> bool:
# contents doesn't necessarily mean it's a resource. Directories are not
# resources, so let's try to find out if it's a directory or not.
path = Path(package.__spec__.origin).parent / name
if path.is_file():
return True
if path.is_dir():
return False
# If it's not a file and it's not a directory, what is it? Well, this
# means the file doesn't exist on the file system, so it probably lives
# inside a zip file. We have to crack open the zip, look at its table of
# contents, and make sure that this entry doesn't have sub-entries.
archive_path = package.__spec__.loader.archive # type: ignore
package_directory = Path(package.__spec__.origin).parent
with ZipFile(archive_path) as zf:
toc = zf.namelist()
relpath = package_directory.relative_to(archive_path)
candidate_path = relpath / name
for entry in toc:
try:
relative_to_candidate = Path(entry).relative_to(candidate_path)
except ValueError:
# The two paths aren't relative to each other so we can ignore it.
continue
# Since directories aren't explicitly listed in the zip file, we must
# infer their 'directory-ness' by looking at the number of path
# components in the path relative to the package resource we're
# looking up. If there are zero additional parts, it's a file, i.e. a
# resource. If there are more than zero it's a directory, i.e. not a
# resource. It has to be one of these two cases.
return len(relative_to_candidate.parts) == 0
# I think it's impossible to get here. It would mean that we are looking
# for a resource in a zip file, there's an entry matching it in the return
# value of contents(), but we never actually found it in the zip's table of
# contents.
raise AssertionError('Impossible situation')
return path.is_file()


def contents(package: Package) -> Iterator[str]:
Expand All @@ -268,38 +237,85 @@ def contents(package: Package) -> Iterator[str]:
not package.__spec__.has_location):
return []
package_directory = Path(package.__spec__.origin).parent
try:
yield from os.listdir(str(package_directory))
except (NotADirectoryError, FileNotFoundError):
# The package is probably in a zip file.
archive_path = getattr(package.__spec__.loader, 'archive', None)
if archive_path is None:
raise
relpath = package_directory.relative_to(archive_path)
with ZipFile(archive_path) as zf:
toc = zf.namelist()
subdirs_seen = set() # type: Set
for filename in toc:
path = Path(filename)
# Strip off any path component parts that are in common with the
# package directory, relative to the zip archive's file system
# path. This gives us all the parts that live under the named
# package inside the zip file. If the length of these subparts is
# exactly 1, then it is situated inside the package. The resulting
# length will be 0 if it's above the package, and it will be
# greater than 1 if it lives in a subdirectory of the package
# directory.
#
# However, since directories themselves don't appear in the zip
# archive as a separate entry, we need to return the first path
# component for any case that has > 1 subparts -- but only once!
if path.parts[:len(relpath.parts)] != relpath.parts:
yield from os.listdir(str(package_directory))


# Private implementation of ResourceReader and get_resource_reader() for
# zipimport. Don't use these directly! We're implementing these in Python
# because 1) it's easier, 2) zipimport will likely get rewritten in Python
# itself at some point, so doing this all in C would just be a waste of
# effort.

class _ZipImportResourceReader(resources_abc.ResourceReader):
"""Private class used to support ZipImport.get_resource_reader().

This class is allowed to reference all the innards and private parts of
the zipimporter.
"""

def __init__(self, zipimporter, fullname):
self.zipimporter = zipimporter
self.fullname = fullname

def open_resource(self, resource):
path = f'{self.fullname}/{resource}'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does zip importer require the forward slash? Or does it expect os.sep? (You're using pathlib below on what looks like it's probably the same style of path, but without digging further I'm not 100% sure.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% sure either, but AFAICT, the paths inside the zip all have forward slashes.

I suppose that could pose a problem with using pathlib on Windows, but given Appveyor passing, I think we can trust that it isn't a problem in practice, right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is pathlib.PurePath.as_posix() to guarantee the forward slashes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't os.fspath() be called on resource? And is there any worry of people accidentally passing in a subdirectory since there is no argument check?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to do those things since people will be accessing these through the higher level API (I.e. importlib.resources). I think the validation done at that level means we can trust what gets passed to us. I also don't think we need to force forward slashes in the argument passed to self.zipimporter.get_data() since we're using forward slash in the f-string and zips only support forward slashes in their internal file names.

try:
return BytesIO(self.zipimporter.get_data(path))
except OSError:
raise FileNotFoundError

def resource_path(self, resource):
# All resources are in the zip file, so there is no path to the file.
# Raising FileNotFoundError tells the higher level API to extract the
# binary data and create a temporary file.
raise FileNotFoundError

def is_resource(self, name):
# Maybe we could do better, but if we can get the data, it's a
# resource. Otherwise it isn't.
path = f'{self.fullname}/{name}'
try:
self.zipimporter.get_data(path)
except OSError:
return False
return True

def contents(self):
# This is a bit convoluted, because fullname will be a module path,
# but _files is a list of file names relative to the top of the
# archive's namespace. We want to compare file paths to find all the
# names of things inside the module represented by fullname. So we
# turn the module path of fullname into a file path relative to the
# top of the archive, and then we iterate through _files looking for
# names inside that "directory".
fullname_path = Path(self.zipimporter.get_filename(self.fullname))
relative_path = fullname_path.relative_to(self.zipimporter.archive)
# Don't forget that fullname names a package, so its path will include
# __init__.py, which we want to ignore.
assert relative_path.name == '__init__.py'
package_path = relative_path.parent
subdirs_seen = set()
for filename in self.zipimporter._files:
try:
relative = Path(filename).relative_to(package_path)
except ValueError:
continue
subparts = path.parts[len(relpath.parts):]
if len(subparts) == 1:
yield subparts[0]
elif len(subparts) > 1:
subdir = subparts[0]
if subdir not in subdirs_seen:
subdirs_seen.add(subdir)
yield subdir
# If the path of the file (which is relative to the top of the zip
# namespace), relative to the package given when the resource
# reader was created, has a parent, then it's a name in a
# subdirectory and thus we skip it.
parent_name = relative.parent.name
if len(parent_name) == 0:
yield relative.name
elif parent_name not in subdirs_seen:
subdirs_seen.add(parent_name)
yield parent_name


def _zipimport_get_resource_reader(zipimporter, fullname):
try:
if not zipimporter.is_package(fullname):
return None
except ZipImportError:
return None
return _ZipImportResourceReader(zipimporter, fullname)
19 changes: 6 additions & 13 deletions Misc/NEWS.d/next/Library/2017-12-15-15-34-12.bpo-32248.zmO8G2.rst
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
Add :class:`importlib.abc.ResourceReader` as an ABC to provide a
unified API for reading resources contained within packages. Loaders
wishing to support resource reading are expected to implement the
``get_resource_reader(fullname)`` method.

Also add :mod:`importlib.resources` as the stdlib port of the
``importlib_resources`` PyPI package. The modules provides a high-level
API for end-users to read resources in a nicer fashion than having to
directly interact with low-level details such as loaders.

Thanks to this work, :class:`importlib.abc.ResourceLoader` has now
been documented as deprecated due to its under-specified nature and
lack of features as provided by :class:`importlib.abc.ResourceReader`.
Add :mod:`importlib.resources` and :class:`importlib.abc.ResourceReader` as
the unified API for reading resources contained within packages. Loaders
wishing to support resource reading must implement the
:meth:`get_resource_reader()` method. File-based and zipimport-based loaders
both implement these APIs. :class:`importlib.abc.ResourceLoader` is
deprecated in favor of these new APIs.
33 changes: 32 additions & 1 deletion Modules/clinic/zipimport.c.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,4 +291,35 @@ zipimport_zipimporter_get_source(ZipImporter *self, PyObject *arg)
exit:
return return_value;
}
/*[clinic end generated code: output=93cb62a3a9752b9f input=a9049054013a1b77]*/

PyDoc_STRVAR(zipimport_zipimporter_get_resource_reader__doc__,
"get_resource_reader($self, fullname, /)\n"
"--\n"
"\n"
"Return the ResourceReader for a package in a zip file.\n"
"\n"
"If \'fullname\' is a package within the zip file, return the \'ResourceReader\'\n"
"object for the package. Otherwise return None.");

#define ZIPIMPORT_ZIPIMPORTER_GET_RESOURCE_READER_METHODDEF \
{"get_resource_reader", (PyCFunction)zipimport_zipimporter_get_resource_reader, METH_O, zipimport_zipimporter_get_resource_reader__doc__},

static PyObject *
zipimport_zipimporter_get_resource_reader_impl(ZipImporter *self,
PyObject *fullname);

static PyObject *
zipimport_zipimporter_get_resource_reader(ZipImporter *self, PyObject *arg)
{
PyObject *return_value = NULL;
PyObject *fullname;

if (!PyArg_Parse(arg, "U:get_resource_reader", &fullname)) {
goto exit;
}
return_value = zipimport_zipimporter_get_resource_reader_impl(self, fullname);

exit:
return return_value;
}
/*[clinic end generated code: output=0b57adfe21373512 input=a9049054013a1b77]*/
30 changes: 30 additions & 0 deletions Modules/zipimport.c
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,35 @@ zipimport_zipimporter_get_source_impl(ZipImporter *self, PyObject *fullname)
Py_RETURN_NONE;
}

/*[clinic input]
zipimport.zipimporter.get_resource_reader

fullname: unicode
/

Return the ResourceReader for a package in a zip file.

If 'fullname' is a package within the zip file, return the 'ResourceReader'
object for the package. Otherwise return None.

[clinic start generated code]*/

static PyObject *
zipimport_zipimporter_get_resource_reader_impl(ZipImporter *self,
PyObject *fullname)
/*[clinic end generated code: output=5e367d431f830726 input=bfab94d736e99151]*/
{
PyObject *module = PyImport_ImportModule("importlib.resources");
if (module == NULL) {
return NULL;
}
PyObject *retval = PyObject_CallMethod(
module, "_zipimport_get_resource_reader",
"OO", (PyObject *)self, fullname);
Py_DECREF(module);
return retval;
}


static PyMethodDef zipimporter_methods[] = {
ZIPIMPORT_ZIPIMPORTER_FIND_MODULE_METHODDEF
Expand All @@ -794,6 +823,7 @@ static PyMethodDef zipimporter_methods[] = {
ZIPIMPORT_ZIPIMPORTER_GET_DATA_METHODDEF
ZIPIMPORT_ZIPIMPORTER_GET_CODE_METHODDEF
ZIPIMPORT_ZIPIMPORTER_GET_SOURCE_METHODDEF
ZIPIMPORT_ZIPIMPORTER_GET_RESOURCE_READER_METHODDEF
{NULL, NULL} /* sentinel */
};

Expand Down