Skip to content

GH-113528: Deoptimise pathlib._abc.PurePathBase #113559

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 103 additions & 1 deletion Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,20 @@ class PurePath(_abc.PurePathBase):
"""

__slots__ = (
# The `_drv`, `_root` and `_tail_cached` slots store parsed and
# normalized parts of the path. They are set when any of the `drive`,
# `root` or `_tail` properties are accessed for the first time. The
# three-part division corresponds to the result of
# `os.path.splitroot()`, except that the tail is further split on path
# separators (i.e. it is a list of strings), and that the root and
# tail are normalized.
'_drv', '_root', '_tail_cached',

# The `_str` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`
'_str',

# The `_str_normcase_cached` slot stores the string path with
# normalized case. It is set when the `_str_normcase` property is
# accessed for the first time. It's used to implement `__eq__()`
Expand Down Expand Up @@ -196,6 +210,94 @@ def __ge__(self, other):
return NotImplemented
return self._parts_normcase >= other._parts_normcase

def __str__(self):
"""Return the string representation of the path, suitable for
passing to system calls."""
try:
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root,
self._tail) or '.'
return self._str

@classmethod
def _format_parsed_parts(cls, drv, root, tail):
if drv or root:
return drv + root + cls.pathmod.sep.join(tail)
elif tail and cls.pathmod.splitdrive(tail[0])[0]:
tail = ['.'] + tail
return cls.pathmod.sep.join(tail)

def _from_parsed_parts(self, drv, root, tail):
path_str = self._format_parsed_parts(drv, root, tail)
path = self.with_segments(path_str)
path._str = path_str or '.'
path._drv = drv
path._root = root
path._tail_cached = tail
return path

@classmethod
def _parse_path(cls, path):
if not path:
return '', '', []
sep = cls.pathmod.sep
altsep = cls.pathmod.altsep
if altsep:
path = path.replace(altsep, sep)
drv, root, rel = cls.pathmod.splitroot(path)
if not root and drv.startswith(sep) and not drv.endswith(sep):
drv_parts = drv.split(sep)
if len(drv_parts) == 4 and drv_parts[2] not in '?.':
# e.g. //server/share
root = sep
elif len(drv_parts) == 6:
# e.g. //?/unc/server/share
root = sep
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
return drv, root, parsed

def _load_parts(self):
paths = self._raw_paths
if len(paths) == 0:
path = ''
elif len(paths) == 1:
path = paths[0]
else:
path = self.pathmod.join(*paths)
self._drv, self._root, self._tail_cached = self._parse_path(path)

@property
def drive(self):
"""The drive prefix (letter or UNC path), if any."""
try:
return self._drv
except AttributeError:
self._load_parts()
return self._drv

@property
def root(self):
"""The root of the path, if any."""
try:
return self._root
except AttributeError:
self._load_parts()
return self._root

@property
def _tail(self):
try:
return self._tail_cached
except AttributeError:
self._load_parts()
return self._tail_cached

@property
def anchor(self):
"""The concatenation of the drive and root, or ''."""
return self.drive + self.root

@property
def parts(self):
"""An object providing sequence-like access to the
Expand Down Expand Up @@ -416,7 +518,7 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)

def _make_child_entry(self, entry):
def _make_child_entry(self, entry, is_dir=False):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
Expand Down
142 changes: 29 additions & 113 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import functools
import ntpath
import posixpath
import sys
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO

Expand Down Expand Up @@ -82,7 +81,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
if match(entry.name):
yield parent_path._make_child_entry(entry)
yield parent_path._make_child_entry(entry, dir_only)


def _select_recursive(parent_paths, dir_only, follow_symlinks):
Expand All @@ -105,7 +104,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
for entry in entries:
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
paths.append(path._make_child_entry(entry))
paths.append(path._make_child_entry(entry, dir_only))
continue
except OSError:
pass
Expand Down Expand Up @@ -147,20 +146,6 @@ class PurePathBase:
# in the `__init__()` method.
'_raw_paths',

# The `_drv`, `_root` and `_tail_cached` slots store parsed and
# normalized parts of the path. They are set when any of the `drive`,
# `root` or `_tail` properties are accessed for the first time. The
# three-part division corresponds to the result of
# `os.path.splitroot()`, except that the tail is further split on path
# separators (i.e. it is a list of strings), and that the root and
# tail are normalized.
'_drv', '_root', '_tail_cached',

# The `_str` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`
'_str',

# The '_resolving' slot stores a boolean indicating whether the path
# is being processed by `PathBase.resolve()`. This prevents duplicate
# work from occurring when `resolve()` calls `stat()` or `readlink()`.
Expand All @@ -179,65 +164,16 @@ def with_segments(self, *pathsegments):
"""
return type(self)(*pathsegments)

@classmethod
def _parse_path(cls, path):
if not path:
return '', '', []
sep = cls.pathmod.sep
altsep = cls.pathmod.altsep
if altsep:
path = path.replace(altsep, sep)
drv, root, rel = cls.pathmod.splitroot(path)
if not root and drv.startswith(sep) and not drv.endswith(sep):
drv_parts = drv.split(sep)
if len(drv_parts) == 4 and drv_parts[2] not in '?.':
# e.g. //server/share
root = sep
elif len(drv_parts) == 6:
# e.g. //?/unc/server/share
root = sep
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
return drv, root, parsed

def _load_parts(self):
paths = self._raw_paths
if len(paths) == 0:
path = ''
elif len(paths) == 1:
path = paths[0]
else:
path = self.pathmod.join(*paths)
drv, root, tail = self._parse_path(path)
self._drv = drv
self._root = root
self._tail_cached = tail

def _from_parsed_parts(self, drv, root, tail):
path_str = self._format_parsed_parts(drv, root, tail)
path = self.with_segments(path_str)
path._str = path_str or '.'
path._drv = drv
path._root = root
path._tail_cached = tail
return path

@classmethod
def _format_parsed_parts(cls, drv, root, tail):
if drv or root:
return drv + root + cls.pathmod.sep.join(tail)
elif tail and cls.pathmod.splitdrive(tail[0])[0]:
tail = ['.'] + tail
return cls.pathmod.sep.join(tail)

def __str__(self):
"""Return the string representation of the path, suitable for
passing to system calls."""
try:
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root,
self._tail) or '.'
return self._str
paths = self._raw_paths
if len(paths) == 1:
return paths[0]
elif paths:
return self.pathmod.join(*paths)
else:
return ''

def as_posix(self):
"""Return the string representation of the path with forward (/)
Expand All @@ -247,42 +183,23 @@ def as_posix(self):
@property
def drive(self):
"""The drive prefix (letter or UNC path), if any."""
try:
return self._drv
except AttributeError:
self._load_parts()
return self._drv
return self.pathmod.splitdrive(str(self))[0]

@property
def root(self):
"""The root of the path, if any."""
try:
return self._root
except AttributeError:
self._load_parts()
return self._root

@property
def _tail(self):
try:
return self._tail_cached
except AttributeError:
self._load_parts()
return self._tail_cached
return self.pathmod.splitroot(str(self))[1]

@property
def anchor(self):
"""The concatenation of the drive and root, or ''."""
anchor = self.drive + self.root
return anchor
drive, root, _ = self.pathmod.splitroot(str(self))
return drive + root

@property
def name(self):
"""The final path component, if any."""
path_str = str(self)
if not path_str or path_str == '.':
return ''
return self.pathmod.basename(path_str)
return self.pathmod.basename(str(self))

@property
def suffix(self):
Expand Down Expand Up @@ -323,13 +240,10 @@ def stem(self):

def with_name(self, name):
"""Return a new path with the file name changed."""
m = self.pathmod
if not name or m.sep in name or (m.altsep and m.altsep in name) or name == '.':
dirname = self.pathmod.dirname
if dirname(name):
raise ValueError(f"Invalid name {name!r}")
parent, old_name = m.split(str(self))
if not old_name or old_name == '.':
raise ValueError(f"{self!r} has an empty name")
return self.with_segments(parent, name)
return self.with_segments(dirname(str(self)), name)

def with_stem(self, stem):
"""Return a new path with the stem changed."""
Expand Down Expand Up @@ -480,7 +394,7 @@ def is_absolute(self):
def is_reserved(self):
"""Return True if the path contains one of the special names reserved
by the system, if any."""
if self.pathmod is posixpath or not self._tail:
if self.pathmod is posixpath or not self.name:
return False

# NOTE: the rules for reserved names seem somewhat complicated
Expand All @@ -490,7 +404,7 @@ def is_reserved(self):
if self.drive.startswith('\\\\'):
# UNC paths are never reserved.
return False
name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
name = self.name.partition('.')[0].partition(':')[0].rstrip(' ')
return name.upper() in _WIN_RESERVED_NAMES

def match(self, path_pattern, *, case_sensitive=None):
Expand All @@ -503,9 +417,9 @@ def match(self, path_pattern, *, case_sensitive=None):
case_sensitive = _is_case_sensitive(self.pathmod)
sep = path_pattern.pathmod.sep
pattern_str = str(path_pattern)
if path_pattern.drive or path_pattern.root:
if path_pattern.anchor:
pass
elif path_pattern._tail:
elif path_pattern.parts:
pattern_str = f'**{sep}{pattern_str}'
else:
raise ValueError("empty pattern")
Expand Down Expand Up @@ -780,8 +694,10 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _make_child_entry(self, entry):
def _make_child_entry(self, entry, is_dir=False):
# Transform an entry yielded from _scandir() into a path object.
if is_dir:
return entry.joinpath('')
return entry

def _make_child_relpath(self, name):
Expand All @@ -792,13 +708,13 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
kind, including directories) matching the given relative pattern.
"""
path_pattern = self.with_segments(pattern)
if path_pattern.drive or path_pattern.root:
if path_pattern.anchor:
raise NotImplementedError("Non-relative patterns are unsupported")
elif not path_pattern._tail:
elif not path_pattern.parts:
raise ValueError("Unacceptable pattern: {!r}".format(pattern))

pattern_parts = path_pattern._tail.copy()
if pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
pattern_parts = list(path_pattern.parts)
if not self.pathmod.basename(pattern):
# GH-65238: pathlib doesn't preserve trailing slash. Add it back.
pattern_parts.append('')

Expand All @@ -816,7 +732,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
paths = iter([self.joinpath('')] if self.is_dir() else [])
part_idx = 0
while part_idx < len(pattern_parts):
part = pattern_parts[part_idx]
Expand Down
Loading