python · giampaolo · Jun 12, 2018 · May 22, 2018 · May 22, 2018 · May 22, 2018
diff --git a/Doc/library/shutil.rst b/Doc/library/shutil.rst
@@ -51,7 +51,9 @@ Directory and files operations
 .. function:: copyfile(src, dst, *, follow_symlinks=True)
 
    Copy the contents (no metadata) of the file named *src* to a file named
-   *dst* and return *dst*.  *src* and *dst* are path names given as strings.
+   *dst* in the most efficient way possible, and return *dst*.
+   *src* and *dst* are path names given as strings.
+
    *dst* must be the complete target file name; look at :func:`shutil.copy`
    for a copy that accepts a target directory path.  If *src* and *dst*
    specify the same file, :exc:`SameFileError` is raised.
@@ -74,6 +76,10 @@ Directory and files operations
       Raise :exc:`SameFileError` instead of :exc:`Error`.  Since the former is
       a subclass of the latter, this change is backward compatible.
 
+   .. versionchanged:: 3.8
+      Platform-specific fast-copy syscalls may be used internally in order to
+      copy the file more efficiently. See
+      :ref:`shutil-platform-dependent-efficient-copy-operations` section.
 
 .. exception:: SameFileError
 
@@ -163,6 +169,11 @@ Directory and files operations
       Added *follow_symlinks* argument.
       Now returns path to the newly created file.
 
+   .. versionchanged:: 3.8
+      Platform-specific fast-copy syscalls may be used internally in order to
+      copy the file more efficiently. See
+      :ref:`shutil-platform-dependent-efficient-copy-operations` section.
+
 .. function:: copy2(src, dst, *, follow_symlinks=True)
 
    Identical to :func:`~shutil.copy` except that :func:`copy2`
@@ -185,6 +196,11 @@ Directory and files operations
       file system attributes too (currently Linux only).
       Now returns path to the newly created file.
 
+   .. versionchanged:: 3.8
+      Platform-specific fast-copy syscalls may be used internally in order to
+      copy the file more efficiently. See
+      :ref:`shutil-platform-dependent-efficient-copy-operations` section.
+
 .. function:: ignore_patterns(\*patterns)
 
    This factory function creates a function that can be used as a callable for
@@ -241,6 +257,10 @@ Directory and files operations
       Added the *ignore_dangling_symlinks* argument to silent dangling symlinks
       errors when *symlinks* is false.
 
+   .. versionchanged:: 3.8
+      Platform-specific fast-copy syscalls may be used internally in order to
+      copy the file more efficiently. See
+      :ref:`shutil-platform-dependent-efficient-copy-operations` section.
 
 .. function:: rmtree(path, ignore_errors=False, onerror=None)
 
@@ -314,6 +334,11 @@ Directory and files operations
    .. versionchanged:: 3.5
       Added the *copy_function* keyword argument.
 
+   .. versionchanged:: 3.8
+      Platform-specific fast-copy syscalls may be used internally in order to
+      copy the file more efficiently. See
+      :ref:`shutil-platform-dependent-efficient-copy-operations` section.
+
 .. function:: disk_usage(path)
 
    Return disk usage statistics about the given path as a :term:`named tuple`
@@ -370,6 +395,28 @@ Directory and files operations
    operation. For :func:`copytree`, the exception argument is a list of 3-tuples
    (*srcname*, *dstname*, *exception*).
 
+.. _shutil-platform-dependent-efficient-copy-operations:
+
+Platform-dependent efficient copy operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Starting from Python 3.8, all functions involving a file copy (:func:`copyfile`,
+:func:`copy`, :func:`copy2`, :func:`copytree`, and :func:`move`) may use
+platform-specific "fast-copy" syscalls in order to copy the file more
+efficiently (see :issue:`33671`).
+"fast-copy" means that the copying operation occurs within the kernel, avoiding
+the use of userspace buffers in Python as in "``outfd.write(infd.read())``".
+
+On OSX `fcopyfile`_ is used to copy the file content (not metadata).
+
+On Linux, Solaris and other POSIX platforms where :func:`os.sendfile` supports
+copies between two regular file descriptors, :func:`os.sendfile` is used.
+
+If the fast-copy operation fails and no data was written in the destination
+file then shutil will silently fall back on using the less efficient
+:func:`copyfileobj` function internally.
+
+.. versionchanged:: 3.8
 
 .. _shutil-copytree-example:
 
@@ -654,6 +701,8 @@ Querying the size of the output terminal
 
    .. versionadded:: 3.3
 
+.. _`fcopyfile`:
+   http://www.manpagez.com/man/3/copyfile/
+
 .. _`Other Environment Variables`:
    http://pubs.opengroup.org/onlinepubs/7908799/xbd/envvar.html#tag_002_003
-
diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
@@ -90,10 +90,27 @@ New Modules
 Improved Modules
 ================
 
-
 Optimizations
 =============
 
+* :func:`shutil.copyfile`, :func:`shutil.copy`, :func:`shutil.copy2`,
+  :func:`shutil.copytree` and :func:`shutil.move` use platform-specific
+  "fast-copy" syscalls on Linux, OSX and Solaris in order to copy the file more
+  efficiently.
+  "fast-copy" means that the copying operation occurs within the kernel,
+  avoiding the use of userspace buffers in Python as in
+  "``outfd.write(infd.read())``".
+  All other platforms not using such a technique will rely on a faster
+  :func:`shutil.copyfile` implementation using :func:`memoryview`,
+  :class:`bytearray` and
+  :meth:`BufferedIOBase.readinto() <io.BufferedIOBase.readinto>`.
+  Finally, :func:`shutil.copyfile` default buffer size on Windows was increased
+  from 16KB to 1MB.
+  The speedup for copying a 512MB file within the same partition is about +26%
+  on Linux, +50% on OSX and +38% on Windows. Also, far fewer CPU cycles are
+  consumed.
+  (Contributed by Giampaolo Rodola' in :issue:`33671`.)
+
 * The default protocol in the :mod:`pickle` module is now Protocol 4,
   first introduced in Python 3.4.  It offers better performance and smaller
   size compared to Protocol 3 available since Python 3.0.

diff --git a/Lib/shutil.py b/Lib/shutil.py
@@ -10,6 +10,7 @@
 import fnmatch
 import collections
 import errno
+import io
 
 try:
     import zlib
@@ -42,6 +43,16 @@
 except ImportError:
     getgrnam = None
 
+# Low-level platform modules.  Both names start out as None and only the
+# one matching os.name is replaced by the real module, so each can be used
+# in a truth test below.
+posix = nt = None
+if os.name == 'posix':
+    import posix
+elif os.name == 'nt':
+    import nt
+
+# Default chunk size for userspace copies: 1MB on Windows, 16KB elsewhere.
+COPY_BUFSIZE = 1024 * 1024 if os.name == 'nt' else 16 * 1024
+# Truthy only on POSIX and only when the corresponding fast-copy primitive
+# is exposed (os.sendfile / posix._fcopyfile); falsy (None or False) otherwise.
+_HAS_SENDFILE = posix and hasattr(os, "sendfile")
+_HAS_FCOPYFILE = posix and hasattr(posix, "_fcopyfile")  # OSX
+
 __all__ = ["copyfileobj", "copyfile", "copymode", "copystat", "copy", "copy2",
            "copytree", "move", "rmtree", "Error", "SpecialFileError",
            "ExecError", "make_archive", "get_archive_formats",
@@ -72,14 +83,124 @@ class RegistryError(Exception):
     """Raised when a registry operation with the archiving
     and unpacking registries fails"""
 
+class _GiveupOnFastCopy(Exception):
+    """Raised as a signal to fall back on using raw read()/write()
+    file copy when fast-copy functions fail to do so.
+    """
+
+def _fastcopy_osx(fsrc, fdst, flags):
+    """Copy a regular file content or metadata by using high-performance
+    fcopyfile(3) syscall (OSX).
+
+    *fsrc* and *fdst* are open file objects and *flags* is handed to
+    posix._fcopyfile() unchanged.  _GiveupOnFastCopy is raised when
+    either object has no usable file descriptor or when the call fails
+    with EINVAL/ENOTSUP; any other OSError is re-raised with its
+    filename/filename2 attributes set to the two file names.
+    """
+    try:
+        src_fd, dst_fd = fsrc.fileno(), fdst.fileno()
+    except Exception as exc:
+        raise _GiveupOnFastCopy(exc)  # not a regular file
+
+    try:
+        posix._fcopyfile(src_fd, dst_fd, flags)
+    except OSError as exc:
+        # Attach both paths so that the error is more informative.
+        exc.filename = fsrc.name
+        exc.filename2 = fdst.name
+        if exc.errno in {errno.EINVAL, errno.ENOTSUP}:
+            raise _GiveupOnFastCopy(exc)
+        raise exc from None
+
+def _fastcopy_sendfile(fsrc, fdst):
+    """Copy data from one regular mmap-like fd to another by using
+    high-performance sendfile(2) syscall.
+    This should work on Linux >= 2.6.33 and Solaris only.
+
+    *fsrc* and *fdst* are open file objects.  _GiveupOnFastCopy is
+    raised when a file descriptor cannot be obtained, when sendfile()
+    reports ENOTSOCK, or when the very first call fails before anything
+    was written to *fdst*, so that the caller can fall back on a plain
+    read()/write() copy.  Any other OSError is re-raised with its
+    filename/filename2 attributes set to the two file names.
+    """
+    # Note: copyfileobj() is left alone in order to not introduce any
+    # unexpected breakage. Possible risks by using zero-copy calls
+    # in copyfileobj() are:
+    # - fdst cannot be open in "a"(ppend) mode
+    # - fsrc and fdst may be open in "t"(ext) mode
+    # - fsrc may be a BufferedReader (which hides unread data in a buffer),
+    #   GzipFile (which decompresses data), HTTPResponse (which decodes
+    #   chunks).
+    # - possibly others (e.g. encrypted fs/partition?)
+    global _HAS_SENDFILE
+    try:
+        infd = fsrc.fileno()
+        outfd = fdst.fileno()
+    except Exception as err:
+        raise _GiveupOnFastCopy(err)  # not a regular file
+
+    # Hopefully the whole file will be copied in a single call.
+    # sendfile() is called in a loop until EOF is reached (0 return)
+    # so a bufsize smaller or bigger than the actual file size
+    # should not make any difference, also in case the file content
+    # changes while being copied.
+    try:
+        blocksize = max(os.fstat(infd).st_size, 2 ** 23)  # min 8MB
+    except Exception:
+        blocksize = 2 ** 27  # 128MB
+    # Cap the per-call count at 1GB: a bigger value does not fit in a C
+    # ssize_t on 32-bit builds (OverflowError for files > 2GB) and, thanks
+    # to the loop below, capping it does not change the final result.
+    blocksize = min(blocksize, 2 ** 30)
+
+    offset = 0
+    while True:
+        try:
+            sent = os.sendfile(outfd, infd, offset, blocksize)
+        except OSError as err:
+            # ...in order to have a more informative exception.
+            err.filename = fsrc.name
+            err.filename2 = fdst.name
+
+            if err.errno == errno.ENOTSOCK:
+                # sendfile() on this platform (probably Linux < 2.6.33)
+                # does not support copies between regular files (only
+                # sockets).
+                _HAS_SENDFILE = False
+                raise _GiveupOnFastCopy(err)
+
+            if err.errno == errno.ENOSPC:  # filesystem is full
+                raise err from None
+
+            # Give up on first call and if no data was copied.
+            if offset == 0 and os.lseek(outfd, 0, os.SEEK_CUR) == 0:
+                raise _GiveupOnFastCopy(err)
+
+            raise err
+        else:
+            if sent == 0:
+                break  # EOF
+            offset += sent
+
+def _copybinfileobj(fsrc, fdst, length=COPY_BUFSIZE):
+    """Copy 2 regular file objects open in binary mode.
+
+    Data is read with fsrc.readinto() into a single reusable buffer of
+    *length* bytes and written with fdst.write() until readinto()
+    returns a false value.
+    """
+    # Bind the bound methods once to keep the loop body cheap.
+    read_chunk = fsrc.readinto
+    write_chunk = fdst.write
+    with memoryview(bytearray(length)) as buf:
+        while True:
+            nread = read_chunk(buf)
+            if not nread:
+                break
+            # A short read only fills the head of the buffer.
+            write_chunk(buf[:nread] if nread < length else buf)
+
+def _is_binary_files_pair(fsrc, fdst):
+    """Return True if *fsrc* and *fdst* can be copied with readinto().
+
+    That requires *fsrc* to provide readinto() and both objects to be
+    binary: either an io.BytesIO instance or an object whose string
+    "mode" attribute contains 'b'.
+    """
+    def _is_binary(fobj):
+        if isinstance(fobj, io.BytesIO):
+            return True
+        mode = getattr(fobj, 'mode', '')
+        # "mode" is not guaranteed to be a string on every file-like
+        # object (it may be an int); a non-str mode is not binary here.
+        return isinstance(mode, str) and 'b' in mode
+
+    # Every condition must hold; chaining and/or without parentheses
+    # would let a binary fdst alone make the whole test succeed.
+    return (hasattr(fsrc, 'readinto')
+            and _is_binary(fsrc)
+            and _is_binary(fdst))
 
-def copyfileobj(fsrc, fdst, length=16*1024):
+def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
     """copy data from file-like object fsrc to file-like object fdst"""
-    while 1:
-        buf = fsrc.read(length)
-        if not buf:
-            break
-        fdst.write(buf)
+    # The readinto() path needs a fixed, positive buffer size.  A negative
+    # (or None) length means "read the whole source in one go", which
+    # bytearray(length) cannot express (it raises), and a length of 0
+    # copies nothing; all of those keep using plain read()/write().
+    if length and length > 0 and _is_binary_files_pair(fsrc, fdst):
+        _copybinfileobj(fsrc, fdst, length=length)
+    else:
+        # Localize variable access to minimize overhead.
+        fsrc_read = fsrc.read
+        fdst_write = fdst.write
+        while 1:
+            buf = fsrc_read(length)
+            if not buf:
+                break
+            fdst_write(buf)
 
 def _samefile(src, dst):
     # Macintosh, Unix.
@@ -117,9 +238,23 @@ def copyfile(src, dst, *, follow_symlinks=True):
     if not follow_symlinks and os.path.islink(src):
         os.symlink(os.readlink(src), dst)
     else:
-        with open(src, 'rb') as fsrc:
-            with open(dst, 'wb') as fdst:
-                copyfileobj(fsrc, fdst)
+        with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst:
+            if _HAS_SENDFILE:
+                try:
+                    _fastcopy_sendfile(fsrc, fdst)
+                    return dst
+                except _GiveupOnFastCopy:
+                    pass
+
+            if _HAS_FCOPYFILE:
+                try:
+                    _fastcopy_osx(fsrc, fdst, posix._COPYFILE_DATA)
+                    return dst
+                except _GiveupOnFastCopy:
+                    pass
+
+            _copybinfileobj(fsrc, fdst)
+
     return dst
 
 def copymode(src, dst, *, follow_symlinks=True):
@@ -244,13 +379,12 @@ def copy(src, dst, *, follow_symlinks=True):
 
 def copy2(src, dst, *, follow_symlinks=True):
     """Copy data and all stat info ("cp -p src dst"). Return the file's
-    destination."
+    destination.
 
     The destination may be a directory.
 
     If follow_symlinks is false, symlinks won't be followed. This
     resembles GNU's "cp -P src dst".
-
     """
     if os.path.isdir(dst):
         dst = os.path.join(dst, os.path.basename(src))
@@ -1015,7 +1149,6 @@ def disk_usage(path):
 
 elif os.name == 'nt':
 
-    import nt
     __all__.append('disk_usage')
     _ntuple_diskusage = collections.namedtuple('usage', 'total used free')