Commit 71d85d7

Fine-grained: Treat files changed only if md5 changes (#4505)
Previously the daemon could do a lot of extra work if you switched to another git branch and immediately back to the original branch, since many file timestamps would change. Now a changed timestamp alone isn't sufficient to consider a file as changed; its md5 hash must change as well.

Also add a new cached file system abstraction (currently only used in dmypy) to avoid redundant file system operations and to make file system state easier to reason about. The idea is to cache the output of file system operations during a single increment, for both consistency and performance. My plan is to eventually use it just about everywhere. This should also make it slightly easier to switch to listening to file system events instead of stat()ing everything, but that's not a priority yet.

Fixes #4499.
1 parent 7192a75 commit 71d85d7
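To make the new behavior concrete, here is a minimal sketch (not part of the commit) that drives the two classes added below directly. The path example.py is hypothetical; touching it updates its mtime, but because the md5 hash of its contents is unchanged the watcher no longer reports it as changed.

# Sketch only: exercises mypy.fscache / mypy.fswatcher as added in this commit.
# 'example.py' is a hypothetical path used purely for illustration.
import os
import sys

from mypy.fscache import FileSystemCache
from mypy.fswatcher import FileSystemWatcher

fscache = FileSystemCache(pyversion=sys.version_info[:2])
watcher = FileSystemWatcher(fscache)

watcher.add_watched_paths(['example.py'])
watcher.find_changed()                    # first call records the initial state

os.utime('example.py')                    # like a branch round-trip: new mtime, same contents
fscache.flush()                           # start a new transaction so fresh stat() results are seen
assert watcher.find_changed() == set()    # hash unchanged -> not reported as changed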

File tree

5 files changed (+240, -30 lines)


mypy/dmypy_server.py

Lines changed: 20 additions & 29 deletions
@@ -23,6 +23,8 @@
 import mypy.server.update
 from mypy.dmypy_util import STATUS_FILE, receive
 from mypy.gclogger import GcLogger
+from mypy.fscache import FileSystemCache
+from mypy.fswatcher import FileSystemWatcher


 def daemonize(func: Callable[[], None], log_file: Optional[str] = None) -> int:
@@ -243,14 +245,11 @@ def check_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str,
         return self.fine_grained_increment(sources)

     def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
-        self.file_modified = {}  # type: Dict[str, float]
-        for source in sources:
-            assert source.path
-            try:
-                self.file_modified[source.path] = os.stat(source.path).st_mtime
-            except FileNotFoundError:
-                # Don't crash if passed a non-existent file.
-                pass
+        self.fscache = FileSystemCache(self.options.python_version)
+        self.fswatcher = FileSystemWatcher(self.fscache)
+        self.update_sources(sources)
+        # Stores the initial state of sources as a side effect.
+        self.fswatcher.find_changed()
         try:
             # TODO: alt_lib_path
             result = mypy.build.build(sources=sources,
@@ -270,9 +269,11 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
         self.previous_messages = messages[:]
         self.fine_grained_initialized = True
         self.previous_sources = sources
+        self.fscache.flush()
         return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}

     def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
+        self.update_sources(sources)
         changed = self.find_changed(sources)
         if not changed:
             # Nothing changed -- just produce the same result as before.
@@ -282,36 +283,26 @@ def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[
         status = 1 if messages else 0
         self.previous_messages = messages[:]
         self.previous_sources = sources
+        self.fscache.flush()
         return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}

+    def update_sources(self, sources: List[mypy.build.BuildSource]) -> None:
+        paths = [source.path for source in sources if source.path is not None]
+        self.fswatcher.add_watched_paths(paths)
+
     def find_changed(self, sources: List[mypy.build.BuildSource]) -> List[Tuple[str, str]]:
-        changed = []
-        for source in sources:
-            path = source.path
-            assert path
-            try:
-                mtime = os.stat(path).st_mtime
-            except FileNotFoundError:
-                # A non-existent file was included on the command line.
-                #
-                # TODO: Generate error if file is missing (if not ignoring missing imports)
-                if path in self.file_modified:
-                    changed.append((source.module, path))
-            else:
-                if path not in self.file_modified or self.file_modified[path] != mtime:
-                    self.file_modified[path] = mtime
-                    changed.append((source.module, path))
+        changed_paths = self.fswatcher.find_changed()
+        changed = [(source.module, source.path)
+                   for source in sources
+                   if source.path in changed_paths]
         modules = {source.module for source in sources}
         omitted = [source for source in self.previous_sources if source.module not in modules]
         for source in omitted:
             path = source.path
             assert path
-            # Note that a file could be removed from the list of root sources but still continue
-            # to exist on the file system.
-            if not os.path.isfile(path):
+            # Note that a file could be removed from the list of root sources but have no changes.
+            if path in changed_paths:
                 changed.append((source.module, path))
-                if source.path in self.file_modified:
-                    del self.file_modified[source.path]
         return changed

     def cmd_hang(self) -> Dict[str, object]:

mypy/fscache.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+"""Interface for accessing the file system with automatic caching.
+
+The idea is to cache the results of any file system state reads during
+a single transaction. This has two main benefits:
+
+* This avoids redundant syscalls, as we won't perform the same OS
+  operations multiple times.
+
+* This makes it easier to reason about concurrent FS updates, as different
+  operations targeting the same paths can't report different state during
+  a transaction.
+
+Note that this only deals with reading state, not writing.
+
+Properties maintained by the API:
+
+* The contents of the file are always from the same or later time compared
+  to the reported mtime of the file, even if mtime is queried after reading
+  a file.
+
+* Repeating an operation produces the same result as the first one during
+  a transaction.
+
+* Call flush() to start a new transaction (flush the caches).
+
+The API is a bit limited. It's easy to add new cached operations, however.
+You should perform all file system reads through the API to actually take
+advantage of the benefits.
+"""
+
+import os
+import stat
+from typing import Tuple, Dict, List
+
+from mypy.build import read_with_python_encoding
+from mypy.errors import DecodeError
+
+
+class FileSystemCache:
+    def __init__(self, pyversion: Tuple[int, int]) -> None:
+        self.pyversion = pyversion
+        self.flush()
+
+    def flush(self) -> None:
+        """Start another transaction and empty all caches."""
+        self.stat_cache = {}  # type: Dict[str, os.stat_result]
+        self.stat_error_cache = {}  # type: Dict[str, Exception]
+        self.read_cache = {}  # type: Dict[str, str]
+        self.read_error_cache = {}  # type: Dict[str, Exception]
+        self.hash_cache = {}  # type: Dict[str, str]
+        self.listdir_cache = {}  # type: Dict[str, List[str]]
+        self.listdir_error_cache = {}  # type: Dict[str, Exception]
+
+    def read_with_python_encoding(self, path: str) -> str:
+        if path in self.read_cache:
+            return self.read_cache[path]
+        if path in self.read_error_cache:
+            raise self.read_error_cache[path]
+
+        # Need to stat first so that the contents of file are from no
+        # earlier instant than the mtime reported by self.stat().
+        self.stat(path)
+
+        try:
+            data, md5hash = read_with_python_encoding(path, self.pyversion)
+        except Exception as err:
+            self.read_error_cache[path] = err
+            raise
+        self.read_cache[path] = data
+        self.hash_cache[path] = md5hash
+        return data
+
+    def stat(self, path: str) -> os.stat_result:
+        if path in self.stat_cache:
+            return self.stat_cache[path]
+        if path in self.stat_error_cache:
+            raise self.stat_error_cache[path]
+        try:
+            st = os.stat(path)
+        except Exception as err:
+            self.stat_error_cache[path] = err
+            raise
+        self.stat_cache[path] = st
+        return st
+
+    def listdir(self, path: str) -> List[str]:
+        if path in self.listdir_cache:
+            return self.listdir_cache[path]
+        if path in self.listdir_error_cache:
+            raise self.listdir_error_cache[path]
+        try:
+            results = os.listdir(path)
+        except Exception as err:
+            self.listdir_error_cache[path] = err
+            raise err
+        self.listdir_cache[path] = results
+        return results
+
+    def isfile(self, path: str) -> bool:
+        st = self.stat(path)
+        return stat.S_ISREG(st.st_mode)
+
+    def isdir(self, path: str) -> bool:
+        st = self.stat(path)
+        return stat.S_ISDIR(st.st_mode)
+
+    def exists(self, path: str) -> bool:
+        try:
+            self.stat(path)
+        except FileNotFoundError:
+            return False
+        return True
+
+    def md5(self, path: str) -> str:
+        if path not in self.hash_cache:
+            self.read_with_python_encoding(path)
+        return self.hash_cache[path]
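As a rough usage sketch (not from the commit), the cache memoizes both successful results and errors during one transaction, so repeated queries within an increment always agree with each other; flush() empties the caches and starts the next transaction. The path below is hypothetical.

# Sketch only; 'demo.py' is a hypothetical existing file.
import sys

from mypy.fscache import FileSystemCache

fs = FileSystemCache(pyversion=sys.version_info[:2])

st = fs.stat('demo.py')                 # one os.stat() call; the result is cached
assert fs.stat('demo.py') is st         # same transaction -> same cached object
assert fs.isfile('demo.py')             # isfile()/isdir()/exists() reuse the cached stat
print(fs.md5('demo.py'))                # reads and hashes the file, caching text and hash

fs.flush()                              # next transaction: the OS is queried again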

mypy/fswatcher.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+"""Watch parts of the file system for changes."""
+
+from mypy.fscache import FileSystemCache
+from typing import NamedTuple, Set, AbstractSet, Iterable, Dict, Optional
+
+
+FileData = NamedTuple('FileData', [('st_mtime', float),
+                                   ('st_size', int),
+                                   ('md5', str)])
+
+
+class FileSystemWatcher:
+    """Watcher for file system changes among specific paths.
+
+    All file system access is performed using FileSystemCache. We
+    detect changed files by stat()ing them all and comparing md5 hashes
+    of potentially changed files. If a file has both size and mtime
+    unmodified, the file is assumed to be unchanged.
+
+    An important goal of this class is to make it easier to eventually
+    use file system events to detect file changes.
+
+    Note: This class doesn't flush the file system cache. If you don't
+    manually flush it, changes won't be seen.
+    """
+
+    # TODO: Watching directories?
+    # TODO: Handle non-files
+
+    def __init__(self, fs: FileSystemCache) -> None:
+        self.fs = fs
+        self._paths = set()  # type: Set[str]
+        self._file_data = {}  # type: Dict[str, Optional[FileData]]
+
+    @property
+    def paths(self) -> AbstractSet[str]:
+        return self._paths
+
+    def add_watched_paths(self, paths: Iterable[str]) -> None:
+        for path in paths:
+            if path not in self._paths:
+                # By storing None this path will get reported as changed by
+                # find_changed if it exists.
+                self._file_data[path] = None
+        self._paths |= set(paths)
+
+    def remove_watched_paths(self, paths: Iterable[str]) -> None:
+        for path in paths:
+            if path in self._file_data:
+                del self._file_data[path]
+        self._paths -= set(paths)
+
+    def _update(self, path: str) -> None:
+        st = self.fs.stat(path)
+        md5 = self.fs.md5(path)
+        self._file_data[path] = FileData(st.st_mtime, st.st_size, md5)
+
+    def find_changed(self) -> Set[str]:
+        """Return paths that have changes since the last call, in the watched set."""
+        changed = set()
+        for path in self._paths:
+            old = self._file_data[path]
+            try:
+                st = self.fs.stat(path)
+            except FileNotFoundError:
+                if old is not None:
+                    # File was deleted.
+                    changed.add(path)
+                    self._file_data[path] = None
+            else:
+                if old is None:
+                    # File is new.
+                    changed.add(path)
+                    self._update(path)
+                elif st.st_size != old.st_size or st.st_mtime != old.st_mtime:
+                    # Only look for changes if size or mtime has changed as an
+                    # optimization, since calculating md5 is expensive.
+                    new_md5 = self.fs.md5(path)
+                    if st.st_size != old.st_size or new_md5 != old.md5:
+                        # Changed file.
+                        changed.add(path)
+                        self._update(path)
+        return changed
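A small sketch (not part of the commit) of how deletions and re-creations would be reported for a hypothetical watched path. Note that, as the docstring says, the watcher never flushes the cache itself; the caller has to call flush() between find_changed() calls for new file system state to become visible.

# Sketch only; 'watched.py' is a hypothetical path that exists when we start.
import os
import sys

from mypy.fscache import FileSystemCache
from mypy.fswatcher import FileSystemWatcher

fs = FileSystemCache(pyversion=sys.version_info[:2])
watcher = FileSystemWatcher(fs)

watcher.add_watched_paths(['watched.py'])
assert watcher.find_changed() == {'watched.py'}   # a newly watched file is reported once

os.remove('watched.py')
fs.flush()                                        # without this the stale stat() stays cached
assert watcher.find_changed() == {'watched.py'}   # deletion is reported

with open('watched.py', 'w') as f:
    f.write('x = 1\n')
fs.flush()
assert watcher.find_changed() == {'watched.py'}   # the re-created file is reported again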

mypy/server/update.py

Lines changed: 2 additions & 0 deletions
@@ -114,6 +114,8 @@
 Major todo items:

 - Fully support multiple type checking passes
+- Use mypy.fscache to access file system
+- Don't use load_graph() and update the import graph incrementally
 """

 import os.path

test-data/unit/check-dmypy-fine-grained.test

Lines changed: 18 additions & 1 deletion
@@ -97,7 +97,7 @@ tmp/a.py:1: error: invalid syntax
 tmp/b.py:2: error: Incompatible return value type (got "str", expected "int")
 [out3]

-[case testNoOpUpdateFineGrainedIncremental]
+[case testNoOpUpdateFineGrainedIncremental1]
 # cmd: mypy -m a
 [file a.py]
 1()
@@ -111,6 +111,23 @@ tmp/a.py:1: error: "int" not callable
 tmp/a.py:1: error: "int" not callable
 [out3]

+[case testNoOpUpdateFineGrainedIncremental2]
+# cmd: mypy -m a
+[file a.py]
+1()
+[file a.py.2]
+1()
+[file a.py.3]
+x = 1
+[file a.py.4]
+x = 1
+[out1]
+tmp/a.py:1: error: "int" not callable
+[out2]
+tmp/a.py:1: error: "int" not callable
+[out3]
+[out4]
+
 [case testNonExistentFileOnCommandLineFineGrainedIncremental1]
 # cmd: mypy -m a nonexistent
 # NOTE: 'nonexistent' is a magic module name understood by mypy.test.testdmypy
