Commit 71d85d7

Fine-grained: Treat files changed only if md5 changes (#4505)
Previously the daemon could do a lot of extra work if you switched to another git branch and immediately back to the original branch, since many file timestamps would change. Now a changed timestamp alone isn't sufficient to consider a file as changed; its md5 hash must change as well.

Also add a new cached file system abstraction (currently only used in dmypy) to avoid redundant file system operations and to make file system state easier to reason about. The idea is to cache the output of file system operations during a single increment, for both consistency and performance. My plan is to eventually use it just about everywhere. This should also make it slightly easier to switch to listening to file system events instead of stat()ing everything, but that's not a priority yet.

Fixes #4499.
1 parent 7192a75 commit 71d85d7
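To make the new behavior concrete, here is a minimal sketch (not part of the commit) that drives the two classes added below directly. The path example.py is hypothetical; touching it updates its mtime, but because the md5 hash of its contents is unchanged the watcher no longer reports it as changed.

# Sketch only: exercises mypy.fscache / mypy.fswatcher as added in this commit.
# 'example.py' is a hypothetical path used purely for illustration.
import os
import sys

from mypy.fscache import FileSystemCache
from mypy.fswatcher import FileSystemWatcher

fscache = FileSystemCache(pyversion=sys.version_info[:2])
watcher = FileSystemWatcher(fscache)

watcher.add_watched_paths(['example.py'])
watcher.find_changed()                    # first call records the initial state

os.utime('example.py')                    # like a branch round-trip: new mtime, same contents
fscache.flush()                           # start a new transaction so fresh stat() results are seen
assert watcher.find_changed() == set()    # hash unchanged -> not reported as changed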

File tree

5 files changed (+240, -30 lines)


mypy/dmypy_server.py

Lines changed: 20 additions & 29 deletions
@@ -23,6 +23,8 @@
 import mypy.server.update
 from mypy.dmypy_util import STATUS_FILE, receive
 from mypy.gclogger import GcLogger
+from mypy.fscache import FileSystemCache
+from mypy.fswatcher import FileSystemWatcher


 def daemonize(func: Callable[[], None], log_file: Optional[str] = None) -> int:
@@ -243,14 +245,11 @@ def check_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str,
         return self.fine_grained_increment(sources)

     def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
-        self.file_modified = {}  # type: Dict[str, float]
-        for source in sources:
-            assert source.path
-            try:
-                self.file_modified[source.path] = os.stat(source.path).st_mtime
-            except FileNotFoundError:
-                # Don't crash if passed a non-existent file.
-                pass
+        self.fscache = FileSystemCache(self.options.python_version)
+        self.fswatcher = FileSystemWatcher(self.fscache)
+        self.update_sources(sources)
+        # Stores the initial state of sources as a side effect.
+        self.fswatcher.find_changed()
         try:
             # TODO: alt_lib_path
             result = mypy.build.build(sources=sources,
@@ -270,9 +269,11 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
         self.previous_messages = messages[:]
         self.fine_grained_initialized = True
         self.previous_sources = sources
+        self.fscache.flush()
         return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}

     def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
+        self.update_sources(sources)
         changed = self.find_changed(sources)
         if not changed:
             # Nothing changed -- just produce the same result as before.
@@ -282,36 +283,26 @@ def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[
         status = 1 if messages else 0
         self.previous_messages = messages[:]
         self.previous_sources = sources
+        self.fscache.flush()
         return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}

+    def update_sources(self, sources: List[mypy.build.BuildSource]) -> None:
+        paths = [source.path for source in sources if source.path is not None]
+        self.fswatcher.add_watched_paths(paths)
+
     def find_changed(self, sources: List[mypy.build.BuildSource]) -> List[Tuple[str, str]]:
-        changed = []
-        for source in sources:
-            path = source.path
-            assert path
-            try:
-                mtime = os.stat(path).st_mtime
-            except FileNotFoundError:
-                # A non-existent file was included on the command line.
-                #
-                # TODO: Generate error if file is missing (if not ignoring missing imports)
-                if path in self.file_modified:
-                    changed.append((source.module, path))
-            else:
-                if path not in self.file_modified or self.file_modified[path] != mtime:
-                    self.file_modified[path] = mtime
-                    changed.append((source.module, path))
+        changed_paths = self.fswatcher.find_changed()
+        changed = [(source.module, source.path)
+                   for source in sources
+                   if source.path in changed_paths]
         modules = {source.module for source in sources}
         omitted = [source for source in self.previous_sources if source.module not in modules]
         for source in omitted:
             path = source.path
             assert path
-            # Note that a file could be removed from the list of root sources but still continue
-            # to exist on the file system.
-            if not os.path.isfile(path):
+            # Note that a file could be removed from the list of root sources but have no changes.
+            if path in changed_paths:
                 changed.append((source.module, path))
-                if source.path in self.file_modified:
-                    del self.file_modified[source.path]
         return changed

     def cmd_hang(self) -> Dict[str, object]:

mypy/fscache.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+"""Interface for accessing the file system with automatic caching.
+
+The idea is to cache the results of any file system state reads during
+a single transaction. This has two main benefits:
+
+* This avoids redundant syscalls, as we won't perform the same OS
+  operations multiple times.
+
+* This makes it easier to reason about concurrent FS updates, as different
+  operations targeting the same paths can't report different state during
+  a transaction.
+
+Note that this only deals with reading state, not writing.
+
+Properties maintained by the API:
+
+* The contents of the file are always from the same or later time compared
+  to the reported mtime of the file, even if mtime is queried after reading
+  a file.
+
+* Repeating an operation produces the same result as the first one during
+  a transaction.
+
+* Call flush() to start a new transaction (flush the caches).
+
+The API is a bit limited. It's easy to add new cached operations, however.
+You should perform all file system reads through the API to actually take
+advantage of the benefits.
+"""
+
+import os
+import stat
+from typing import Tuple, Dict, List
+
+from mypy.build import read_with_python_encoding
+from mypy.errors import DecodeError
+
+
+class FileSystemCache:
+    def __init__(self, pyversion: Tuple[int, int]) -> None:
+        self.pyversion = pyversion
+        self.flush()
+
+    def flush(self) -> None:
+        """Start another transaction and empty all caches."""
+        self.stat_cache = {}  # type: Dict[str, os.stat_result]
+        self.stat_error_cache = {}  # type: Dict[str, Exception]
+        self.read_cache = {}  # type: Dict[str, str]
+        self.read_error_cache = {}  # type: Dict[str, Exception]
+        self.hash_cache = {}  # type: Dict[str, str]
+        self.listdir_cache = {}  # type: Dict[str, List[str]]
+        self.listdir_error_cache = {}  # type: Dict[str, Exception]
+
+    def read_with_python_encoding(self, path: str) -> str:
+        if path in self.read_cache:
+            return self.read_cache[path]
+        if path in self.read_error_cache:
+            raise self.read_error_cache[path]
+
+        # Need to stat first so that the contents of file are from no
+        # earlier instant than the mtime reported by self.stat().
+        self.stat(path)
+
+        try:
+            data, md5hash = read_with_python_encoding(path, self.pyversion)
+        except Exception as err:
+            self.read_error_cache[path] = err
+            raise
+        self.read_cache[path] = data
+        self.hash_cache[path] = md5hash
+        return data
+
+    def stat(self, path: str) -> os.stat_result:
+        if path in self.stat_cache:
+            return self.stat_cache[path]
+        if path in self.stat_error_cache:
+            raise self.stat_error_cache[path]
+        try:
+            st = os.stat(path)
+        except Exception as err:
+            self.stat_error_cache[path] = err
+            raise
+        self.stat_cache[path] = st
+        return st
+
+    def listdir(self, path: str) -> List[str]:
+        if path in self.listdir_cache:
+            return self.listdir_cache[path]
+        if path in self.listdir_error_cache:
+            raise self.listdir_error_cache[path]
+        try:
+            results = os.listdir(path)
+        except Exception as err:
+            self.listdir_error_cache[path] = err
+            raise err
+        self.listdir_cache[path] = results
+        return results
+
+    def isfile(self, path: str) -> bool:
+        st = self.stat(path)
+        return stat.S_ISREG(st.st_mode)
+
+    def isdir(self, path: str) -> bool:
+        st = self.stat(path)
+        return stat.S_ISDIR(st.st_mode)
+
+    def exists(self, path: str) -> bool:
+        try:
+            self.stat(path)
+        except FileNotFoundError:
+            return False
+        return True
+
+    def md5(self, path: str) -> str:
+        if path not in self.hash_cache:
+            self.read_with_python_encoding(path)
+        return self.hash_cache[path]
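As a rough usage sketch (not from the commit), the cache memoizes both successful results and errors during one transaction, so repeated queries within an increment always agree with each other; flush() empties the caches and starts the next transaction. The path below is hypothetical.

# Sketch only; 'demo.py' is a hypothetical existing file.
import sys

from mypy.fscache import FileSystemCache

fs = FileSystemCache(pyversion=sys.version_info[:2])

st = fs.stat('demo.py')                 # one os.stat() call; the result is cached
assert fs.stat('demo.py') is st         # same transaction -> same cached object
assert fs.isfile('demo.py')             # isfile()/isdir()/exists() reuse the cached stat
print(fs.md5('demo.py'))                # reads and hashes the file, caching text and hash

fs.flush()                              # next transaction: the OS is queried again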

mypy/fswatcher.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+"""Watch parts of the file system for changes."""
+
+from mypy.fscache import FileSystemCache
+from typing import NamedTuple, Set, AbstractSet, Iterable, Dict, Optional
+
+
+FileData = NamedTuple('FileData', [('st_mtime', float),
+                                   ('st_size', int),
+                                   ('md5', str)])
+
+
+class FileSystemWatcher:
+    """Watcher for file system changes among specific paths.
+
+    All file system access is performed using FileSystemCache. We
+    detect changed files by stat()ing them all and comparing md5 hashes
+    of potentially changed files. If a file has both size and mtime
+    unmodified, the file is assumed to be unchanged.
+
+    An important goal of this class is to make it easier to eventually
+    use file system events to detect file changes.
+
+    Note: This class doesn't flush the file system cache. If you don't
+    manually flush it, changes won't be seen.
+    """
+
+    # TODO: Watching directories?
+    # TODO: Handle non-files
+
+    def __init__(self, fs: FileSystemCache) -> None:
+        self.fs = fs
+        self._paths = set()  # type: Set[str]
+        self._file_data = {}  # type: Dict[str, Optional[FileData]]
+
+    @property
+    def paths(self) -> AbstractSet[str]:
+        return self._paths
+
+    def add_watched_paths(self, paths: Iterable[str]) -> None:
+        for path in paths:
+            if path not in self._paths:
+                # By storing None this path will get reported as changed by
+                # find_changed if it exists.
+                self._file_data[path] = None
+        self._paths |= set(paths)
+
+    def remove_watched_paths(self, paths: Iterable[str]) -> None:
+        for path in paths:
+            if path in self._file_data:
+                del self._file_data[path]
+        self._paths -= set(paths)
+
+    def _update(self, path: str) -> None:
+        st = self.fs.stat(path)
+        md5 = self.fs.md5(path)
+        self._file_data[path] = FileData(st.st_mtime, st.st_size, md5)
+
+    def find_changed(self) -> Set[str]:
+        """Return paths that have changes since the last call, in the watched set."""
+        changed = set()
+        for path in self._paths:
+            old = self._file_data[path]
+            try:
+                st = self.fs.stat(path)
+            except FileNotFoundError:
+                if old is not None:
+                    # File was deleted.
+                    changed.add(path)
+                    self._file_data[path] = None
+            else:
+                if old is None:
+                    # File is new.
+                    changed.add(path)
+                    self._update(path)
+                elif st.st_size != old.st_size or st.st_mtime != old.st_mtime:
+                    # Only look for changes if size or mtime has changed as an
+                    # optimization, since calculating md5 is expensive.
+                    new_md5 = self.fs.md5(path)
+                    if st.st_size != old.st_size or new_md5 != old.md5:
+                        # Changed file.
+                        changed.add(path)
+                        self._update(path)
+        return changed
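A small sketch (not part of the commit) of how deletions and re-creations would be reported for a hypothetical watched path. Note that, as the docstring says, the watcher never flushes the cache itself; the caller has to call flush() between find_changed() calls for new file system state to become visible.

# Sketch only; 'watched.py' is a hypothetical path that exists when we start.
import os
import sys

from mypy.fscache import FileSystemCache
from mypy.fswatcher import FileSystemWatcher

fs = FileSystemCache(pyversion=sys.version_info[:2])
watcher = FileSystemWatcher(fs)

watcher.add_watched_paths(['watched.py'])
assert watcher.find_changed() == {'watched.py'}   # a newly watched file is reported once

os.remove('watched.py')
fs.flush()                                        # without this the stale stat() stays cached
assert watcher.find_changed() == {'watched.py'}   # deletion is reported

with open('watched.py', 'w') as f:
    f.write('x = 1\n')
fs.flush()
assert watcher.find_changed() == {'watched.py'}   # the re-created file is reported again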

mypy/server/update.py

Lines changed: 2 additions & 0 deletions
@@ -114,6 +114,8 @@
 Major todo items:

 - Fully support multiple type checking passes
+- Use mypy.fscache to access file system
+- Don't use load_graph() and update the import graph incrementally
 """

 import os.path

test-data/unit/check-dmypy-fine-grained.test

Lines changed: 18 additions & 1 deletion
@@ -97,7 +97,7 @@ tmp/a.py:1: error: invalid syntax
 tmp/b.py:2: error: Incompatible return value type (got "str", expected "int")
 [out3]

-[case testNoOpUpdateFineGrainedIncremental]
+[case testNoOpUpdateFineGrainedIncremental1]
 # cmd: mypy -m a
 [file a.py]
 1()
@@ -111,6 +111,23 @@ tmp/a.py:1: error: "int" not callable
 tmp/a.py:1: error: "int" not callable
 [out3]

+[case testNoOpUpdateFineGrainedIncremental2]
+# cmd: mypy -m a
+[file a.py]
+1()
+[file a.py.2]
+1()
+[file a.py.3]
+x = 1
+[file a.py.4]
+x = 1
+[out1]
+tmp/a.py:1: error: "int" not callable
+[out2]
+tmp/a.py:1: error: "int" not callable
+[out3]
+[out4]
+
 [case testNonExistentFileOnCommandLineFineGrainedIncremental1]
 # cmd: mypy -m a nonexistent
 # NOTE: 'nonexistent' is a magic module name understood by mypy.test.testdmypy
