
Commit 4103892

Use orjson instead of json, when available
For `mypy -c 'import torch'`, the cache load time goes from 0.44s to 0.25s, as measured by the manager's data_json_load_time stat. If I time the dumps specifically, they drop from 0.65s to 0.07s. Overall, a pretty reasonable perf win -- should we make it a required dependency? I don't know if the sqlite cache path is used at all, but let me know if I need a cleverer migration than renaming the table.
1 parent 1a074b6 commit 4103892
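
As a point of reference, here is a minimal sketch of the kind of dump-time comparison described in the commit message. The payload below is synthetic (not mypy's real cache data), so absolute numbers will differ from the measurements above:

# Rough micro-benchmark sketch; payload and sizes are made up for illustration.
import json
import time

try:
    import orjson
except ImportError:
    orjson = None

payload = {f"mod{i}": {"deps": [f"dep{j}" for j in range(50)]} for i in range(2000)}

t0 = time.time()
json.dumps(payload, separators=(",", ":"))
print(f"stdlib json: {time.time() - t0:.3f}s")

if orjson is not None:
    t0 = time.time()
    orjson.dumps(payload)
    print(f"orjson: {time.time() - t0:.3f}s")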

File tree

4 files changed, +64 −54 lines:

mypy/build.py
mypy/metastore.py
mypy/util.py
mypyc/codegen/emitmodule.py

mypy/build.py

Lines changed: 18 additions & 27 deletions
@@ -95,6 +95,7 @@
 from mypy.stubinfo import legacy_bundled_packages, non_bundled_packages, stub_distribution_name
 from mypy.types import Type
 from mypy.typestate import reset_global_state, type_state
+from mypy.util import json_dumps, json_loads
 from mypy.version import __version__
 
 # Switch to True to produce debug output related to fine-grained incremental
@@ -858,7 +859,7 @@ def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]:
         t0 = time.time()
         if id in self.fg_deps_meta:
             # TODO: Assert deps file wasn't changed.
-            deps = json.loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
+            deps = json_loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
         else:
             deps = {}
         val = {k: set(v) for k, v in deps.items()}
@@ -911,8 +912,8 @@ def stats_summary(self) -> Mapping[str, object]:
         return self.stats
 
 
-def deps_to_json(x: dict[str, set[str]]) -> str:
-    return json.dumps({k: list(v) for k, v in x.items()}, separators=(",", ":"))
+def deps_to_json(x: dict[str, set[str]]) -> bytes:
+    return json_dumps({k: list(v) for k, v in x.items()})
 
 
 # File for storing metadata about all the fine-grained dependency caches
@@ -980,7 +981,7 @@ def write_deps_cache(
 
     meta = {"snapshot": meta_snapshot, "deps_meta": fg_deps_meta}
 
-    if not metastore.write(DEPS_META_FILE, json.dumps(meta, separators=(",", ":"))):
+    if not metastore.write(DEPS_META_FILE, json_dumps(meta)):
         manager.log(f"Error writing fine-grained deps meta JSON file {DEPS_META_FILE}")
         error = True
 
@@ -1048,7 +1049,7 @@ def generate_deps_for_cache(manager: BuildManager, graph: Graph) -> dict[str, di
 
 def write_plugins_snapshot(manager: BuildManager) -> None:
     """Write snapshot of versions and hashes of currently active plugins."""
-    snapshot = json.dumps(manager.plugins_snapshot, separators=(",", ":"))
+    snapshot = json_dumps(manager.plugins_snapshot)
     if not manager.metastore.write(PLUGIN_SNAPSHOT_FILE, snapshot):
         manager.errors.set_file(_cache_dir_prefix(manager.options), None, manager.options)
         manager.errors.report(0, 0, "Error writing plugins snapshot", blocker=True)
@@ -1079,8 +1080,8 @@ def read_quickstart_file(
         # just ignore it.
         raw_quickstart: dict[str, Any] = {}
         try:
-            with open(options.quickstart_file) as f:
-                raw_quickstart = json.load(f)
+            with open(options.quickstart_file, "rb") as f:
+                raw_quickstart = json_loads(f.read())
 
             quickstart = {}
             for file, (x, y, z) in raw_quickstart.items():
@@ -1148,10 +1149,10 @@ def _load_json_file(
     manager.add_stats(metastore_read_time=time.time() - t0)
     # Only bother to compute the log message if we are logging it, since it could be big
     if manager.verbosity() >= 2:
-        manager.trace(log_success + data.rstrip())
+        manager.trace(log_success + data.rstrip().decode())
     try:
         t1 = time.time()
-        result = json.loads(data)
+        result = json_loads(data)
         manager.add_stats(data_json_load_time=time.time() - t1)
     except json.JSONDecodeError:
         manager.errors.set_file(file, None, manager.options)
@@ -1343,8 +1344,8 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> CacheMeta | No
     # So that plugins can return data with tuples in it without
     # things silently always invalidating modules, we round-trip
     # the config data. This isn't beautiful.
-    plugin_data = json.loads(
-        json.dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
+    plugin_data = json_loads(
+        json_dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
     )
     if m.plugin_data != plugin_data:
         manager.log(f"Metadata abandoned for {id}: plugin configuration differs")
@@ -1478,18 +1479,15 @@ def validate_meta(
         "ignore_all": meta.ignore_all,
         "plugin_data": meta.plugin_data,
     }
-    if manager.options.debug_cache:
-        meta_str = json.dumps(meta_dict, indent=2, sort_keys=True)
-    else:
-        meta_str = json.dumps(meta_dict, separators=(",", ":"))
+    meta_bytes = json_dumps(meta_dict, manager.options.debug_cache)
     meta_json, _, _ = get_cache_names(id, path, manager.options)
     manager.log(
         "Updating mtime for {}: file {}, meta {}, mtime {}".format(
             id, path, meta_json, meta.mtime
         )
     )
     t1 = time.time()
-    manager.metastore.write(meta_json, meta_str)  # Ignore errors, just an optimization.
+    manager.metastore.write(meta_json, meta_bytes)  # Ignore errors, just an optimization.
     manager.add_stats(validate_update_time=time.time() - t1, validate_munging_time=t1 - t0)
     return meta
 

@@ -1507,13 +1505,6 @@ def compute_hash(text: str) -> str:
     return hash_digest(text.encode("utf-8"))
 
 
-def json_dumps(obj: Any, debug_cache: bool) -> str:
-    if debug_cache:
-        return json.dumps(obj, indent=2, sort_keys=True)
-    else:
-        return json.dumps(obj, sort_keys=True, separators=(",", ":"))
-
-
 def write_cache(
     id: str,
     path: str,
@@ -1566,8 +1557,8 @@ def write_cache(
 
     # Serialize data and analyze interface
     data = tree.serialize()
-    data_str = json_dumps(data, manager.options.debug_cache)
-    interface_hash = compute_hash(data_str)
+    data_bytes = json_dumps(data, manager.options.debug_cache)
+    interface_hash = hash_digest(data_bytes)
 
     plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False))
 
@@ -1591,7 +1582,7 @@ def write_cache(
         manager.trace(f"Interface for {id} is unchanged")
     else:
         manager.trace(f"Interface for {id} has changed")
-        if not metastore.write(data_json, data_str):
+        if not metastore.write(data_json, data_bytes):
             # Most likely the error is the replace() call
             # (see https://github.com/python/mypy/issues/3215).
             manager.log(f"Error writing data JSON file {data_json}")
@@ -3566,4 +3557,4 @@ def write_undocumented_ref_info(
     assert not ref_info_file.startswith(".")
 
     deps_json = get_undocumented_ref_info_json(state.tree, type_map)
-    metastore.write(ref_info_file, json.dumps(deps_json, separators=(",", ":")))
+    metastore.write(ref_info_file, json_dumps(deps_json))
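
A note on the bytes plumbing above: both json_loads backends accept bytes input, which is why read_quickstart_file can switch to binary mode and _load_json_file can pass the raw metastore bytes straight through; only the trace logging needs an explicit .decode(). A small sketch, with made-up file contents:

import json

data = b'{"mypy/build.py": [1700000000.0, 12345, "abc123"]}'
# stdlib json.loads has accepted bytes since Python 3.6; orjson.loads likewise
# takes bytes, so json_loads works unchanged with either backend.
print(json.loads(data))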

mypy/metastore.py

Lines changed: 15 additions & 22 deletions
@@ -33,14 +33,14 @@ def getmtime(self, name: str) -> float:
         """
 
     @abstractmethod
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         """Read the contents of a metadata entry.
 
         Raises FileNotFound if the entry does not exist.
         """
 
     @abstractmethod
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         """Write a metadata entry.
 
         If mtime is specified, set it as the mtime of the entry. Otherwise,
@@ -86,16 +86,16 @@ def getmtime(self, name: str) -> float:
 
         return int(os.path.getmtime(os.path.join(self.cache_dir_prefix, name)))
 
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"
 
         if not self.cache_dir_prefix:
             raise FileNotFoundError()
 
-        with open(os.path.join(self.cache_dir_prefix, name)) as f:
+        with open(os.path.join(self.cache_dir_prefix, name), "rb") as f:
             return f.read()
 
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"
 
         if not self.cache_dir_prefix:
@@ -105,7 +105,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
         tmp_filename = path + "." + random_string()
         try:
             os.makedirs(os.path.dirname(path), exist_ok=True)
-            with open(tmp_filename, "w") as f:
+            with open(tmp_filename, "wb") as f:
                 f.write(data)
             os.replace(tmp_filename, path)
             if mtime is not None:
@@ -135,27 +135,20 @@ def list_all(self) -> Iterable[str]:
 
 
 SCHEMA = """
-CREATE TABLE IF NOT EXISTS files (
+CREATE TABLE IF NOT EXISTS files2 (
     path TEXT UNIQUE NOT NULL,
     mtime REAL,
-    data TEXT
+    data BLOB
 );
-CREATE INDEX IF NOT EXISTS path_idx on files(path);
+CREATE INDEX IF NOT EXISTS path_idx on files2(path);
 """
-# No migrations yet
-MIGRATIONS: list[str] = []
 
 
 def connect_db(db_file: str) -> sqlite3.Connection:
     import sqlite3.dbapi2
 
     db = sqlite3.dbapi2.connect(db_file)
     db.executescript(SCHEMA)
-    for migr in MIGRATIONS:
-        try:
-            db.executescript(migr)
-        except sqlite3.OperationalError:
-            pass
     return db
 
 
@@ -188,12 +181,12 @@ def getmtime(self, name: str) -> float:
         assert isinstance(mtime, float)
         return mtime
 
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         data = self._query(name, "data")
-        assert isinstance(data, str)
+        assert isinstance(data, bytes)
         return data
 
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         import sqlite3
 
         if not self.db:
@@ -202,7 +195,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
             if mtime is None:
                 mtime = time.time()
             self.db.execute(
-                "INSERT OR REPLACE INTO files(path, mtime, data) VALUES(?, ?, ?)",
+                "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
                 (name, mtime, data),
             )
         except sqlite3.OperationalError:
@@ -213,13 +206,13 @@ def remove(self, name: str) -> None:
         if not self.db:
             raise FileNotFoundError()
 
-        self.db.execute("DELETE FROM files WHERE path = ?", (name,))
+        self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))
 
     def commit(self) -> None:
         if self.db:
            self.db.commit()
 
     def list_all(self) -> Iterable[str]:
         if self.db:
-            for row in self.db.execute("SELECT path FROM files"):
+            for row in self.db.execute("SELECT path FROM files2"):
                 yield row[0]
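
On the migration question from the commit message: because the schema uses CREATE TABLE IF NOT EXISTS, renaming files to files2 acts as a de facto migration. An old cache database keeps its legacy files table (TEXT data) as dead weight, and a fresh files2 table (BLOB data) is created alongside it on the next connect. A minimal sketch of that behavior:

import sqlite3

db = sqlite3.connect(":memory:")
# Simulate a pre-existing cache database with the legacy table.
db.executescript(
    "CREATE TABLE IF NOT EXISTS files (path TEXT UNIQUE NOT NULL, mtime REAL, data TEXT);"
)
# Applying the new schema is idempotent and never touches the old table.
db.executescript(
    """
    CREATE TABLE IF NOT EXISTS files2 (
        path TEXT UNIQUE NOT NULL,
        mtime REAL,
        data BLOB
    );
    CREATE INDEX IF NOT EXISTS path_idx on files2(path);
    """
)
db.execute(
    "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
    ("pkg/mod.data.json", 0.0, b"{}"),
)
print(db.execute("SELECT path FROM files2").fetchall())  # [('pkg/mod.data.json',)]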

mypy/util.py

Lines changed: 27 additions & 1 deletion
@@ -4,16 +4,23 @@
 
 import hashlib
 import io
+import json
 import os
 import pathlib
 import re
 import shutil
 import sys
 import time
 from importlib import resources as importlib_resources
-from typing import IO, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
+from typing import IO, Any, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
 from typing_extensions import Literal
 
+orjson: Any
+try:
+    import orjson  # type: ignore[import-not-found, no-redef, unused-ignore]
+except ImportError:
+    orjson = None
+
 try:
     import curses

@@ -874,3 +881,22 @@ def quote_docstring(docstr: str) -> str:
         return f"''{docstr_repr}''"
     else:
         return f'""{docstr_repr}""'
+
+
+def json_dumps(obj: object, debug: bool = False) -> bytes:
+    if orjson is not None:
+        if debug:
+            return orjson.dumps(obj, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)  # type: ignore[no-any-return]
+        else:
+            return orjson.dumps(obj)  # type: ignore[no-any-return]
+
+    if debug:
+        return json.dumps(obj, indent=2, sort_keys=True).encode("utf-8")
+    else:
+        return json.dumps(obj, separators=(",", ":")).encode("utf-8")
+
+
+def json_loads(data: bytes) -> Any:
+    if orjson is not None:
+        return orjson.loads(data)
+    return json.loads(data)
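
A quick round trip with the new helpers; values are illustrative, and for this payload the compact bytes come out identical whether or not orjson is installed:

from mypy.util import json_dumps, json_loads

meta = {"mtime": 1700000000.0, "size": 4096}
compact = json_dumps(meta)             # b'{"mtime":1700000000.0,"size":4096}'
pretty = json_dumps(meta, debug=True)  # indented, keys sorted, still bytes
assert json_loads(compact) == meta

One nuance visible in the diff: the old build.py helper passed sort_keys=True even in its compact path, while the new helper leaves key order to the caller in the non-debug case, matching orjson's default behavior.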

mypyc/codegen/emitmodule.py

Lines changed: 4 additions & 4 deletions
@@ -24,7 +24,7 @@
 from mypy.nodes import MypyFile
 from mypy.options import Options
 from mypy.plugin import Plugin, ReportConfigContext
-from mypy.util import hash_digest
+from mypy.util import hash_digest, json_dumps
 from mypyc.codegen.cstring import c_string_initializer
 from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
 from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
@@ -154,7 +154,7 @@ def report_config_data(self, ctx: ReportConfigContext) -> tuple[str | None, list
         ir_data = json.loads(ir_json)
 
         # Check that the IR cache matches the metadata cache
-        if compute_hash(meta_json) != ir_data["meta_hash"]:
+        if hash_digest(meta_json) != ir_data["meta_hash"]:
             return None
 
         # Check that all of the source files are present and as
@@ -369,11 +369,11 @@ def write_cache(
         newpath = get_state_ir_cache_name(st)
         ir_data = {
             "ir": module.serialize(),
-            "meta_hash": compute_hash(meta_data),
+            "meta_hash": hash_digest(meta_data),
             "src_hashes": hashes[group_map[id]],
         }
 
-        result.manager.metastore.write(newpath, json.dumps(ir_data, separators=(",", ":")))
+        result.manager.metastore.write(newpath, json_dumps(ir_data))
 
     result.manager.metastore.commit()
 
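Why swapping compute_hash for hash_digest is safe here: compute_hash was just a str front end that encoded to UTF-8 before digesting, so hashing the bytes directly yields the same value. A sketch (the digest below is a hypothetical stand-in; mypy's real hash_digest may use a different algorithm):

import hashlib

def hash_digest(data: bytes) -> str:
    # Hypothetical digest choice, for illustration only.
    return hashlib.sha256(data).hexdigest()

def compute_hash(text: str) -> str:
    # The old str-based wrapper, as shown in the build.py hunk above.
    return hash_digest(text.encode("utf-8"))

meta_data = b'{"mtime":0}'
assert compute_hash(meta_data.decode("utf-8")) == hash_digest(meta_data)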