Skip to content

Commit 09bc2d6

Browse files
committed
perf: hash data files during combining to avoid unneeded work. #1483
When generating many parallel data files, some data files are often exact copies of each other. By checking file hashes, we can skip combining the duplicates, speeding up the process. On a coverage.py metacov run, 651 of 2189 files (29%) were duplicates, and the time to combine was reduced by 17%.
1 parent bc630b5 commit 09bc2d6

File tree

6 files changed

+57
-33
lines changed

6 files changed

+57
-33
lines changed

CHANGES.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@ Unreleased
2929
- Using ``--format=total`` will write a single total number to the
3030
output. This can be useful for making badges or writing status updates.
3131

32+
- Combining data files with ``coverage combine`` now quickly hashes the data
33+
files to skip files that provide no new information. This can reduce the
34+
time needed. For coverage.py's own test suite, combining was about 17%
35+
faster.
36+
3237
- An empty file has a coverage total of 100%, but used to fail with
3338
``--fail-under``. This has been fixed, closing `issue 1470`_.
3439

coverage/data.py

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"""
1212

1313
import glob
14+
import hashlib
1415
import os.path
1516

1617
from coverage.exceptions import CoverageException, NoDataError
@@ -110,42 +111,60 @@ def combine_parallel_data(
110111
if strict and not files_to_combine:
111112
raise NoDataError("No data to combine")
112113

113-
files_combined = 0
114+
file_hashes = set()
115+
combined_any = False
116+
114117
for f in files_to_combine:
115118
if f == data.data_filename():
116119
# Sometimes we are combining into a file which is one of the
117120
# parallel files. Skip that file.
118121
if data._debug.should('dataio'):
119122
data._debug.write(f"Skipping combining ourself: {f!r}")
120123
continue
121-
if data._debug.should('dataio'):
122-
data._debug.write(f"Combining data file {f!r}")
124+
123125
try:
124-
new_data = CoverageData(f, debug=data._debug)
125-
new_data.read()
126-
except CoverageException as exc:
127-
if data._warn:
128-
# The CoverageException has the file name in it, so just
129-
# use the message as the warning.
130-
data._warn(str(exc))
126+
rel_file_name = os.path.relpath(f)
127+
except ValueError:
128+
# ValueError can be raised under Windows when os.getcwd() returns a
129+
# folder from a different drive than the drive of f, in which case
130+
# we print the original value of f instead of its relative path
131+
rel_file_name = f
132+
133+
with open(f, "rb") as fobj:
134+
hasher = hashlib.new("sha3_256")
135+
hasher.update(fobj.read())
136+
sha = hasher.digest()
137+
combine_this_one = sha not in file_hashes
138+
139+
delete_this_one = not keep
140+
if combine_this_one:
141+
if data._debug.should('dataio'):
142+
data._debug.write(f"Combining data file {f!r}")
143+
file_hashes.add(sha)
144+
try:
145+
new_data = CoverageData(f, debug=data._debug)
146+
new_data.read()
147+
except CoverageException as exc:
148+
if data._warn:
149+
# The CoverageException has the file name in it, so just
150+
# use the message as the warning.
151+
data._warn(str(exc))
152+
delete_this_one = False
153+
else:
154+
data.update(new_data, aliases=aliases)
155+
combined_any = True
156+
if message:
157+
message(f"Combined data file {rel_file_name}")
131158
else:
132-
data.update(new_data, aliases=aliases)
133-
files_combined += 1
134159
if message:
135-
try:
136-
file_name = os.path.relpath(f)
137-
except ValueError:
138-
# ValueError can be raised under Windows when os.getcwd() returns a
139-
# folder from a different drive than the drive of f, in which case
140-
# we print the original value of f instead of its relative path
141-
file_name = f
142-
message(f"Combined data file {file_name}")
143-
if not keep:
144-
if data._debug.should('dataio'):
145-
data._debug.write(f"Deleting combined data file {f!r}")
146-
file_be_gone(f)
147-
148-
if strict and not files_combined:
160+
message(f"Skipping duplicate data {rel_file_name}")
161+
162+
if delete_this_one:
163+
if data._debug.should('dataio'):
164+
data._debug.write(f"Deleting data file {f!r}")
165+
file_be_gone(f)
166+
167+
if strict and not combined_any:
149168
raise NoDataError("No usable data files")
150169

151170

coverage/sqldata.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
"""SQLite coverage data."""
55

66
import collections
7-
import datetime
87
import functools
98
import glob
109
import itertools
@@ -56,7 +55,6 @@
5655
-- 'has_arcs' boolean -- Is this data recording branches?
5756
-- 'sys_argv' text -- The coverage command line that recorded the data.
5857
-- 'version' text -- The version of coverage.py that made the file.
59-
-- 'when' text -- Datetime when the file was created.
6058
);
6159
6260
CREATE TABLE file (
@@ -305,7 +303,6 @@ def _init_db(self, db):
305303
[
306304
("sys_argv", str(getattr(sys, "argv", None))),
307305
("version", __version__),
308-
("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
309306
]
310307
)
311308

doc/dbschema.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ This is the database schema:
7070
-- 'has_arcs' boolean -- Is this data recording branches?
7171
-- 'sys_argv' text -- The coverage command line that recorded the data.
7272
-- 'version' text -- The version of coverage.py that made the file.
73-
-- 'when' text -- Datetime when the file was created.
7473
);
7574
7675
CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
116115
foreign key (file_id) references file (id)
117116
);
118117
119-
.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
118+
.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)
120119
121120
122121
.. _numbits:

tests/test_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self):
13621362

13631363
# Make bogus data files.
13641364
self.make_file(".coverage.bad1", "This isn't a coverage data file.")
1365-
self.make_file(".coverage.bad2", "This isn't a coverage data file.")
1365+
self.make_file(".coverage.bad2", "This isn't a coverage data file either.")
13661366

13671367
# Combine the parallel coverage data files into .coverage, but nothing is readable.
13681368
cov = coverage.Coverage()

tests/test_concurrency.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -484,9 +484,13 @@ def try_multiprocessing_code(
484484
out_lines = out.splitlines()
485485
assert len(out_lines) == nprocs + 1
486486
assert all(
487-
re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
487+
re.fullmatch(
488+
r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
489+
line
490+
)
488491
for line in out_lines
489492
)
493+
assert len(glob.glob(".coverage.*")) == 0
490494
out = self.run_command("coverage report -m")
491495

492496
last_line = self.squeezed_lines(out)[-1]

0 commit comments

Comments
 (0)