
Commit 97ec545

Overhaul Benchmarking pipeline to use complete sample data, not summaries
The Swift benchmarking harness now has two distinct output formats:

* Default: Formatted text that's intended for human consumption. Right now, this is just the minimum value, but we can augment that.
* `--json`: Each output line is a JSON-encoded object that contains raw data. This information is intended for use by python scripts that aggregate or compare multiple independent tests.

Previously, we tried to use the same output for both purposes. This required the python scripts to do more complex parsing of textual layouts, and also meant that the python scripts had only summary data to work with instead of full raw sample information. This in turn made it almost impossible to derive meaningful comparisons between runs or to aggregate multiple runs.

Typical output in the new JSON format looks like this:

```
{"number":89, "name":"PerfTest", "samples":[1.23, 2.35], "max_rss":16384}
{"number":91, "name":"OtherTest", "samples":[14.8, 19.7]}
```

This format is easy to parse in Python: just iterate over lines and decode each one separately. Also note that the optional fields (`"max_rss"` above) are trivial to handle:

```
import json

for l in lines:
    j = json.loads(l)
    # Default 0 if not present
    max_rss = j.get("max_rss", 0)
```

Note the `"samples"` array includes the runtime for each individual run.

Because optional fields are so much easier to handle in this form, I reworked the Python logic to translate old formats into this JSON format for more uniformity. Hopefully, we can simplify the code in a year or so by stripping out the old log formats entirely, along with some of the redundant statistical calculations. In particular, the python logic still makes an effort to preserve mean, median, max, min, stdev, and other statistical data whenever the full set of samples is not present. Once we've gotten to a point where we're always keeping full samples, we can compute any such information on the fly as needed, eliminating the need to record it.

This is a pretty big rearchitecture of the core benchmarking logic. In order to keep things a bit more manageable, I have not taken this opportunity to replace any of the actual statistics used in the higher-level code or to change how the actual samples are measured. (But I expect this rearchitecture will make such changes simpler.) In particular, this should not actually change any benchmark results.

For the future, please keep this general principle in mind: statistical summaries (averages, medians, etc.) should as a rule be computed for immediate output and rarely if ever stored or used as input for other processing. Instead, aim to store and transfer raw data from which statistics can be recomputed as necessary.
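To make that last principle concrete, here is a minimal sketch (not part of this commit) of how a downstream script might merge the raw `samples` arrays from several independent `--json` runs and recompute summary statistics on the fly. The log file names are hypothetical; the field names match the format shown above.

```
# Illustrative sketch only: aggregate raw samples from multiple JSON-lines
# logs and compute summaries at output time, never storing them back.
import json
import statistics

log_files = ["run1.log", "run2.log"]  # hypothetical per-run JSON-lines logs

samples_by_test = {}
for path in log_files:
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            samples_by_test.setdefault(record["name"], []).extend(record["samples"])

for name, samples in samples_by_test.items():
    # Any statistic can be recomputed from the raw samples as needed.
    print(name, min(samples), statistics.median(samples))
```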
1 parent 36c01e8 commit 97ec545

File tree

5 files changed: +821, -1058 lines changed

benchmark/scripts/Benchmark_Driver

Lines changed: 47 additions & 59 deletions
```
@@ -28,6 +28,7 @@ class `BenchmarkDoctor` analyzes performance tests, implements `check` COMMAND.
 import argparse
 import functools
 import glob
+import json
 import logging
 import math
 import os
@@ -88,9 +89,10 @@ class BenchmarkDriver(object):
     def test_harness(self):
         """Full path to test harness binary."""
         suffix = self.args.optimization if hasattr(self.args, "optimization") else "O"
+        suffix += "-"
         if hasattr(self.args, "architecture") and self.args.architecture:
-            suffix += "-" + self.args.architecture + "*"
-        pattern = os.path.join(self.args.tests, "Benchmark_" + suffix)
+            suffix += self.args.architecture
+        pattern = os.path.join(self.args.tests, "Benchmark_" + suffix + "*")
         executables = []
         if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode:
             executables = [pattern]
@@ -134,48 +136,39 @@ class BenchmarkDriver(object):
 
     @property
     def _cmd_list_benchmarks(self):
-        # Use tab delimiter for easier parsing to override the default comma.
-        # (The third 'column' is always comma-separated list of tags in square
-        # brackets -- currently unused here.)
-        return [self.test_harness, "--list", "--delim=\t"] + (
+        return [self.test_harness, "--list", "--json"] + (
            ["--skip-tags="] if (self.args.benchmarks or self.args.filters) else []
        )
 
     def _get_tests(self):
         """Return a list of performance tests to run."""
-        number_name_pairs = [
-            line.split("\t")[:2]
-            for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1]
-        ]
-        # unzip list of pairs into 2 lists
-        test_numbers, self.all_tests = map(list, zip(*number_name_pairs))
-        self.test_number = dict(zip(self.all_tests, test_numbers))
+        lines = self._invoke(self._cmd_list_benchmarks).split("\n")
+        json_tests = []
+        for l in lines:
+            if l.strip() != "":
+                json_tests.append(json.loads(l))
+        self.all_tests = [json["name"] for json in json_tests]
+        test_numbers = [json["number"] for json in json_tests]
+        self.test_number = dict([(json["name"], json["number"]) for json in json_tests])
         if self.args.filters:
             return self._tests_matching_patterns()
         if self.args.benchmarks:
             return self._tests_by_name_or_number(test_numbers)
         return self.all_tests
 
     def _tests_matching_patterns(self):
-        regexes = [re.compile(pattern) for pattern in self.args.filters]
-        return sorted(
-            list(
-                set(
-                    [
-                        name
-                        for pattern in regexes
-                        for name in self.all_tests
-                        if pattern.match(name)
-                    ]
-                )
-            )
-        )
+        regexes = map(re.compile, self.args.filters)
+        matches = set()
+        for pattern in regexes:
+            new_matches = filter(pattern.match, self.all_tests)
+            matches.union(new_matches)
+        return sorted(list(matches))
 
     def _tests_by_name_or_number(self, test_numbers):
         benchmarks = set(self.args.benchmarks)
         number_to_name = dict(zip(test_numbers, self.all_tests))
         tests_by_number = [
-            number_to_name[i] for i in benchmarks.intersection(set(test_numbers))
+            number_to_name[i] for i in benchmarks.intersection(test_numbers)
         ]
         return sorted(
             list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number))
@@ -188,16 +181,15 @@ class BenchmarkDriver(object):
         num_iters=None,
         sample_time=None,
         verbose=None,
-        measure_memory=False,
-        quantile=None,
+        measure_memory=False
     ):
         """Execute benchmark and gather results."""
         num_samples = num_samples or 0
         num_iters = num_iters or 0  # automatically determine N to run for 1s
         sample_time = sample_time or 0  # default is 1s
 
         cmd = self._cmd_run(
-            test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile
+            test, num_samples, num_iters, sample_time, verbose, measure_memory
         )
         output = self._invoke(cmd)
         results = self.parser.results_from_string(output)
@@ -210,8 +202,7 @@ class BenchmarkDriver(object):
         num_iters,
         sample_time,
         verbose,
-        measure_memory,
-        quantile,
+        measure_memory
     ):
         cmd = [self.test_harness]
         if test:
@@ -228,9 +219,7 @@ class BenchmarkDriver(object):
             cmd.append("--verbose")
         if measure_memory:
             cmd.append("--memory")
-        if quantile:
-            cmd.append("--quantile={0}".format(quantile))
-            cmd.append("--delta")
+        cmd.append("--json")
         return cmd
 
     def run_independent_samples(self, test):
@@ -246,12 +235,12 @@ class BenchmarkDriver(object):
         return functools.reduce(
             merge_results,
             [
-                self.run(test, measure_memory=True, num_iters=1, quantile=20)
+                self.run(test, measure_memory=True, num_iters=1)
                 for _ in range(self.args.independent_samples)
             ],
         )
 
-    def log_results(self, output, log_file=None):
+    def log_results(self, results, log_file=None):
         """Log output to `log_file`.
 
         Creates `args.output_dir` if it doesn't exist yet.
@@ -262,7 +251,8 @@ class BenchmarkDriver(object):
             os.makedirs(dir)
         print("Logging results to: %s" % log_file)
         with open(log_file, "w") as f:
-            f.write(output)
+            for r in results:
+                print(r, file=f)
 
     RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}"
 
@@ -284,25 +274,25 @@ class BenchmarkDriver(object):
         def console_log(values):
            print(format(values))
 
-        def result_values(r):
+        def summary(r):
            return list(
                map(
                    str,
                    [
                        r.test_num,
                        r.name,
                        r.num_samples,
-                        r.min,
-                        r.samples.q1,
+                        r.min_value,
+                        r.q1,
                        r.median,
-                        r.samples.q3,
-                        r.max,
+                        r.q3,
+                        r.max_value,
                        r.max_rss,
                    ],
                )
            )
 
-        header = [
+        summary_header = [
            "#",
            "TEST",
            "SAMPLES",
@@ -313,25 +303,23 @@ class BenchmarkDriver(object):
            "MAX(μs)",
            "MAX_RSS(B)",
        ]
-        console_log(header)
-        results = [header]
+        console_log(summary_header)
+        results = []
        for test in self.tests:
-            result = result_values(self.run_independent_samples(test))
-            console_log(result)
+            result = self.run_independent_samples(test)
+            console_log(summary(result))
            results.append(result)
 
        print("\nTotal performance tests executed: {0}".format(len(self.tests)))
-        return (
-            None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n")
-        )  # csv_log
+        return results
 
    @staticmethod
    def run_benchmarks(args):
        """Run benchmarks and log results."""
        driver = BenchmarkDriver(args)
-        csv_log = driver.run_and_log(csv_console=(args.output_dir is None))
-        if csv_log:
-            driver.log_results(csv_log)
+        results = driver.run_and_log(csv_console=(args.output_dir is None))
+        if args.output_dir:
+            driver.log_results([r.json for r in results])
        return 0
 
 
@@ -444,7 +432,6 @@ class BenchmarkDoctor(object):
        Optional `driver` parameter for injecting dependency; used for testing.
        """
        super(BenchmarkDoctor, self).__init__()
-        self.driver = driver or BenchmarkDriver(args)
        self.results = {}
 
        if hasattr(args, "markdown") and args.markdown:
@@ -457,6 +444,7 @@ class BenchmarkDoctor(object):
            self.console_handler.setLevel(
                logging.DEBUG if args.verbose else logging.INFO
            )
+        self.driver = driver or BenchmarkDriver(args)
        self.log.addHandler(self.console_handler)
        self.log.debug("Checking tests: %s", ", ".join(self.driver.tests))
        self.requirements = [
@@ -531,7 +519,7 @@ class BenchmarkDoctor(object):
            correction = setup / i
            i_series = BenchmarkDoctor._select(measurements, num_iters=i)
            for result in i_series:
-                runtimes.append(result.samples.min - correction)
+                runtimes.append(result.min_value - correction)
        runtime = min(runtimes)
 
        threshold = 1000
@@ -583,7 +571,7 @@ class BenchmarkDoctor(object):
        ti1, ti2 = [
            float(min(mins))
            for mins in [
-                [result.samples.min for result in i_series]
+                [result.min_value for result in i_series]
                for i_series in [select(measurements, num_iters=i) for i in [1, 2]]
            ]
        ]
@@ -678,7 +666,7 @@ class BenchmarkDoctor(object):
        r = self.driver.run(
            benchmark, num_samples=3, num_iters=1, verbose=True
        )  # calibrate
-        num_samples = self._adjusted_1s_samples(r.samples.min)
+        num_samples = self._adjusted_1s_samples(r.min_value)
 
        def capped(s):
            return min(s, 200)
@@ -688,7 +676,7 @@ class BenchmarkDoctor(object):
        opts = opts if isinstance(opts, list) else [opts]
        self.log.debug(
            "Runtime {0} μs yields {1} adjusted samples per second.".format(
-                r.samples.min, num_samples
+                r.min_value, num_samples
            )
        )
        self.log.debug(
```
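For context on the `--list --json` change above: judging from the fields the new `_get_tests` reads, the harness presumably emits one JSON object per benchmark with at least `number` and `name` keys (any other fields, and the exact binary path, are assumptions here, not part of this commit). A minimal standalone sketch of consuming that listing:

```
# Sketch of parsing the listing produced by `--list --json`, mirroring what
# `_get_tests` does in the driver. Binary path is hypothetical.
import json
import subprocess

output = subprocess.check_output(
    ["./Benchmark_O", "--list", "--json"], text=True
)

tests = [json.loads(line) for line in output.splitlines() if line.strip()]
test_number = {t["name"]: t["number"] for t in tests}
print(sorted(test_number)[:5])  # first few test names, alphabetically
```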
