Commit a8751cc

[benchmark] run_smoke_bench: more robust sampling
With the faster benchmarks in the Swift Benchmark Suite (now that the Legacy Factor refactoring is finished), we don't need to be so extremely frugal with the sample count. Gathering just the first 3 to 10 samples per benchmark was not very representative from a statistical point of view; I suspect it hid Type II errors (unreported changes). Adjust the measurement method to sample each benchmark for 50 ms and gather at least 10 samples. For the suspected changes, gather up to 10 independent samples. Also thoroughly measure the newly added tests in the re-runs.
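
For a sense of scale, here is a rough, illustrative sketch of the sample counts the new settings yield. It assumes, based on the commit message rather than on the driver's documentation, that sample_time is the total per-benchmark sampling budget and that num_iters=1 makes each sample a single iteration:

def estimated_samples(iteration_time_s, sample_time_s=0.05, min_samples=10):
    # Fill the 50 ms sampling budget, but never gather fewer than 10 samples.
    return max(min_samples, int(sample_time_s / iteration_time_s))

for t in (0.0001, 0.001, 0.01):  # 100 us, 1 ms and 10 ms per iteration
    print('{:>6} us/iteration -> ~{} samples'.format(
        int(t * 1e6), estimated_samples(t)))

Under the old settings (min(i + 3, 10) samples at sample_time=0.0025), the same benchmarks got at most 10 very short samples on the first pass.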
1 parent a06cb2c commit a8751cc

File tree

1 file changed: +11 -20 lines


benchmark/scripts/run_smoke_bench

Lines changed: 11 additions & 20 deletions
@@ -96,7 +96,7 @@ def main():
     argparser.add_argument(
         '-num-reruns', type=int,
         help="The number of re-runs until it's assumed to be a real change",
-        default=8)
+        default=10)
     argparser.add_argument(
         '-platform', type=str,
         help='The benchmark build platform', default='macosx')
@@ -153,18 +153,14 @@ def test_opt_levels(args):
 
 
 def measure(driver, tests, i):
-    """Log and measure samples of the tests with the given driver.
-
-    Collect increasing number of samples, depending on the iteration.
-    """
-    num_samples = min(i + 3, 10)
-    msg = ' Iteration {0} for {1}: num samples = {2}, '.format(
-        i, driver.args.tests, num_samples)
+    """Log and measure samples of the tests with the given driver."""
+    msg = ' Iteration {0} for {1}:, '.format(i, driver.args.tests)
     msg += ('running all tests' if driver.all_tests == tests else
             're-testing {0} tests'.format(len(tests)))
     log(msg)
     driver.tests = tests
-    return driver.run(num_samples=num_samples, sample_time=0.0025)
+    return driver.run(
+        num_iters=1, min_samples=10, sample_time=0.05, quantile=20)
 
 
 def merge(results, other_results):
@@ -178,31 +174,26 @@ def test_performance(opt_level, old_dir, new_dir, threshold, num_samples,
                      num_reruns, output_file):
     """Detect performance changes in benchmarks.
 
-    Start fast with few samples per benchmark and gradually spend more time
-    gathering more precise measurements of the change candidates.
+    Gather more independent measurements of the change candidates.
     """
 
-    i, unchanged_length_count = 0, 0
+    i, run_count = 0, 0
     old, new = [BenchmarkDriver(DriverArgs(dir, optimization=opt_level))
                 for dir in [old_dir, new_dir]]
     results = [measure(driver, driver.tests, i) for driver in [old, new]]
     tests = TestComparator(results[0], results[1], threshold)
-    changed = tests.decreased + tests.increased
+    changed = tests.decreased + tests.increased + tests.added
 
-    while len(changed) > 0 and unchanged_length_count < num_reruns:
+    while len(changed) > 0 and run_count < num_reruns:
         i += 1
         if VERBOSE:
             log(' test again: ' + str([test.name for test in changed]))
         results = [merge(the_results,
                          measure(driver, [test.name for test in changed], i))
                    for the_results, driver in zip(results, [old, new])]
         tests = TestComparator(results[0], results[1], threshold)
-        changed = tests.decreased + tests.increased
-
-        if len(old.tests) == len(changed):
-            unchanged_length_count += 1
-        else:
-            unchanged_length_count = 0
+        changed = tests.decreased + tests.increased + tests.added
+        run_count += 1
 
     log('')
     return report_results("Performance: -" + opt_level, None, None,
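
In short, the loop in test_performance now bounds the work with a simple run counter instead of waiting for the set of changes to stop shrinking. A self-contained toy sketch of that stopping rule follows; the still_changed callable is a random stand-in for the real measurement and TestComparator comparison:

import random

def rerun_suspects(suspects, still_changed, num_reruns=10):
    # Keep re-testing the flagged benchmarks until none remain changed,
    # or the re-run budget is exhausted; return whatever is still flagged.
    changed, run_count = list(suspects), 0
    while changed and run_count < num_reruns:
        changed = [name for name in changed if still_changed(name)]
        run_count += 1
    return changed

# Example benchmark names; each re-run re-flags a benchmark with 30% probability.
random.seed(0)
print(rerun_suspects(['Ackermann', 'ArrayAppend'],
                     lambda name: random.random() < 0.3))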
