Skip to content

Commit 7c5cead

Browse files
committed
[benchmark] Compare with stable location estimate
Use the most stable location estimate (MIN, P05, P10, Q1, MED) for `ResultComparison`. Stability is judged by the standard deviation of each quantile across the independent runs plus the aggregate sample (mixing in the aggregate hardens the selection against outlier runs). For each quantile, we multiply the standard deviations from the OLD and NEW results to get a “cross-sample” variance, then pick the quantile with the lowest value as the most stable and use it for the comparison. When either result lacks multiple independent runs, the comparison falls back to MIN.
1 parent bf06df6 commit 7c5cead

File tree

2 files changed

+136
-5
lines changed

2 files changed

+136
-5
lines changed

benchmark/scripts/compare_perf_tests.py

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -338,18 +338,58 @@ def __init__(self, old_result, new_result):
338338
assert old_result.name == new_result.name
339339
self.name = old_result.name # Test name, convenience accessor
340340

341-
# Location estimates + "epsilon" to prevent division by 0
342-
old = old_result.min + 0.001
343-
new = new_result.min + 0.001
341+
self.find_most_stable_location_estimates()
342+
343+
# To avoid division by 0 in ratios, adjust the values by "epsilon"
344+
old = self.old_location + 0.001
345+
new = self.new_location + 0.001
344346

345347
self.ratio = old / new # Speedup ratio
346348
self.delta = ((new / old) - 1) * 100 # Test runtime improvement in %
347349

348350
# Indication of dubious changes: when results' ranges overlap
349351
o_min, o_max, n_min, n_max = \
350-
self.old.min, self.old.max, self.new.min, self.new.max
352+
old_result.min, old_result.max, new_result.min, new_result.max
351353
self.is_dubious = (o_min <= n_max and n_min <= o_max)
352354

355+
def find_most_stable_location_estimates(self):
356+
def independent_runs(result):
357+
return (result.independent_runs
358+
if (hasattr(result, 'independent_runs') and
359+
len(result.independent_runs) > 1) else None)
360+
361+
old_runs = independent_runs(self.old)
362+
new_runs = independent_runs(self.new)
363+
364+
if (old_runs is None or new_runs is None):
365+
self.location = 'MIN'
366+
self.old_location = self.old.min
367+
self.new_location = self.new.min
368+
return
369+
370+
locations = ['MIN', 'P05', 'P10', 'Q1', 'MED']
371+
quantiles = [0.0, 0.05, 0.1, 0.25, 0.5]
372+
373+
# Mix in the aggregated samples to bias selection against outlier runs
374+
old_runs += [self.old.samples]
375+
new_runs += [self.new.samples]
376+
377+
def q_sd(runs):
378+
"""Compute standard deviation for given quantile across runs."""
379+
# Offset each σ by 1 so the product still favors the smaller σ when one
380+
# of the factors is below 1 or is 0 (1 is the multiplicative identity).
381+
return [PerformanceTestSamples('', values).sd + 1
382+
for values in [[run.quantile(q) for run in runs]
383+
for q in quantiles]]
384+
385+
self.location, most_stable_location, _, _ = \
386+
min(zip(locations, quantiles, q_sd(old_runs), q_sd(new_runs)),
387+
# Compute "cross-sample" variance: σ₁×σ₂
388+
key=lambda t: t[2] * t[3])
389+
390+
self.old_location = self.old.samples.quantile(most_stable_location)
391+
self.new_location = self.new.samples.quantile(most_stable_location)
392+
353393

354394
class LogParser(object):
355395
"""Converts log outputs into `PerformanceTestResult`s.
@@ -583,7 +623,7 @@ def values(result, dubious_formatter=lambda r: ' (?)'):
583623
if isinstance(result, PerformanceTestResult) else
584624
# isinstance(result, ResultComparison)
585625
(result.name,
586-
str(result.old.min), str(result.new.min),
626+
str(result.old_location), str(result.new_location),
587627
'{0:+.1f}%'.format(result.delta),
588628
'{0:.2f}x{1}'.format(
589629
result.ratio,

benchmark/scripts/test_compare_perf_tests.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,9 @@ def test_init(self):
463463
self.assertAlmostEqual(rc.ratio, 12325.0 / 11616.0)
464464
self.assertAlmostEqual(rc.delta, (((11616.0 / 12325.0) - 1) * 100),
465465
places=3)
466+
self.assertEqual(rc.location, 'MIN')
467+
self.assertEqual([rc.old_location, rc.new_location],
468+
[rc.old.min, rc.new.min])
466469
# handle test results that sometimes change to zero, when compiler
467470
# optimizes out the body of the incorrectly written test
468471
rc = ResultComparison(self.r0, self.r0)
@@ -489,6 +492,94 @@ def test_values_is_dubious(self):
489492
# other way around: old.min < new.min < old.max
490493
self.assertTrue(ResultComparison(self.r2, self.r1).is_dubious)
491494

495+
def test_use_most_stable_location_for_comparison(self):
496+
"""
497+
Select the most stable location estimate (MIN, P05, P10, Q1, MED).
498+
499+
When the results contain samples from multiple independent runs, use
500+
the empirical distribution to select the location estimate with lowest
501+
variance and use it in result comparison.
502+
"""
503+
def compare(log):
504+
results = [
505+
PerformanceTestResult(l.split(','), quantiles=True, delta=True)
506+
for l in log.split('\n')[1:-1]]
507+
results[0].merge(results[1])
508+
results[2].merge(results[3])
509+
return ResultComparison(results[0], results[2])
510+
511+
# --quantile=20 --delta FlattenListFlatMap
512+
rc = compare("""
513+
0,F,21,3721,147,1689,83,44,138,95,6,3,19,19,26,23,45,64,88,84,34,56,233,1
514+
0,F,21,5284,16,560,23,25,15,29,13,6,4,19,16,15,26,32,31,72,23,26,104,1
515+
0,F,21,5318,82,507,25,13,23,12,4,9,25,23,27,14,27,13,6,8,38,89,265,1
516+
0,F,21,4701,12,1151,28,13,32,5,11,26,9,33,16,28,14,13,46,185,126,16,64,1
517+
""")
518+
self.assertTrue(rc.is_dubious)
519+
self.assertEqual([rc.old.min, rc.new.min], [3721, 4701])
520+
self.assertEqual([rc.old.median, rc.new.median], [5975, 5988])
521+
self.assertEqual(rc.location, 'MED')
522+
self.assertEqual([rc.old_location, rc.new_location], [5975, 5988])
523+
# self.assertAlmostEqual(rc.delta, 26.34, places=2) # delta from MIN
524+
self.assertAlmostEqual(rc.delta, 0.22, places=2) # delta from MED
525+
526+
# --quantile=20 --delta ObjectiveCBridgeStubToNSDateRef O
527+
rc = compare("""
528+
0,O,21,128,8,1,,4,3,3,2,,,3,,1,,,,3,,1,2,1
529+
0,O,21,119,16,2,,3,3,1,5,,,,1,2,1,,2,1,1,,2,1
530+
0,O,21,125,7,5,,1,5,1,3,2,,,,2,1,1,1,2,1,1,2,1
531+
0,O,21,119,17,,1,,2,5,4,1,,,,1,1,,1,,,3,3,1
532+
""")
533+
self.assertTrue(rc.is_dubious)
534+
self.assertEqual([rc.old.min, rc.new.min], [119, 119])
535+
self.assertEqual([rc.old.samples.quantile(0.1),
536+
rc.new.samples.quantile(0.1)], [137, 136])
537+
self.assertEqual(rc.location, 'P10')
538+
self.assertEqual([rc.old_location, rc.new_location], [137, 136])
539+
self.assertAlmostEqual(rc.delta, -0.73, places=2)
540+
541+
# --quantile=20 --delta DictionaryBridgeToObjC_Bridge -Onone
542+
rc = compare("""
543+
0,D,21,15,,,,,,,,,,,,,1,,,,,,,
544+
0,D,21,15,,,,,,,,,,,,,,,,,,,,
545+
0,D,21,14,1,,,,,,,,,,,,,,,,,,,
546+
0,D,21,15,,,,,,,,,,1,,,,,,,1,,,
547+
""")
548+
self.assertTrue(rc.is_dubious)
549+
self.assertEqual([rc.old.min, rc.new.min], [15, 14])
550+
self.assertEqual([rc.old.samples.quantile(0.05),
551+
rc.new.samples.quantile(0.05)], [15, 15])
552+
# self.assertEqual(rc.location, 'P10')
553+
self.assertEqual(rc.location, 'P05')
554+
self.assertEqual([rc.old_location, rc.new_location], [15, 15])
555+
556+
def test_stable_location_vs_outlier_runs(self):
557+
"Location estimate should be robust in presence of outlier runs."
558+
def synth(min):
559+
r = ('0,S,21,' + str(min) + ',1,1,,,1,,,,,1,,,,,2,,,,,').split(',')
560+
return PerformanceTestResult(r, quantiles=True, delta=True)
561+
562+
s, t, u = synth(100), synth(100), synth(100)
563+
self.assertEqual(
564+
[s.min, s.samples.quantile(0.05), s.samples.quantile(0.1),
565+
s.samples.q1, s.median, s.samples.q3],
566+
[100, 101, 102, 103, 104, 106])
567+
568+
[s.merge(synth(100)) for i in range(1, 11)]
569+
[t.merge(synth(100)) for i in range(1, 10)]
570+
[u.merge(synth(100)) for i in range(1, 9)]
571+
t.merge(synth(94)) # one outlier run
572+
u.merge(synth(94)) # two outlier runs
573+
u.merge(synth(94))
574+
575+
rst = ResultComparison(s, t)
576+
self.assertEqual(rst.location, 'Q1')
577+
self.assertEqual([rst.old_location, rst.new_location], [103, 102])
578+
579+
rsu = ResultComparison(s, u)
580+
self.assertEqual(rsu.location, 'MED')
581+
self.assertEqual([rsu.old_location, rsu.new_location], [104, 103])
582+
492583

493584
class FileSystemIntegration(unittest.TestCase):
494585
def setUp(self):

0 commit comments

Comments
 (0)