
Commit 3496c02

[benchmarks] add stddev calculation and outlier elimination

We now rerun each benchmark until its results stabilize below a stddev threshold, or until the maximum number of iterations is reached. Outlier results are eliminated each time stddev is calculated; this helps the stddev stabilize more quickly and minimizes the number of repeat runs.
1 parent 2b5fb03 commit 3496c02
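
For illustration only (the numbers below are hypothetical, not from the commit): with the default stddev_threshold of 0.02, the acceptance check added in main.py compares the sample stddev against 2% of the mean, so a benchmark averaging around 100 ms is accepted once its stddev drops to roughly 2 ms or less; otherwise all iterations are rerun, up to the retry limit.

# Hypothetical sketch of the acceptance check, mirroring the logic added below.
import statistics

values = [101.2, 99.8, 100.5, 100.1, 99.9]  # made-up samples, in ms
mean_value = statistics.mean(values)        # ~100.3 ms
stddev = statistics.stdev(values)           # ~0.57 ms
threshold = 0.02 * mean_value               # 2% of the mean, ~2.01 ms
accepted = stddev <= threshold              # True here, so no rerun is needed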

File tree: 5 files changed, +104 −39 lines


scripts/benchmarks/benches/base.py

Lines changed: 0 additions & 3 deletions
@@ -71,9 +71,6 @@ def run(self, env_vars) -> list[Result]:
     def teardown(self):
         raise NotImplementedError()

-    def ignore_iterations(self):
-        return False
-
 class Suite:
     def benchmarks(self) -> list[Benchmark]:
         raise NotImplementedError()

scripts/benchmarks/benches/llamacpp.py

Lines changed: 0 additions & 3 deletions
@@ -76,9 +76,6 @@ def name(self):
     def lower_is_better(self):
         return False

-    def ignore_iterations(self):
-        return True
-
     def run(self, env_vars) -> list[Result]:
         command = [
             f"{self.benchmark_bin}",

scripts/benchmarks/benches/options.py

Lines changed: 4 additions & 1 deletion
@@ -15,13 +15,16 @@ class Options:
     rebuild: bool = True
     benchmark_cwd: str = "INVALID"
     timeout: float = 600
-    iterations: int = 5
+    iterations: int = 3
     verbose: bool = False
     compare: Compare = Compare.LATEST
     compare_max: int = 10 # average/median over how many results
     output_html: bool = False
     output_markdown: bool = True
     dry_run: bool = False
+    # these two should probably be merged into one setting
+    stddev_threshold: float = 0.02
+    epsilon: float = 0.02

 options = Options()
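
Both new fields are fractions of the measured mean: stddev_threshold gates whether a set of iterations is stable enough to accept, while epsilon is the relative change treated as significant when comparing against saved results. A minimal sketch of overriding them programmatically (illustrative only; the CLI flags added in main.py below are the usual way to set them):

# Illustrative override of the new defaults; the --stddev-threshold and
# --epsilon flags added in main.py provide the same control from the CLI.
options.stddev_threshold = 0.05  # accept runs whose stddev is within 5% of the mean
options.epsilon = 0.01           # treat changes larger than 1% as significant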

scripts/benchmarks/benches/result.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ class Result:
     lower_is_better: bool = True
     git_hash: str = ''
     date: Optional[datetime] = None
+    stddev: float = 0.0

 @dataclass_json
 @dataclass

scripts/benchmarks/main.py

Lines changed: 99 additions & 32 deletions
@@ -18,10 +18,97 @@

 import argparse
 import re
+import statistics

 # Update this if you are changing the layout of the results files
 INTERNAL_WORKDIR_VERSION = '2.0'

+def run_iterations(benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]]):
+    for iter in range(iters):
+        print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
+        bench_results = benchmark.run(env_vars)
+        if bench_results is None:
+            print(f"did not finish (OK for sycl-bench).")
+            break
+
+        for bench_result in bench_results:
+            # TODO: report failures in markdown/html ?
+            if not bench_result.passed:
+                print(f"complete ({bench_result.label}: verification FAILED)")
+                continue
+
+            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
+
+            bench_result.name = bench_result.label
+            bench_result.lower_is_better = benchmark.lower_is_better()
+
+            if bench_result.label not in results:
+                results[bench_result.label] = []
+
+            results[bench_result.label].append(bench_result)
+
+# https://www.statology.org/modified-z-score/
+def modified_z_score(values: list[float]) -> list[float]:
+    median = statistics.median(values)
+    mad = statistics.median([abs(v - median) for v in values])
+    if mad == 0:
+        return [0] * len(values)
+    return [(0.6745 * (v - median)) / mad for v in values]
+
+def remove_outliers(results: dict[str, list[Result]], threshold: float = 3.5) -> dict[str, list[Result]]:
+    new_results = {}
+    for key, rlist in results.items():
+        # don't eliminate outliers on first pass
+        if len(rlist) <= options.iterations:
+            new_results[key] = rlist
+            continue
+
+        values = [r.value for r in rlist]
+        z_scores = modified_z_score(values)
+        filtered_rlist = [r for r, z in zip(rlist, z_scores) if abs(z) <= threshold]
+
+        if not filtered_rlist:
+            new_results[key] = rlist
+        else:
+            new_results[key] = filtered_rlist
+
+    return new_results
+
+def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result]]:
+    processed: list[Result] = []
+    # technically, we can detect whether result is below or above threshold per
+    # individual result. However, we can't repeat benchmark runs with that
+    # granularity. So we just reject all results and try again.
+    valid_results = True # above stddev threshold
+
+    for label, rlist in remove_outliers(results).items():
+        if (len(rlist) == 0):
+            continue
+
+        if len(rlist) == 1:
+            processed.append(rlist[0])
+            continue
+
+        values = [r.value for r in rlist]
+
+        mean_value = statistics.mean(values)
+        stddev = statistics.stdev(values)
+
+        threshold = options.stddev_threshold * mean_value
+
+        if stddev > threshold:
+            print(f"stddev {stddev} above the threshold {threshold} for {label}")
+            valid_results = False
+
+        rlist.sort(key=lambda res: res.value)
+        median_index = len(rlist) // 2
+        median_result = rlist[median_index]
+        median_result.stddev = stddev
+
+        processed.append(median_result)
+
+    return valid_results, processed
+
 def main(directory, additional_env_vars, save_name, compare_names, filter):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

@@ -65,36 +152,15 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
     for benchmark in benchmarks:
         try:
             merged_env_vars = {**additional_env_vars}
-            iteration_results = []
-            iterations = options.iterations if not benchmark.ignore_iterations() else 1
-            for iter in range(iterations):
-                print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
-                bench_results = benchmark.run(merged_env_vars)
-                if bench_results is not None:
-                    for bench_result in bench_results:
-                        if bench_result.passed:
-                            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
-                        else:
-                            print(f"complete ({bench_result.label}: verification FAILED)")
-                        iteration_results.append(bench_result)
-                else:
-                    print(f"did not finish (OK for sycl-bench).")
+            intermediate_results: dict[str, list[Result]] = {}
+            processed: list[Result] = []
+            for _ in range(5):
+                run_iterations(benchmark, merged_env_vars, options.iterations, intermediate_results)
+                valid, processed = process_results(intermediate_results)
+                if valid:
                     break
+            results += processed

-            if len(iteration_results) == 0:
-                continue
-
-            for label in set([result.label for result in iteration_results]):
-                label_results = [result for result in iteration_results if result.label == label and result.passed == True]
-                if len(label_results) > 0:
-                    label_results.sort(key=lambda res: res.value)
-                    median_index = len(label_results) // 2
-                    median_result = label_results[median_index]
-
-                    median_result.name = label
-                    median_result.lower_is_better = benchmark.lower_is_better()
-
-                    results.append(median_result)
         except Exception as e:
             if options.exit_on_failure:
                 raise e
@@ -164,14 +230,15 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
     parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
-    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5)
-    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600)
+    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
+    parser.add_argument("--stddev-threshold", type=float, help='If stddev % is above this threshold, rerun all iterations', default=options.stddev_threshold)
+    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
     parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None)
-    parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005)
+    parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=options.epsilon)
     parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true")
     parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
     parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
-    parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=10)
+    parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
     parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
     parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
     parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
