Skip to content

Commit 10f499a

Browse files
committed
[benchmarks] add explicit benchmark groups
This patch adds back the previously removed bar charts, with one critical difference: results are no longer normalized to a 100% baseline. Instead, benchmarks must be explicitly grouped for the comparison bar charts to be generated. Groups should be chosen so that all members share the same unit and have roughly similar values.
1 parent e37f75f commit 10f499a

File tree

5 files changed

+203
-20
lines changed

5 files changed

+203
-20
lines changed

scripts/benchmarks/benches/compute.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ def extra_env_vars(self) -> dict:
104104
def setup(self):
105105
self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name)
106106

107+
def explicit_group(self):
108+
return ""
109+
107110
def run(self, env_vars) -> list[Result]:
108111
command = [
109112
f"{self.benchmark_bin}",
@@ -120,7 +123,8 @@ def run(self, env_vars) -> list[Result]:
120123
ret = []
121124
for label, median, stddev, unit in parsed_results:
122125
extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
123-
ret.append(Result(label=self.name() + extra_label, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
126+
explicit_group = self.explicit_group() + extra_label if self.explicit_group() != "" else ""
127+
ret.append(Result(label=self.name() + extra_label, explicit_group=explicit_group, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit)))
124128
return ret
125129

126130
def parse_output(self, output):
@@ -158,6 +162,9 @@ def name(self):
158162
order = "in order" if self.ioq else "out of order"
159163
return f"api_overhead_benchmark_sycl SubmitKernel {order}"
160164

165+
def explicit_group(self):
166+
return "SubmitKernel"
167+
161168
def bin_args(self) -> list[str]:
162169
return [
163170
f"--Ioq={self.ioq}",
@@ -178,6 +185,9 @@ def name(self):
178185
order = "in order" if self.ioq else "out of order"
179186
return f"api_overhead_benchmark_ur SubmitKernel {order}"
180187

188+
def explicit_group(self):
189+
return "SubmitKernel"
190+
181191
def bin_args(self) -> list[str]:
182192
return [
183193
f"--Ioq={self.ioq}",
@@ -198,6 +208,9 @@ def name(self):
198208
order = "in order" if self.ioq else "out of order"
199209
return f"api_overhead_benchmark_l0 SubmitKernel {order}"
200210

211+
def explicit_group(self):
212+
return "SubmitKernel"
213+
201214
def bin_args(self) -> list[str]:
202215
return [
203216
f"--Ioq={self.ioq}",

scripts/benchmarks/benches/result.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class Result:
1818
stdout: str
1919
passed: bool = True
2020
unit: str = ""
21+
explicit_group: str = ""
2122
# stddev can be optionally set by the benchmark,
2223
# if not set, it will be calculated automatically.
2324
stddev: float = 0.0

scripts/benchmarks/benches/test.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,31 @@ def setup(self):
2020

2121
def benchmarks(self) -> list[Benchmark]:
2222
bench_configs = [
23-
("Memory Bandwidth", 2000, 200),
24-
("Latency", 100, 20),
25-
("Throughput", 1500, 150),
26-
("FLOPS", 3000, 300),
27-
("Cache Miss Rate", 250, 25),
23+
("Memory Bandwidth", 2000, 200, "Foo Group"),
24+
("Latency", 100, 20, "Bar Group"),
25+
("Throughput", 1500, 150, "Foo Group"),
26+
("FLOPS", 3000, 300, "Foo Group"),
27+
("Cache Miss Rate", 250, 25, "Bar Group"),
2828
]
2929

3030
result = []
31-
for base_name, base_value, base_diff in bench_configs:
31+
for base_name, base_value, base_diff, group in bench_configs:
3232
for variant in range(6):
3333
value_multiplier = 1.0 + (variant * 0.2)
3434
name = f"{base_name} {variant+1}"
3535
value = base_value * value_multiplier
3636
diff = base_diff * value_multiplier
3737

38-
result.append(TestBench(name, value, diff))
38+
result.append(TestBench(name, value, diff, group))
3939

4040
return result
4141

4242
class TestBench(Benchmark):
43-
def __init__(self, name, value, diff):
43+
def __init__(self, name, value, diff, group = ''):
4444
self.bname = name
4545
self.value = value
4646
self.diff = diff
47+
self.group = group
4748
super().__init__("")
4849

4950
def name(self):
@@ -58,7 +59,7 @@ def setup(self):
5859
def run(self, env_vars) -> list[Result]:
5960
random_value = self.value + random.uniform(-1 * (self.diff), self.diff)
6061
return [
61-
Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms")
62+
Result(label=self.name(), explicit_group=self.group, value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms")
6263
]
6364

6465
def teardown(self):

scripts/benchmarks/main.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,9 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
183183
# should this be configurable?
184184
history.load(1000)
185185

186+
# remove duplicates. this can happen if e.g., --compare baseline is specified manually.
187+
compare_names = list(dict.fromkeys(compare_names))
188+
186189
for name in compare_names:
187190
compare_result = history.get_compare(name)
188191
if compare_result:
@@ -203,7 +206,8 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
203206
# Otherwise we might be comparing the results to themselves.
204207
if not options.dry_run:
205208
history.save(saved_name, results, save_name is not None)
206-
compare_names.append(saved_name)
209+
if saved_name not in compare_names:
210+
compare_names.append(saved_name)
207211

208212
if options.output_html:
209213
html_content = generate_html(history.runs, 'oneapi-src/unified-runtime', compare_names)

scripts/benchmarks/output_html.py

Lines changed: 173 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from dataclasses import dataclass
1111
import matplotlib.dates as mdates
1212
from benches.result import BenchmarkRun, Result
13+
import numpy as np
1314

1415
@dataclass
1516
class BenchmarkMetadata:
@@ -23,11 +24,14 @@ class BenchmarkSeries:
2324
runs: list[BenchmarkRun]
2425

2526
@dataclass
26-
class BenchmarkTimeSeries:
27+
class BenchmarkChart:
2728
label: str
2829
html: str
2930

30-
def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> list[BenchmarkTimeSeries]:
31+
def tooltip_css() -> str:
32+
return '.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}'
33+
34+
def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> list[BenchmarkChart]:
3135
plt.close('all')
3236

3337
num_benchmarks = len(benchmarks)
@@ -66,7 +70,7 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str
6670
for point in sorted_points]
6771

6872
tooltip = mpld3.plugins.PointHTMLTooltip(scatter, tooltip_labels,
69-
css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}',
73+
css=tooltip_css(),
7074
targets=targets)
7175
mpld3.plugins.connect(fig, tooltip)
7276

@@ -94,7 +98,104 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str
9498
ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S'))
9599

96100
plt.tight_layout()
97-
html_charts.append(BenchmarkTimeSeries(html=mpld3.fig_to_html(fig), label=benchmark.label))
101+
html_charts.append(BenchmarkChart(html=mpld3.fig_to_html(fig), label=benchmark.label))
102+
plt.close(fig)
103+
104+
return html_charts
105+
106+
@dataclass
107+
class ExplicitGroup:
108+
name: str
109+
nnames: int
110+
metadata: BenchmarkMetadata
111+
runs: dict[str, dict[str, Result]]
112+
113+
def create_explicit_groups(benchmark_runs: list[BenchmarkRun], compare_names: list[str]) -> list[ExplicitGroup]:
114+
groups = {}
115+
116+
for run in benchmark_runs:
117+
if run.name in compare_names:
118+
for res in run.results:
119+
if res.explicit_group != '':
120+
if res.explicit_group not in groups:
121+
groups[res.explicit_group] = ExplicitGroup(name=res.explicit_group, nnames=len(compare_names),
122+
metadata=BenchmarkMetadata(unit=res.unit, lower_is_better=res.lower_is_better),
123+
runs={})
124+
125+
group = groups[res.explicit_group]
126+
if res.label not in group.runs:
127+
group.runs[res.label] = {name: None for name in compare_names}
128+
129+
if group.runs[res.label][run.name] is None:
130+
group.runs[res.label][run.name] = res
131+
132+
return list(groups.values())
133+
134+
def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChart]:
135+
plt.close('all')
136+
137+
html_charts = []
138+
139+
for group in groups:
140+
fig, ax = plt.subplots(figsize=(10, 6))
141+
142+
x = np.arange(group.nnames)
143+
x_labels = []
144+
width = 0.8 / len(group.runs)
145+
146+
max_height = 0
147+
148+
for i, (run_name, run_results) in enumerate(group.runs.items()):
149+
offset = width * i
150+
151+
positions = x + offset
152+
x_labels = run_results.keys()
153+
valid_data = [r.value if r is not None else 0 for r in run_results.values()]
154+
rects = ax.bar(positions, valid_data, width, label=run_name)
155+
# This is a hack to disable all bar_label. Setting labels to empty doesn't work.
156+
# We create our own labels below for each bar, this works better in mpld3.
157+
ax.bar_label(rects, fmt='')
158+
159+
for rect, run, res in zip(rects, run_results.keys(), run_results.values()):
160+
height = rect.get_height()
161+
if height > max_height:
162+
max_height = height
163+
164+
ax.text(rect.get_x() + rect.get_width()/2., height + 2,
165+
f'{res.value:.1f}',
166+
ha='center', va='bottom', fontsize=9)
167+
168+
tooltip_labels = [
169+
f"Run: {run}\n"
170+
f"Label: {res.label}\n"
171+
f"Value: {res.value:.2f} {res.unit}\n"
172+
]
173+
tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css=tooltip_css())
174+
mpld3.plugins.connect(ax.figure, tooltip)
175+
176+
ax.set_xticks([])
177+
ax.grid(True, axis='y', alpha=0.2)
178+
ax.set_ylabel(f"Value ({group.metadata.unit})")
179+
ax.legend(loc='upper left')
180+
ax.set_title(group.name, pad=20)
181+
performance_indicator = "lower is better" if group.metadata.lower_is_better else "higher is better"
182+
ax.text(0.5, 1.03, f"({performance_indicator})",
183+
ha='center',
184+
transform=ax.transAxes,
185+
style='italic',
186+
fontsize=7,
187+
color='#666666')
188+
189+
for idx, label in enumerate(x_labels):
190+
# this is a hack to get labels to show above the legend
191+
# we normalize the idx to transAxes transform and offset it a little.
192+
x_norm = (idx + 0.3 - ax.get_xlim()[0]) / (ax.get_xlim()[1] - ax.get_xlim()[0])
193+
ax.text(x_norm, 1.00, label,
194+
transform=ax.transAxes,
195+
color='#666666')
196+
197+
plt.tight_layout()
198+
html_charts.append(BenchmarkChart(label=group.name, html=mpld3.fig_to_html(fig)))
98199
plt.close(fig)
99200

100201
return html_charts
@@ -138,6 +239,11 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_
138239
timeseries = create_time_series_chart(benchmarks, github_repo)
139240
timeseries_charts_html = '\n'.join(f'<div class="chart" data-label="{ts.label}"><div>{ts.html}</div></div>' for ts in timeseries)
140241

242+
explicit_groups = create_explicit_groups(benchmark_runs, compare_names)
243+
244+
bar_charts = create_grouped_bar_charts(explicit_groups)
245+
bar_charts_html = '\n'.join(f'<div class="chart" data-label="{bc.label}"><div>{bc.html}</div></div>' for bc in bar_charts)
246+
141247
html_template = f"""
142248
<!DOCTYPE html>
143249
<html>
@@ -199,21 +305,72 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_
199305
width: 400px;
200306
max-width: 100%;
201307
}}
308+
details {{
309+
margin-bottom: 24px;
310+
}}
311+
summary {{
312+
font-size: 18px;
313+
font-weight: 500;
314+
cursor: pointer;
315+
padding: 12px;
316+
background: #e9ecef;
317+
border-radius: 8px;
318+
user-select: none;
319+
}}
320+
summary:hover {{
321+
background: #dee2e6;
322+
}}
202323
</style>
203324
<script>
325+
function getQueryParam(param) {{
326+
const urlParams = new URLSearchParams(window.location.search);
327+
return urlParams.get(param);
328+
}}
329+
204330
function filterCharts() {{
205331
const regexInput = document.getElementById('bench-filter').value;
206332
const regex = new RegExp(regexInput, 'i');
207333
const charts = document.querySelectorAll('.chart');
334+
let timeseriesVisible = false;
335+
let barChartsVisible = false;
336+
208337
charts.forEach(chart => {{
209338
const label = chart.getAttribute('data-label');
210339
if (regex.test(label)) {{
211340
chart.style.display = '';
341+
if (chart.closest('.timeseries')) {{
342+
timeseriesVisible = true;
343+
}} else if (chart.closest('.bar-charts')) {{
344+
barChartsVisible = true;
345+
}}
212346
}} else {{
213347
chart.style.display = 'none';
214348
}}
215349
}});
350+
351+
updateURL(regexInput);
352+
353+
document.querySelector('.timeseries').open = timeseriesVisible;
354+
document.querySelector('.bar-charts').open = barChartsVisible;
216355
}}
356+
357+
function updateURL(regex) {{
358+
const url = new URL(window.location);
359+
if (regex) {{
360+
url.searchParams.set('regex', regex);
361+
}} else {{
362+
url.searchParams.delete('regex');
363+
}}
364+
history.replaceState(null, '', url);
365+
}}
366+
367+
document.addEventListener('DOMContentLoaded', (event) => {{
368+
const regexParam = getQueryParam('regex');
369+
if (regexParam) {{
370+
document.getElementById('bench-filter').value = regexParam;
371+
filterCharts();
372+
}}
373+
}});
217374
</script>
218375
</head>
219376
<body>
@@ -222,13 +379,20 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_
222379
<div class="filter-container">
223380
<input type="text" id="bench-filter" placeholder="Regex..." oninput="filterCharts()">
224381
</div>
225-
<h2>Historical Results</h2>
226-
<div class="charts">
227-
{timeseries_charts_html}
228-
</div>
382+
<details class="timeseries">
383+
<summary>Historical Results</summary>
384+
<div class="charts">
385+
{timeseries_charts_html}
386+
</div>
387+
</details>
388+
<details class="bar-charts">
389+
<summary>Comparisons</summary>
390+
<div class="charts">
391+
{bar_charts_html}
392+
</div>
393+
</details>
229394
</div>
230395
</body>
231396
</html>
232397
"""
233-
234398
return html_template

0 commit comments

Comments
 (0)