Commit 497dcce

[benchmarks] improve HTML and Markdown output
This patch improves numerous aspects of how the benchmarking results are visualized:
- Rewrites the way HTML charts are generated, using a library (Chart.js) that is both easier to use and more visually pleasing. The new HTML page also decouples the data from the HTML itself, leading to faster load times and the ability to fetch data from remote sources (illustrated in the sketch below).
- The Markdown output now contains a failures section that lists all benchmarks that failed for a given run. This will be helpful for developers during PR testing.
- Benchmarks can now have a description that is displayed on the page.
- Many more minor improvements.
1 parent 4f08dd6 commit 497dcce

19 files changed: +1167 −648 lines changed
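To make the data-decoupling idea concrete, here is a minimal sketch of emitting chart data as a standalone JSON file that an HTML page could fetch locally or from a remote URL. The names (BenchmarkData, dump_chart_data, data.json) are hypothetical and are not taken from this patch; the actual HTML-generation code is not shown in the diffs below.

# Illustrative sketch only: keeps chart data outside the HTML so the page can
# fetch it (e.g. via fetch()) and pass the series to Chart.js, instead of
# embedding the numbers in the page itself.
import json
from dataclasses import asdict, dataclass

@dataclass
class BenchmarkData:
    name: str
    description: str
    unit: str
    values: list[float]

def dump_chart_data(results: list[BenchmarkData], path: str = "data.json") -> None:
    # Serialize all benchmark series into one JSON document next to the HTML.
    with open(path, "w") as f:
        json.dump([asdict(r) for r in results], f, indent=2)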

devops/scripts/benchmarks/benches/base.py

Lines changed: 13 additions & 4 deletions
@@ -6,7 +6,7 @@
 import os
 import shutil
 from pathlib import Path
-from .result import Result
+from utils.result import Result
 from options import options
 from utils.utils import download, run
 import urllib.request
@@ -55,16 +55,25 @@ def create_data_path(self, name, skip_data_dir=False):
             data_path = os.path.join(self.directory, name)
         else:
             data_path = os.path.join(self.directory, "data", name)
-            if options.rebuild and Path(data_path).exists():
+            if options.redownload and Path(data_path).exists():
                 shutil.rmtree(data_path)

         Path(data_path).mkdir(parents=True, exist_ok=True)

         return data_path

-    def download(self, name, url, file, untar=False, unzip=False, skip_data_dir=False):
+    def download(
+        self,
+        name,
+        url,
+        file,
+        untar=False,
+        unzip=False,
+        skip_data_dir=False,
+        checksum="",
+    ):
         self.data_path = self.create_data_path(name, skip_data_dir)
-        return download(self.data_path, url, file, untar, unzip)
+        return download(self.data_path, url, file, untar, unzip, checksum)

     def name(self):
         raise NotImplementedError()
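The new checksum parameter is forwarded to the download utility, whose implementation is not part of this diff. As a rough illustration of how such a check could work, the sketch below hashes the downloaded file with hashlib; SHA-384 is an assumption here (the llama.cpp checksum further down is 96 hex digits, which is consistent with a SHA-384 digest), not a confirmed detail of utils.download().

# Hypothetical helper, assuming the checksum is a SHA-384 hex digest.
import hashlib

def calculate_checksum(file_path: str) -> str:
    # Hash the file in chunks so large model downloads need not fit in memory.
    h = hashlib.sha384()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()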

devops/scripts/benchmarks/benches/compute.py

Lines changed: 92 additions & 38 deletions
@@ -8,10 +8,11 @@
 import io
 from utils.utils import run, git_clone, create_build_path
 from .base import Benchmark, Suite
-from .result import Result
+from utils.result import Result
 from options import options
 from enum import Enum

+
 class ComputeBench(Suite):
     def __init__(self, directory):
         self.directory = directory
@@ -47,9 +48,8 @@ def setup(self):
             f"-Dunified-runtime_DIR={options.ur}/lib/cmake/unified-runtime",
         ]

-        print(f"{self.__class__.__name__}: Run {configure_command}")
         run(configure_command, add_sycl=True)
-        print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j")
+
         run(f"cmake --build {build_path} -j", add_sycl=True)

         self.built = True
@@ -73,16 +73,6 @@ def benchmarks(self) -> list[Benchmark]:
             ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
             ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
             VectorSum(self),
-            MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
-            MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1),
-            MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1),
-            MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1),
-            MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
-            MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1),
-            MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1),
-            MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1),
-            MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0),
-            MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
             GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 5),
             GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 5),
             GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 100),
@@ -98,6 +88,16 @@ def benchmarks(self) -> list[Benchmark]:
             SubmitKernelUR(self, 0, 0),
             SubmitKernelUR(self, 1, 0),
             SubmitKernelUR(self, 1, 1),
+            MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
+            MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1),
+            MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1),
+            MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1),
+            MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
+            MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1),
+            MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1),
+            MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1),
+            MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0),
+            MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
             GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 5),
             GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 5),
             GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 100),
@@ -136,6 +136,9 @@ def setup(self):
     def explicit_group(self):
         return ""

+    def description(self) -> str:
+        return ""
+
     def run(self, env_vars) -> list[Result]:
         command = [
             f"{self.benchmark_bin}",
@@ -167,6 +170,7 @@ def run(self, env_vars) -> list[Result]:
                     env=env_vars,
                     stdout=result,
                     unit=parse_unit_type(unit),
+                    description=self.description()
                 )
             )
         return ret
@@ -221,6 +225,13 @@ def bin_args(self) -> list[str]:
             "--KernelExecTime=1",
         ]

+    def description(self) -> str:
+        order = "in-order" if self.ioq else "out-of-order"
+        return (
+            f"Measures CPU time overhead of submitting {order} kernels through SYCL API."
+            "Uses 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time."
+        )
+

 class SubmitKernelUR(ComputeBenchmark):
     def __init__(self, bench, ioq, measureCompletion):
@@ -237,6 +248,15 @@ def name(self):
     def explicit_group(self):
         return "SubmitKernel"

+    def description(self) -> str:
+        order = "in-order" if self.ioq else "out-of-order"
+        completion = "including" if self.measureCompletion else "excluding"
+        return (
+            f"Measures CPU time overhead of submitting {order} kernels through Unified Runtime API, "
+            f"{completion} kernel completion time. Uses 10 simple kernels with minimal execution time "
+            f"to isolate API overhead."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             f"--Ioq={self.ioq}",
@@ -261,6 +281,14 @@ def name(self):
     def explicit_group(self):
         return "SubmitKernel"

+    def description(self) -> str:
+        order = "in-order" if self.ioq else "out-of-order"
+        return (
+            f"Measures CPU time overhead of submitting {order} kernels through Level Zero API. "
+            f"Uses immediate command lists with 10 minimal kernels to isolate submission overhead "
+            f"from execution time."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             f"--Ioq={self.ioq}",
@@ -286,6 +314,14 @@ def name(self):
         order = "in order" if self.ioq else "out of order"
         return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"

+    def description(self) -> str:
+        order = "in-order" if self.ioq else "out-of-order"
+        operation = "copy-only" if self.isCopyOnly else "copy and command submission"
+        return (
+            f"Measures SYCL {order} queue overhead for {operation} from {self.source} to "
+            f"{self.destination} memory with {self.size} bytes. Tests immediate execution overheads."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             "--iterations=100000",
@@ -309,6 +345,13 @@ def __init__(self, bench, isCopyOnly, source, destination, size):
     def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"

+    def description(self) -> str:
+        operation = "copy-only" if self.isCopyOnly else "copy and command submission"
+        return (
+            f"Measures SYCL in-order queue memory copy performance for {operation} from "
+            f"{self.source} to {self.destination} with {self.size} bytes, executed 100 times per iteration."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             "--iterations=10000",
@@ -330,6 +373,12 @@ def __init__(self, bench, source, destination, size):
     def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"

+    def description(self) -> str:
+        return (
+            f"Measures general SYCL queue memory copy performance from {self.source} to "
+            f"{self.destination} with {self.size} bytes per operation."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             "--iterations=10000",
@@ -349,6 +398,12 @@ def __init__(self, bench, type, size, placement):
     def name(self):
         return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}"

+    def description(self) -> str:
+        return (
+            f"Measures {self.placement} memory bandwidth using {self.type} pattern with "
+            f"{self.size} bytes. Higher values (GB/s) indicate better performance."
+        )
+
     # measurement is in GB/s
     def lower_is_better(self):
         return False
@@ -362,6 +417,7 @@ def bin_args(self) -> list[str]:
             "--useEvents=0",
             "--contents=Zeros",
             "--multiplier=1",
+            "--vectorSize=1",
         ]


@@ -372,6 +428,12 @@ def __init__(self, bench):
     def name(self):
         return f"miscellaneous_benchmark_sycl VectorSum"

+    def description(self) -> str:
+        return (
+            "Measures performance of vector addition across 3D grid (512x256x256 elements) "
+            "using SYCL."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             "--iterations=1000",
@@ -408,6 +470,16 @@ def name(self):
             + (" without events" if not self.useEvents else "")
         )

+    def description(self) -> str:
+        src_type = "device" if self.srcUSM == 1 else "host"
+        dst_type = "device" if self.dstUSM == 1 else "host"
+        events = "with" if self.useEvents else "without"
+        return (
+            f"Measures multithreaded memory copy performance with {self.numThreads} threads "
+            f"each performing {self.numOpsPerThread} operations on {self.allocSize} bytes "
+            f"from {src_type} to {dst_type} memory {events} events."
+        )
+
     def bin_args(self) -> list[str]:
         return [
             "--Ioq=1",
@@ -441,6 +513,13 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
     def explicit_group(self):
         return f"SinKernelGraph {self.numKernels}"

+    def description(self) -> str:
+        execution = "using graphs" if self.withGraphs else "without graphs"
+        return (
+            f"Measures {self.runtime.value.upper()} performance when executing {self.numKernels} "
+            f"sin kernels {execution}. Tests overhead and benefits of graph-based execution."
+        )
+
     def name(self):
         return f"graph_api_benchmark_{self.runtime.value} SinKernelGraph graphs:{self.withGraphs}, numKernels:{self.numKernels}"

@@ -452,28 +531,3 @@ def bin_args(self) -> list[str]:
             "--withCopyOffload=1",
             "--immediateAppendCmdList=0",
         ]
-
-
-class GraphApiSubmitExecGraph(ComputeBenchmark):
-    def __init__(self, bench, ioq, submit, numKernels):
-        self.ioq = ioq
-        self.submit = submit
-        self.numKernels = numKernels
-        super().__init__(bench, "graph_api_benchmark_sycl", "SubmitExecGraph")
-
-    def name(self):
-        return f"graph_api_benchmark_sycl SubmitExecGraph ioq:{self.ioq}, submit:{self.submit}, numKernels:{self.numKernels}"
-
-    def explicit_group(self):
-        if self.submit:
-            return "SubmitGraph"
-        else:
-            return "ExecGraph"
-
-    def bin_args(self) -> list[str]:
-        return [
-            "--iterations=100",
-            f"--measureSubmit={self.submit}",
-            f"--ioq={self.ioq}",
-            f"--numKernels={self.numKernels}",
-        ]
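As the diff above shows, the base class now defines a default description() and forwards it into each Result, so a concrete benchmark only needs to override name(), bin_args(), and optionally description() and explicit_group(). The sketch below illustrates that pattern with a made-up benchmark; FooBench, foo_benchmark_sycl, and --size are hypothetical and not part of compute-benchmarks.

# Hypothetical subclass following the ComputeBenchmark pattern from this patch.
class FooBench(ComputeBenchmark):
    def __init__(self, bench, size):
        self.size = size
        super().__init__(bench, "foo_benchmark_sycl", "Foo")

    def name(self):
        return f"foo_benchmark_sycl Foo, size {self.size}"

    def description(self) -> str:
        # Shown on the HTML page and attached to every Result by run().
        return f"Measures Foo throughput for {self.size} byte buffers."

    def bin_args(self) -> list[str]:
        return [f"--size={self.size}", "--iterations=1000"]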

devops/scripts/benchmarks/benches/llamacpp.py

Lines changed: 14 additions & 4 deletions
@@ -8,10 +8,10 @@
 from pathlib import Path
 from utils.utils import download, git_clone
 from .base import Benchmark, Suite
-from .result import Result
+from utils.result import Result
 from utils.utils import run, create_build_path
 from options import options
-from .oneapi import get_oneapi
+from utils.oneapi import get_oneapi
 import os


@@ -43,6 +43,7 @@ def setup(self):
             self.models_dir,
             "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf",
             "Phi-3-mini-4k-instruct-q4.gguf",
+            checksum="fc4f45c9729874a33a527465b2ec78189a18e5726b7121182623feeae38632ace4f280617b01d4a04875acf49d263ee4",
         )

         self.oneapi = get_oneapi()
@@ -62,9 +63,9 @@ def setup(self):
             f'-DCMAKE_CXX_FLAGS=-I"{self.oneapi.mkl_include()}"',
             f"-DCMAKE_SHARED_LINKER_FLAGS=-L{self.oneapi.compiler_lib()} -L{self.oneapi.mkl_lib()}",
         ]
-        print(f"{self.__class__.__name__}: Run {configure_command}")
+
         run(configure_command, add_sycl=True)
-        print(f"{self.__class__.__name__}: Run cmake --build {self.build_path} -j")
+
         run(
             f"cmake --build {self.build_path} -j",
             add_sycl=True,
@@ -92,6 +93,14 @@ def setup(self):
     def name(self):
         return f"llama.cpp"

+    def description(self) -> str:
+        return (
+            "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. "
+            "Runs both prompt processing (initial context processing) and text generation benchmarks with "
+            "different batch sizes. Higher values indicate better performance. Uses the Phi-3-mini-4k-instruct "
+            "quantized model and leverages SYCL with oneDNN for acceleration."
+        )
+
     def lower_is_better(self):
         return False

@@ -130,6 +139,7 @@ def run(self, env_vars) -> list[Result]:
                     env=env_vars,
                     stdout=result,
                     unit="token/s",
+                    description=self.description()
                 )
             )
         return results
