Skip to content

Commit 1936207

Browse files
committed
Merge branch 'unify-benchmark-ci' of https://github.com/intel/llvm into unify-benchmark-ci
2 parents 3cbed5e + 497dcce commit 1936207

19 files changed

+1167
-648
lines changed

devops/scripts/benchmarks/benches/base.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import os
77
import shutil
88
from pathlib import Path
9-
from .result import Result
9+
from utils.result import Result
1010
from options import options
1111
from utils.utils import download, run
1212
import urllib.request
@@ -55,16 +55,25 @@ def create_data_path(self, name, skip_data_dir=False):
5555
data_path = os.path.join(self.directory, name)
5656
else:
5757
data_path = os.path.join(self.directory, "data", name)
58-
if options.rebuild and Path(data_path).exists():
58+
if options.redownload and Path(data_path).exists():
5959
shutil.rmtree(data_path)
6060

6161
Path(data_path).mkdir(parents=True, exist_ok=True)
6262

6363
return data_path
6464

65-
def download(self, name, url, file, untar=False, unzip=False, skip_data_dir=False):
65+
def download(
66+
self,
67+
name,
68+
url,
69+
file,
70+
untar=False,
71+
unzip=False,
72+
skip_data_dir=False,
73+
checksum="",
74+
):
6675
self.data_path = self.create_data_path(name, skip_data_dir)
67-
return download(self.data_path, url, file, untar, unzip)
76+
return download(self.data_path, url, file, untar, unzip, checksum)
6877

6978
def name(self):
7079
raise NotImplementedError()

devops/scripts/benchmarks/benches/compute.py

Lines changed: 92 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
import io
99
from utils.utils import run, git_clone, create_build_path
1010
from .base import Benchmark, Suite
11-
from .result import Result
11+
from utils.result import Result
1212
from options import options
1313
from enum import Enum
1414

15+
1516
class ComputeBench(Suite):
1617
def __init__(self, directory):
1718
self.directory = directory
@@ -47,9 +48,8 @@ def setup(self):
4748
f"-Dunified-runtime_DIR={options.ur}/lib/cmake/unified-runtime",
4849
]
4950

50-
print(f"{self.__class__.__name__}: Run {configure_command}")
5151
run(configure_command, add_sycl=True)
52-
print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j")
52+
5353
run(f"cmake --build {build_path} -j", add_sycl=True)
5454

5555
self.built = True
@@ -73,16 +73,6 @@ def benchmarks(self) -> list[Benchmark]:
7373
ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
7474
ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
7575
VectorSum(self),
76-
MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
77-
MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1),
78-
MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1),
79-
MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1),
80-
MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
81-
MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1),
82-
MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1),
83-
MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1),
84-
MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0),
85-
MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
8676
GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 5),
8777
GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 5),
8878
GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 100),
@@ -98,6 +88,16 @@ def benchmarks(self) -> list[Benchmark]:
9888
SubmitKernelUR(self, 0, 0),
9989
SubmitKernelUR(self, 1, 0),
10090
SubmitKernelUR(self, 1, 1),
91+
MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
92+
MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1),
93+
MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1),
94+
MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1),
95+
MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
96+
MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1),
97+
MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1),
98+
MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1),
99+
MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0),
100+
MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
101101
GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 5),
102102
GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 5),
103103
GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 100),
@@ -136,6 +136,9 @@ def setup(self):
136136
def explicit_group(self):
137137
return ""
138138

139+
def description(self) -> str:
    """Return an empty description string."""
    return ""
141+
139142
def run(self, env_vars) -> list[Result]:
140143
command = [
141144
f"{self.benchmark_bin}",
@@ -167,6 +170,7 @@ def run(self, env_vars) -> list[Result]:
167170
env=env_vars,
168171
stdout=result,
169172
unit=parse_unit_type(unit),
173+
description=self.description()
170174
)
171175
)
172176
return ret
@@ -221,6 +225,13 @@ def bin_args(self) -> list[str]:
221225
"--KernelExecTime=1",
222226
]
223227

228+
def description(self) -> str:
    """Return a human-readable summary of this SYCL kernel-submission benchmark.

    The text says "in-order" when ``self.ioq`` is truthy and "out-of-order"
    otherwise.
    """
    order = "in-order" if self.ioq else "out-of-order"
    # Adjacent string literals are concatenated verbatim, so the first
    # fragment must end with a space to keep the two sentences apart.
    return (
        f"Measures CPU time overhead of submitting {order} kernels through SYCL API. "
        "Uses 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time."
    )
234+
224235

225236
class SubmitKernelUR(ComputeBenchmark):
226237
def __init__(self, bench, ioq, measureCompletion):
@@ -237,6 +248,15 @@ def name(self):
237248
def explicit_group(self):
238249
return "SubmitKernel"
239250

251+
def description(self) -> str:
    """Return a human-readable summary of this Unified Runtime submission benchmark.

    ``self.ioq`` selects "in-order" vs "out-of-order" wording and
    ``self.measureCompletion`` selects "including" vs "excluding" kernel
    completion time; both are tested for truthiness only.
    """
    order = "in-order" if self.ioq else "out-of-order"
    completion = "including" if self.measureCompletion else "excluding"
    return (
        f"Measures CPU time overhead of submitting {order} kernels through Unified Runtime API, "
        f"{completion} kernel completion time. Uses 10 simple kernels with minimal execution time "
        "to isolate API overhead."
    )
259+
240260
def bin_args(self) -> list[str]:
241261
return [
242262
f"--Ioq={self.ioq}",
@@ -261,6 +281,14 @@ def name(self):
261281
def explicit_group(self):
262282
return "SubmitKernel"
263283

284+
def description(self) -> str:
    """Return a human-readable summary of this Level Zero submission benchmark.

    The text says "in-order" when ``self.ioq`` is truthy and "out-of-order"
    otherwise.
    """
    order = "in-order" if self.ioq else "out-of-order"
    return (
        f"Measures CPU time overhead of submitting {order} kernels through Level Zero API. "
        "Uses immediate command lists with 10 minimal kernels to isolate submission overhead "
        "from execution time."
    )
291+
264292
def bin_args(self) -> list[str]:
265293
return [
266294
f"--Ioq={self.ioq}",
@@ -286,6 +314,14 @@ def name(self):
286314
order = "in order" if self.ioq else "out of order"
287315
return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
288316

317+
def description(self) -> str:
    """Return a human-readable summary of this immediate copy-queue benchmark.

    ``self.ioq`` selects the queue-ordering wording and ``self.isCopyOnly``
    selects the operation wording (both by truthiness); ``self.source``,
    ``self.destination`` and ``self.size`` are interpolated as-is.
    """
    order = "in-order" if self.ioq else "out-of-order"
    operation = "copy-only" if self.isCopyOnly else "copy and command submission"
    return (
        f"Measures SYCL {order} queue overhead for {operation} from {self.source} to "
        f"{self.destination} memory with {self.size} bytes. Tests immediate execution overheads."
    )
324+
289325
def bin_args(self) -> list[str]:
290326
return [
291327
"--iterations=100000",
@@ -309,6 +345,13 @@ def __init__(self, bench, isCopyOnly, source, destination, size):
309345
def name(self):
310346
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
311347

348+
def description(self) -> str:
    """Return a human-readable summary of this in-order queue memcpy benchmark.

    ``self.isCopyOnly`` (by truthiness) selects the operation wording;
    ``self.source``, ``self.destination`` and ``self.size`` are interpolated
    as-is.
    """
    operation = "copy-only" if self.isCopyOnly else "copy and command submission"
    return (
        f"Measures SYCL in-order queue memory copy performance for {operation} from "
        f"{self.source} to {self.destination} with {self.size} bytes, executed 100 times per iteration."
    )
354+
312355
def bin_args(self) -> list[str]:
313356
return [
314357
"--iterations=10000",
@@ -330,6 +373,12 @@ def __init__(self, bench, source, destination, size):
330373
def name(self):
331374
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
332375

376+
def description(self) -> str:
    """Return a human-readable summary of this queue memcpy benchmark.

    ``self.source``, ``self.destination`` and ``self.size`` are interpolated
    into the text as-is.
    """
    return (
        f"Measures general SYCL queue memory copy performance from {self.source} to "
        f"{self.destination} with {self.size} bytes per operation."
    )
381+
333382
def bin_args(self) -> list[str]:
334383
return [
335384
"--iterations=10000",
@@ -349,6 +398,12 @@ def __init__(self, bench, type, size, placement):
349398
def name(self):
350399
return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}"
351400

401+
def description(self) -> str:
    """Return a human-readable summary of this memory-bandwidth benchmark.

    ``self.placement``, ``self.type`` and ``self.size`` are interpolated into
    the text as-is.
    """
    return (
        f"Measures {self.placement} memory bandwidth using {self.type} pattern with "
        f"{self.size} bytes. Higher values (GB/s) indicate better performance."
    )
406+
352407
# measurement is in GB/s
353408
def lower_is_better(self):
354409
return False
@@ -362,6 +417,7 @@ def bin_args(self) -> list[str]:
362417
"--useEvents=0",
363418
"--contents=Zeros",
364419
"--multiplier=1",
420+
"--vectorSize=1",
365421
]
366422

367423

@@ -372,6 +428,12 @@ def __init__(self, bench):
372428
def name(self):
    """Return the fixed display name of this benchmark."""
    # Plain literal: the original f-string had no placeholders to interpolate.
    return "miscellaneous_benchmark_sycl VectorSum"
374430

431+
def description(self) -> str:
    """Return the fixed human-readable summary of the vector-sum benchmark."""
    return (
        "Measures performance of vector addition across 3D grid (512x256x256 elements) "
        "using SYCL."
    )
436+
375437
def bin_args(self) -> list[str]:
376438
return [
377439
"--iterations=1000",
@@ -408,6 +470,16 @@ def name(self):
408470
+ (" without events" if not self.useEvents else "")
409471
)
410472

473+
def description(self) -> str:
    """Return a human-readable summary of this multithreaded memcpy benchmark.

    ``self.srcUSM`` / ``self.dstUSM`` are reported as "device" only when they
    equal 1 and as "host" for any other value; ``self.useEvents`` is tested
    for truthiness. Thread count, operations per thread and allocation size
    are interpolated as-is.
    """
    src_type = "device" if self.srcUSM == 1 else "host"
    dst_type = "device" if self.dstUSM == 1 else "host"
    events = "with" if self.useEvents else "without"
    return (
        f"Measures multithreaded memory copy performance with {self.numThreads} threads "
        f"each performing {self.numOpsPerThread} operations on {self.allocSize} bytes "
        f"from {src_type} to {dst_type} memory {events} events."
    )
482+
411483
def bin_args(self) -> list[str]:
412484
return [
413485
"--Ioq=1",
@@ -441,6 +513,13 @@ def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
441513
def explicit_group(self):
442514
return f"SinKernelGraph {self.numKernels}"
443515

516+
def description(self) -> str:
    """Return a human-readable summary of this sin-kernel graph benchmark.

    The runtime is shown as the upper-cased ``self.runtime.value``;
    ``self.withGraphs`` (by truthiness) selects "using graphs" vs
    "without graphs"; ``self.numKernels`` is interpolated as-is.
    """
    execution = "using graphs" if self.withGraphs else "without graphs"
    return (
        f"Measures {self.runtime.value.upper()} performance when executing {self.numKernels} "
        f"sin kernels {execution}. Tests overhead and benefits of graph-based execution."
    )
522+
444523
def name(self):
445524
return f"graph_api_benchmark_{self.runtime.value} SinKernelGraph graphs:{self.withGraphs}, numKernels:{self.numKernels}"
446525

@@ -452,28 +531,3 @@ def bin_args(self) -> list[str]:
452531
"--withCopyOffload=1",
453532
"--immediateAppendCmdList=0",
454533
]
455-
456-
457-
class GraphApiSubmitExecGraph(ComputeBenchmark):
458-
def __init__(self, bench, ioq, submit, numKernels):
459-
self.ioq = ioq
460-
self.submit = submit
461-
self.numKernels = numKernels
462-
super().__init__(bench, "graph_api_benchmark_sycl", "SubmitExecGraph")
463-
464-
def name(self):
465-
return f"graph_api_benchmark_sycl SubmitExecGraph ioq:{self.ioq}, submit:{self.submit}, numKernels:{self.numKernels}"
466-
467-
def explicit_group(self):
468-
if self.submit:
469-
return "SubmitGraph"
470-
else:
471-
return "ExecGraph"
472-
473-
def bin_args(self) -> list[str]:
474-
return [
475-
"--iterations=100",
476-
f"--measureSubmit={self.submit}",
477-
f"--ioq={self.ioq}",
478-
f"--numKernels={self.numKernels}",
479-
]

devops/scripts/benchmarks/benches/llamacpp.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
from pathlib import Path
99
from utils.utils import download, git_clone
1010
from .base import Benchmark, Suite
11-
from .result import Result
11+
from utils.result import Result
1212
from utils.utils import run, create_build_path
1313
from options import options
14-
from .oneapi import get_oneapi
14+
from utils.oneapi import get_oneapi
1515
import os
1616

1717

@@ -43,6 +43,7 @@ def setup(self):
4343
self.models_dir,
4444
"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf",
4545
"Phi-3-mini-4k-instruct-q4.gguf",
46+
checksum="fc4f45c9729874a33a527465b2ec78189a18e5726b7121182623feeae38632ace4f280617b01d4a04875acf49d263ee4",
4647
)
4748

4849
self.oneapi = get_oneapi()
@@ -62,9 +63,9 @@ def setup(self):
6263
f'-DCMAKE_CXX_FLAGS=-I"{self.oneapi.mkl_include()}"',
6364
f"-DCMAKE_SHARED_LINKER_FLAGS=-L{self.oneapi.compiler_lib()} -L{self.oneapi.mkl_lib()}",
6465
]
65-
print(f"{self.__class__.__name__}: Run {configure_command}")
66+
6667
run(configure_command, add_sycl=True)
67-
print(f"{self.__class__.__name__}: Run cmake --build {self.build_path} -j")
68+
6869
run(
6970
f"cmake --build {self.build_path} -j",
7071
add_sycl=True,
@@ -92,6 +93,14 @@ def setup(self):
9293
def name(self):
    """Return the fixed display name of this benchmark."""
    # Plain literal: the original f-string had no placeholders to interpolate.
    return "llama.cpp"
9495

96+
def description(self) -> str:
    """Return the fixed human-readable summary of the llama.cpp benchmark."""
    return (
        "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. "
        "Runs both prompt processing (initial context processing) and text generation benchmarks with "
        "different batch sizes. Higher values indicate better performance. Uses the Phi-3-mini-4k-instruct "
        "quantized model and leverages SYCL with oneDNN for acceleration."
    )
103+
95104
def lower_is_better(self):
96105
return False
97106

@@ -130,6 +139,7 @@ def run(self, env_vars) -> list[Result]:
130139
env=env_vars,
131140
stdout=result,
132141
unit="token/s",
142+
description=self.description()
133143
)
134144
)
135145
return results

0 commit comments

Comments
 (0)