Skip to content

Commit 6d0dca5

Browse files
igchor authored and Compute-Runtime-Automation committed
Port MemcpyExecute benchmark to SYCL
and implement option to submit a barrier. Signed-off-by: Igor Chorazewicz <[email protected]>
1 parent 4db520a commit 6d0dca5

File tree

5 files changed

+250
-21
lines changed

5 files changed

+250
-21
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
2-
# Copyright (C) 2022-2024 Intel Corporation
2+
# Copyright (C) 2022-2025 Intel Corporation
33
#
44
# SPDX-License-Identifier: MIT
55
#
66

7-
add_benchmark(multithread_benchmark ocl l0 ur all)
7+
add_benchmark(multithread_benchmark ocl l0 ur sycl syclpreview all)

source/benchmarks/multithread_benchmark/definitions/memcpy_execute.h

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2024 Intel Corporation
2+
* Copyright (C) 2024-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -20,6 +20,7 @@ struct MemcpyExecuteArguments : TestCaseArgumentContainer {
2020
BooleanArgument useQueuePerThread;
2121
BooleanArgument srcUSM;
2222
BooleanArgument dstUSM;
23+
BooleanArgument useBarrier;
2324

2425
MemcpyExecuteArguments()
2526
: inOrderQueue(*this, "Ioq", "Create the queue with the in_order property"),
@@ -30,7 +31,8 @@ struct MemcpyExecuteArguments : TestCaseArgumentContainer {
3031
useEvents(*this, "UseEvents", "Explicitly synchronize commands by events (needs to be set for Ioq=0)"),
3132
useQueuePerThread(*this, "UseQueuePerThread", "Use a separate queue in each thread"),
3233
srcUSM(*this, "SrcUSM", "Use USM for host source buffer"),
33-
dstUSM(*this, "DstUSM", "Use USM for host destination buffers") {}
34+
dstUSM(*this, "DstUSM", "Use USM for host destination buffers"),
35+
useBarrier(*this, "UseBarrier", "Submit barrier after each iteration (SYCL-only)") {}
3436
};
3537

3638
struct MemcpyExecute : TestCase<MemcpyExecuteArguments> {
@@ -44,3 +46,19 @@ struct MemcpyExecute : TestCase<MemcpyExecuteArguments> {
4446
return "measures time spent exeucting kernels interleved with memcpy operations";
4547
}
4648
};
49+
50+
// verify the results
51+
static inline TestResult verifyResults(size_t numThreads, size_t numOpsPerThread, size_t allocSize, std::vector<void *> &dst_buffers, int value) {
52+
for (size_t t = 0; t < numThreads; t++) {
53+
for (size_t i = 0; i < numOpsPerThread; i++) {
54+
for (size_t j = 0; j < allocSize / sizeof(int); j++) {
55+
auto v = *(((char *)dst_buffers[t]) + i * allocSize + j * sizeof(int));
56+
if (v != value) {
57+
std::cerr << "dst_buffers at: " << t << " " << i << " " << j << " , is: " << (int)v << std::endl;
58+
return TestResult::Error;
59+
}
60+
}
61+
}
62+
}
63+
return TestResult::Success;
64+
}

source/benchmarks/multithread_benchmark/gtest/memcpy_execute.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2024 Intel Corporation
2+
* Copyright (C) 2024-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -14,7 +14,7 @@
1414

1515
[[maybe_unused]] static const inline RegisterTestCase<MemcpyExecute> registerTestCase{};
1616

17-
class MemcpyExecuteTest : public ::testing::TestWithParam<std::tuple<Api, bool, size_t, size_t, size_t, bool, bool, bool, bool, bool>> {
17+
class MemcpyExecuteTest : public ::testing::TestWithParam<std::tuple<Api, bool, size_t, size_t, size_t, bool, bool, bool, bool, bool, bool>> {
1818
};
1919

2020
TEST_P(MemcpyExecuteTest, Test) {
@@ -29,6 +29,7 @@ TEST_P(MemcpyExecuteTest, Test) {
2929
args.useQueuePerThread = std::get<7>(GetParam());
3030
args.srcUSM = std::get<8>(GetParam());
3131
args.dstUSM = std::get<9>(GetParam());
32+
args.useBarrier = std::get<10>(GetParam());
3233
MemcpyExecute test;
3334
test.run(args);
3435
}
@@ -46,5 +47,6 @@ INSTANTIATE_TEST_SUITE_P(
4647
::testing::Values(false, true), // useEvents
4748
::testing::Values(true), // useQueuePerThread
4849
::testing::Values(true), // srcUSM
49-
::testing::Values(true) // dstUSM
50+
::testing::Values(true), // dstUSM
51+
::testing::Values(false) // useBarrier
5052
));
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
/*
2+
* Copyright (C) 2024-2025 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "framework/sycl/sycl.h"
9+
#include "framework/test_case/register_test_case.h"
10+
#include "framework/utility/timer.h"
11+
12+
#include "definitions/memcpy_execute.h"
13+
14+
#include <mutex>
15+
#include <shared_mutex>
16+
#include <thread>
17+
18+
static auto inOrder = sycl::property::queue::in_order();
19+
static const sycl::property_list queueProps[] = {
20+
sycl::property_list{},
21+
sycl::property_list{inOrder}};
22+
23+
static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &statistics) {
24+
MeasurementFields typeSelector(MeasurementUnit::Microseconds, MeasurementType::Cpu);
25+
26+
if (isNoopRun()) {
27+
statistics.pushUnitAndType(typeSelector.getUnit(), typeSelector.getType());
28+
return TestResult::Nooped;
29+
}
30+
31+
bool inOrderQueue = arguments.inOrderQueue;
32+
bool measureCompletionTime = arguments.measureCompletionTime;
33+
size_t numOpsPerThread = arguments.numOpsPerThread;
34+
size_t numThreads = arguments.numThreads;
35+
size_t allocSize = arguments.allocSize;
36+
bool useEvents = arguments.useEvents;
37+
bool useQueuePerThread = arguments.useQueuePerThread;
38+
bool srcUSM = arguments.srcUSM;
39+
bool dstUSM = arguments.dstUSM;
40+
bool useBarrier = arguments.useBarrier;
41+
size_t arraySize = allocSize / sizeof(int);
42+
43+
if (!inOrderQueue) {
44+
std::cerr << "Out of order mode not supported yet" << std::endl;
45+
return TestResult::Error;
46+
}
47+
48+
// Setup
49+
Timer timer;
50+
51+
const size_t gws = arraySize;
52+
const size_t lws = 1u;
53+
sycl::nd_range<1> range(gws, lws);
54+
55+
auto queuePropsIndex = 0;
56+
queuePropsIndex |= arguments.inOrderQueue ? 0x1 : 0;
57+
58+
std::vector<std::vector<void *>> usm(numThreads);
59+
std::vector<sycl::queue> queues;
60+
61+
// Setup queues (or a single queue if !useQueuePerThread)
62+
if (!useQueuePerThread) {
63+
sycl::queue singleQueue{queueProps[queuePropsIndex]};
64+
for (size_t i = 0; i < numThreads; i++) {
65+
queues.push_back(singleQueue);
66+
}
67+
} else {
68+
for (size_t i = 0; i < numThreads; i++) {
69+
queues.emplace_back(queueProps[queuePropsIndex]);
70+
}
71+
}
72+
73+
void *src_buffer;
74+
std::vector<void *> dst_buffers;
75+
76+
if (srcUSM) {
77+
src_buffer = sycl::malloc_host(allocSize, queues[0].get_context());
78+
} else {
79+
src_buffer = malloc(allocSize);
80+
}
81+
82+
if (src_buffer == nullptr) {
83+
std::cerr << "Failed to allocate memory for src_buffer" << std::endl;
84+
return TestResult::Error;
85+
}
86+
87+
memset(src_buffer, 99, allocSize);
88+
89+
// Setup USM allocations
90+
for (size_t i = 0; i < numThreads; i++) {
91+
for (size_t j = 0; j < numOpsPerThread; j++) {
92+
usm[i].push_back(sycl::malloc_device(allocSize, queues[i].get_device(), queues[i].get_context()));
93+
}
94+
95+
dst_buffers.emplace_back();
96+
97+
if (dstUSM) {
98+
dst_buffers.back() = sycl::malloc_host(allocSize * numOpsPerThread, queues[0].get_context());
99+
} else {
100+
dst_buffers.back() = malloc(allocSize * numOpsPerThread);
101+
}
102+
if (dst_buffers.back() == nullptr) {
103+
std::cerr << "Failed to allocate memory for dst_buffer" << std::endl;
104+
return TestResult::Error;
105+
}
106+
memset(dst_buffers.back(), 0, allocSize * numOpsPerThread);
107+
}
108+
109+
auto worker = [&](size_t thread_id, Timer &timer) {
110+
timer.measureStart();
111+
112+
auto &queue = queues[thread_id];
113+
for (size_t i = 0; i < numOpsPerThread; i++) {
114+
int *usm_ptr = (int *)usm[thread_id][i];
115+
auto host_dst = ((char *)dst_buffers[thread_id]) + i * allocSize;
116+
117+
if (useEvents) {
118+
queue.memcpy(usm_ptr, src_buffer, allocSize);
119+
queue.parallel_for(sycl::range<1>{arraySize}, [usm_ptr](sycl::item<1> itemId) {
120+
auto id = itemId.get_id(0);
121+
usm_ptr[id] = 1;
122+
});
123+
queue.memcpy(host_dst, usm_ptr, allocSize);
124+
125+
if (useBarrier) {
126+
queue.ext_oneapi_submit_barrier();
127+
}
128+
} else {
129+
sycl::ext::oneapi::experimental::memcpy(queue, usm_ptr, src_buffer, allocSize);
130+
sycl::ext::oneapi::experimental::nd_launch(queue, range, [usm_ptr](sycl::nd_item<1> itemId) {
131+
auto id = itemId.get_global_id(0);
132+
usm_ptr[id] = 1;
133+
});
134+
sycl::ext::oneapi::experimental::memcpy(queue, host_dst, usm_ptr, allocSize);
135+
136+
if (useBarrier) {
137+
queue.ext_oneapi_submit_barrier();
138+
}
139+
}
140+
}
141+
142+
if (!measureCompletionTime)
143+
timer.measureEnd();
144+
145+
queue.wait();
146+
147+
if (measureCompletionTime)
148+
timer.measureEnd();
149+
};
150+
151+
// warmup
152+
for (auto iteration = 0u; iteration < arguments.numThreads; iteration++) {
153+
std::vector<std::thread> threads;
154+
for (size_t j = 0u; j < arguments.numThreads; j++) {
155+
threads.emplace_back([&, j] {
156+
Timer dummyTimer;
157+
worker(j, dummyTimer);
158+
});
159+
}
160+
for (auto &thread : threads) {
161+
thread.join();
162+
}
163+
}
164+
165+
// Benchmark
166+
for (size_t i = 0u; i < arguments.iterations; i++) {
167+
std::shared_mutex barrier;
168+
std::vector<std::thread> threads;
169+
std::vector<Timer> timers(arguments.numThreads);
170+
171+
std::unique_lock<std::shared_mutex> lock(barrier);
172+
for (size_t j = 0u; j < arguments.numThreads; j++) {
173+
threads.emplace_back([&, j] {
174+
std::shared_lock<std::shared_mutex> lock(barrier);
175+
worker(j, timers[j]);
176+
});
177+
}
178+
lock.unlock();
179+
180+
auto aggregatedTime = std::chrono::high_resolution_clock::duration(0);
181+
for (size_t j = 0u; j < arguments.numThreads; j++) {
182+
threads[j].join();
183+
aggregatedTime += timers[j].get();
184+
}
185+
auto avgTime = aggregatedTime / arguments.numThreads;
186+
187+
#ifndef NDEBUG
188+
auto res = verifyResults(numThreads, numOpsPerThread, allocSize, dst_buffers, 1);
189+
if (res != TestResult::Success)
190+
return res;
191+
#endif
192+
193+
statistics.pushValue(avgTime, typeSelector.getUnit(), typeSelector.getType());
194+
}
195+
196+
if (srcUSM) {
197+
sycl::free(src_buffer, queues[0].get_context());
198+
} else {
199+
free(src_buffer);
200+
}
201+
202+
return TestResult::Success;
203+
}
204+
205+
[[maybe_unused]] static RegisterTestCaseImplementation<MemcpyExecute> registerTestCase(run, Api::SYCL);

source/benchmarks/multithread_benchmark/implementations/ur/memcpy_execute_interleaved.cpp

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2024 Intel Corporation
2+
* Copyright (C) 2024-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -39,6 +39,7 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
3939
bool srcUSM = arguments.srcUSM;
4040
bool dstUSM = arguments.dstUSM;
4141
size_t arraySize = allocSize / sizeof(int);
42+
bool useBarrier = arguments.useBarrier;
4243

4344
if (!useEvents && !inOrderQueue) {
4445
std::cerr << "In order queue must be used when events are not used" << std::endl;
@@ -98,6 +99,10 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
9899
} else {
99100
src_buffer = malloc(allocSize);
100101
}
102+
if (src_buffer == nullptr) {
103+
std::cerr << "Failed to allocate memory for src_buffer" << std::endl;
104+
return TestResult::Error;
105+
}
101106

102107
memset(src_buffer, 99, allocSize);
103108

@@ -121,13 +126,17 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
121126
} else {
122127
dst_buffers.back() = malloc(allocSize * numOpsPerThread);
123128
}
129+
if (dst_buffers.back() == nullptr) {
130+
std::cerr << "Failed to allocate memory for dst_buffer" << std::endl;
131+
return TestResult::Error;
132+
}
124133
memset(dst_buffers.back(), 0, allocSize * numOpsPerThread);
125134
}
126135

127136
auto worker = [&](size_t thread_id, Timer &timer) {
128137
std::vector<std::vector<ur_event_handle_t>> events(numOpsPerThread);
129138
for (auto &events_vec : events) {
130-
events_vec.assign(3, nullptr);
139+
events_vec.assign(4, nullptr);
131140
}
132141

133142
timer.measureStart();
@@ -145,6 +154,10 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
145154
EXPECT_UR_RESULT_SUCCESS(urEnqueueUSMMemcpy(queue, false, usm_ptr, src_buffer, allocSize, 0, nullptr, memcpySignalEventPtr));
146155
EXPECT_UR_RESULT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &arraySize, nullptr, useEvents, memcpySignalEventPtr, kernelSignalEventPtr));
147156
EXPECT_UR_RESULT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_dst, usm_ptr, allocSize, useEvents, kernelSignalEventPtr, finalSignalEventPtr));
157+
158+
if (useBarrier) {
159+
EXPECT_UR_RESULT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, useEvents, finalSignalEventPtr, useEvents ? &events[i][3] : nullptr));
160+
}
148161
}
149162

150163
if (!measureCompletionTime)
@@ -206,18 +219,9 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
206219
auto avgTime = aggregatedTime / arguments.numThreads;
207220

208221
#ifndef NDEBUG
209-
// verify the results
210-
for (size_t t = 0; t < numThreads; t++) {
211-
for (size_t i = 0; i < numOpsPerThread; i++) {
212-
for (size_t j = 0; j < allocSize / sizeof(int); j++) {
213-
auto v = *(((char *)dst_buffers[t]) + i * allocSize + j * sizeof(int));
214-
if (v != 1) {
215-
std::cerr << "dst_buffers at: " << t << " " << i << " " << j << " , is: " << (int)v << std::endl;
216-
return TestResult::Error;
217-
}
218-
}
219-
}
220-
}
222+
auto res = verifyResults(numThreads, numOpsPerThread, allocSize, dst_buffers, 1);
223+
if (res != TestResult::Success)
224+
return res;
221225
#endif
222226

223227
statistics.pushValue(avgTime, typeSelector.getUnit(), typeSelector.getType());

0 commit comments

Comments
 (0)