Skip to content

Commit 6d0dca5

Browse files
igchor authored and Compute-Runtime-Automation committed
Port MemcpyExecute benchmark to SYCL
and implement option to submit a barrier. Signed-off-by: Igor Chorazewicz <[email protected]>
1 parent 4db520a commit 6d0dca5

File tree

5 files changed

+250
-21
lines changed

5 files changed

+250
-21
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
2-
# Copyright (C) 2022-2024 Intel Corporation
2+
# Copyright (C) 2022-2025 Intel Corporation
33
#
44
# SPDX-License-Identifier: MIT
55
#
66

7-
add_benchmark(multithread_benchmark ocl l0 ur all)
7+
add_benchmark(multithread_benchmark ocl l0 ur sycl syclpreview all)

source/benchmarks/multithread_benchmark/definitions/memcpy_execute.h

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2024 Intel Corporation
2+
* Copyright (C) 2024-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -20,6 +20,7 @@ struct MemcpyExecuteArguments : TestCaseArgumentContainer {
2020
BooleanArgument useQueuePerThread;
2121
BooleanArgument srcUSM;
2222
BooleanArgument dstUSM;
23+
BooleanArgument useBarrier;
2324

2425
MemcpyExecuteArguments()
2526
: inOrderQueue(*this, "Ioq", "Create the queue with the in_order property"),
@@ -30,7 +31,8 @@ struct MemcpyExecuteArguments : TestCaseArgumentContainer {
3031
useEvents(*this, "UseEvents", "Explicitly synchronize commands by events (needs to be set for Ioq=0)"),
3132
useQueuePerThread(*this, "UseQueuePerThread", "Use a separate queue in each thread"),
3233
srcUSM(*this, "SrcUSM", "Use USM for host source buffer"),
33-
dstUSM(*this, "DstUSM", "Use USM for host destination buffers") {}
34+
dstUSM(*this, "DstUSM", "Use USM for host destination buffers"),
35+
useBarrier(*this, "UseBarrier", "Submit barrier after each iteration (SYCL-only)") {}
3436
};
3537

3638
struct MemcpyExecute : TestCase<MemcpyExecuteArguments> {
@@ -44,3 +46,19 @@ struct MemcpyExecute : TestCase<MemcpyExecuteArguments> {
4446
return "measures time spent exeucting kernels interleved with memcpy operations";
4547
}
4648
};
49+
50+
// verify the results
51+
static inline TestResult verifyResults(size_t numThreads, size_t numOpsPerThread, size_t allocSize, std::vector<void *> &dst_buffers, int value) {
52+
for (size_t t = 0; t < numThreads; t++) {
53+
for (size_t i = 0; i < numOpsPerThread; i++) {
54+
for (size_t j = 0; j < allocSize / sizeof(int); j++) {
55+
auto v = *(((char *)dst_buffers[t]) + i * allocSize + j * sizeof(int));
56+
if (v != value) {
57+
std::cerr << "dst_buffers at: " << t << " " << i << " " << j << " , is: " << (int)v << std::endl;
58+
return TestResult::Error;
59+
}
60+
}
61+
}
62+
}
63+
return TestResult::Success;
64+
}

source/benchmarks/multithread_benchmark/gtest/memcpy_execute.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2024 Intel Corporation
2+
* Copyright (C) 2024-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -14,7 +14,7 @@
1414

1515
[[maybe_unused]] static const inline RegisterTestCase<MemcpyExecute> registerTestCase{};
1616

17-
class MemcpyExecuteTest : public ::testing::TestWithParam<std::tuple<Api, bool, size_t, size_t, size_t, bool, bool, bool, bool, bool>> {
17+
class MemcpyExecuteTest : public ::testing::TestWithParam<std::tuple<Api, bool, size_t, size_t, size_t, bool, bool, bool, bool, bool, bool>> {
1818
};
1919

2020
TEST_P(MemcpyExecuteTest, Test) {
@@ -29,6 +29,7 @@ TEST_P(MemcpyExecuteTest, Test) {
2929
args.useQueuePerThread = std::get<7>(GetParam());
3030
args.srcUSM = std::get<8>(GetParam());
3131
args.dstUSM = std::get<9>(GetParam());
32+
args.useBarrier = std::get<10>(GetParam());
3233
MemcpyExecute test;
3334
test.run(args);
3435
}
@@ -46,5 +47,6 @@ INSTANTIATE_TEST_SUITE_P(
4647
::testing::Values(false, true), // useEvents
4748
::testing::Values(true), // useQueuePerThread
4849
::testing::Values(true), // srcUSM
49-
::testing::Values(true) // dstUSM
50+
::testing::Values(true), // dstUSM
51+
::testing::Values(false) // useBarrier
5052
));
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
/*
2+
* Copyright (C) 2024-2025 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "framework/sycl/sycl.h"
9+
#include "framework/test_case/register_test_case.h"
10+
#include "framework/utility/timer.h"
11+
12+
#include "definitions/memcpy_execute.h"
13+
14+
#include <mutex>
15+
#include <shared_mutex>
16+
#include <thread>
17+
18+
static auto inOrder = sycl::property::queue::in_order();
19+
static const sycl::property_list queueProps[] = {
20+
sycl::property_list{},
21+
sycl::property_list{inOrder}};
22+
23+
static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &statistics) {
24+
MeasurementFields typeSelector(MeasurementUnit::Microseconds, MeasurementType::Cpu);
25+
26+
if (isNoopRun()) {
27+
statistics.pushUnitAndType(typeSelector.getUnit(), typeSelector.getType());
28+
return TestResult::Nooped;
29+
}
30+
31+
bool inOrderQueue = arguments.inOrderQueue;
32+
bool measureCompletionTime = arguments.measureCompletionTime;
33+
size_t numOpsPerThread = arguments.numOpsPerThread;
34+
size_t numThreads = arguments.numThreads;
35+
size_t allocSize = arguments.allocSize;
36+
bool useEvents = arguments.useEvents;
37+
bool useQueuePerThread = arguments.useQueuePerThread;
38+
bool srcUSM = arguments.srcUSM;
39+
bool dstUSM = arguments.dstUSM;
40+
bool useBarrier = arguments.useBarrier;
41+
size_t arraySize = allocSize / sizeof(int);
42+
43+
if (!inOrderQueue) {
44+
std::cerr << "Out of order mode not supported yet" << std::endl;
45+
return TestResult::Error;
46+
}
47+
48+
// Setup
49+
Timer timer;
50+
51+
const size_t gws = arraySize;
52+
const size_t lws = 1u;
53+
sycl::nd_range<1> range(gws, lws);
54+
55+
auto queuePropsIndex = 0;
56+
queuePropsIndex |= arguments.inOrderQueue ? 0x1 : 0;
57+
58+
std::vector<std::vector<void *>> usm(numThreads);
59+
std::vector<sycl::queue> queues;
60+
61+
// Setup queues (or a single queue if !useQueuePerThread)
62+
if (!useQueuePerThread) {
63+
sycl::queue singleQueue{queueProps[queuePropsIndex]};
64+
for (size_t i = 0; i < numThreads; i++) {
65+
queues.push_back(singleQueue);
66+
}
67+
} else {
68+
for (size_t i = 0; i < numThreads; i++) {
69+
queues.emplace_back(queueProps[queuePropsIndex]);
70+
}
71+
}
72+
73+
void *src_buffer;
74+
std::vector<void *> dst_buffers;
75+
76+
if (srcUSM) {
77+
src_buffer = sycl::malloc_host(allocSize, queues[0].get_context());
78+
} else {
79+
src_buffer = malloc(allocSize);
80+
}
81+
82+
if (src_buffer == nullptr) {
83+
std::cerr << "Failed to allocate memory for src_buffer" << std::endl;
84+
return TestResult::Error;
85+
}
86+
87+
memset(src_buffer, 99, allocSize);
88+
89+
// Setup USM allocations
90+
for (size_t i = 0; i < numThreads; i++) {
91+
for (size_t j = 0; j < numOpsPerThread; j++) {
92+
usm[i].push_back(sycl::malloc_device(allocSize, queues[i].get_device(), queues[i].get_context()));
93+
}
94+
95+
dst_buffers.emplace_back();
96+
97+
if (dstUSM) {
98+
dst_buffers.back() = sycl::malloc_host(allocSize * numOpsPerThread, queues[0].get_context());
99+
} else {
100+
dst_buffers.back() = malloc(allocSize * numOpsPerThread);
101+
}
102+
if (dst_buffers.back() == nullptr) {
103+
std::cerr << "Failed to allocate memory for dst_buffer" << std::endl;
104+
return TestResult::Error;
105+
}
106+
memset(dst_buffers.back(), 0, allocSize * numOpsPerThread);
107+
}
108+
109+
auto worker = [&](size_t thread_id, Timer &timer) {
110+
timer.measureStart();
111+
112+
auto &queue = queues[thread_id];
113+
for (size_t i = 0; i < numOpsPerThread; i++) {
114+
int *usm_ptr = (int *)usm[thread_id][i];
115+
auto host_dst = ((char *)dst_buffers[thread_id]) + i * allocSize;
116+
117+
if (useEvents) {
118+
queue.memcpy(usm_ptr, src_buffer, allocSize);
119+
queue.parallel_for(sycl::range<1>{arraySize}, [usm_ptr](sycl::item<1> itemId) {
120+
auto id = itemId.get_id(0);
121+
usm_ptr[id] = 1;
122+
});
123+
queue.memcpy(host_dst, usm_ptr, allocSize);
124+
125+
if (useBarrier) {
126+
queue.ext_oneapi_submit_barrier();
127+
}
128+
} else {
129+
sycl::ext::oneapi::experimental::memcpy(queue, usm_ptr, src_buffer, allocSize);
130+
sycl::ext::oneapi::experimental::nd_launch(queue, range, [usm_ptr](sycl::nd_item<1> itemId) {
131+
auto id = itemId.get_global_id(0);
132+
usm_ptr[id] = 1;
133+
});
134+
sycl::ext::oneapi::experimental::memcpy(queue, host_dst, usm_ptr, allocSize);
135+
136+
if (useBarrier) {
137+
queue.ext_oneapi_submit_barrier();
138+
}
139+
}
140+
}
141+
142+
if (!measureCompletionTime)
143+
timer.measureEnd();
144+
145+
queue.wait();
146+
147+
if (measureCompletionTime)
148+
timer.measureEnd();
149+
};
150+
151+
// warmup
152+
for (auto iteration = 0u; iteration < arguments.numThreads; iteration++) {
153+
std::vector<std::thread> threads;
154+
for (size_t j = 0u; j < arguments.numThreads; j++) {
155+
threads.emplace_back([&, j] {
156+
Timer dummyTimer;
157+
worker(j, dummyTimer);
158+
});
159+
}
160+
for (auto &thread : threads) {
161+
thread.join();
162+
}
163+
}
164+
165+
// Benchmark
166+
for (size_t i = 0u; i < arguments.iterations; i++) {
167+
std::shared_mutex barrier;
168+
std::vector<std::thread> threads;
169+
std::vector<Timer> timers(arguments.numThreads);
170+
171+
std::unique_lock<std::shared_mutex> lock(barrier);
172+
for (size_t j = 0u; j < arguments.numThreads; j++) {
173+
threads.emplace_back([&, j] {
174+
std::shared_lock<std::shared_mutex> lock(barrier);
175+
worker(j, timers[j]);
176+
});
177+
}
178+
lock.unlock();
179+
180+
auto aggregatedTime = std::chrono::high_resolution_clock::duration(0);
181+
for (size_t j = 0u; j < arguments.numThreads; j++) {
182+
threads[j].join();
183+
aggregatedTime += timers[j].get();
184+
}
185+
auto avgTime = aggregatedTime / arguments.numThreads;
186+
187+
#ifndef NDEBUG
188+
auto res = verifyResults(numThreads, numOpsPerThread, allocSize, dst_buffers, 1);
189+
if (res != TestResult::Success)
190+
return res;
191+
#endif
192+
193+
statistics.pushValue(avgTime, typeSelector.getUnit(), typeSelector.getType());
194+
}
195+
196+
if (srcUSM) {
197+
sycl::free(src_buffer, queues[0].get_context());
198+
} else {
199+
free(src_buffer);
200+
}
201+
202+
return TestResult::Success;
203+
}
204+
205+
[[maybe_unused]] static RegisterTestCaseImplementation<MemcpyExecute> registerTestCase(run, Api::SYCL);

source/benchmarks/multithread_benchmark/implementations/ur/memcpy_execute_interleaved.cpp

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2024 Intel Corporation
2+
* Copyright (C) 2024-2025 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -39,6 +39,7 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
3939
bool srcUSM = arguments.srcUSM;
4040
bool dstUSM = arguments.dstUSM;
4141
size_t arraySize = allocSize / sizeof(int);
42+
bool useBarrier = arguments.useBarrier;
4243

4344
if (!useEvents && !inOrderQueue) {
4445
std::cerr << "In order queue must be used when events are not used" << std::endl;
@@ -98,6 +99,10 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
9899
} else {
99100
src_buffer = malloc(allocSize);
100101
}
102+
if (src_buffer == nullptr) {
103+
std::cerr << "Failed to allocate memory for src_buffer" << std::endl;
104+
return TestResult::Error;
105+
}
101106

102107
memset(src_buffer, 99, allocSize);
103108

@@ -121,13 +126,17 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
121126
} else {
122127
dst_buffers.back() = malloc(allocSize * numOpsPerThread);
123128
}
129+
if (dst_buffers.back() == nullptr) {
130+
std::cerr << "Failed to allocate memory for dst_buffer" << std::endl;
131+
return TestResult::Error;
132+
}
124133
memset(dst_buffers.back(), 0, allocSize * numOpsPerThread);
125134
}
126135

127136
auto worker = [&](size_t thread_id, Timer &timer) {
128137
std::vector<std::vector<ur_event_handle_t>> events(numOpsPerThread);
129138
for (auto &events_vec : events) {
130-
events_vec.assign(3, nullptr);
139+
events_vec.assign(4, nullptr);
131140
}
132141

133142
timer.measureStart();
@@ -145,6 +154,10 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
145154
EXPECT_UR_RESULT_SUCCESS(urEnqueueUSMMemcpy(queue, false, usm_ptr, src_buffer, allocSize, 0, nullptr, memcpySignalEventPtr));
146155
EXPECT_UR_RESULT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &arraySize, nullptr, useEvents, memcpySignalEventPtr, kernelSignalEventPtr));
147156
EXPECT_UR_RESULT_SUCCESS(urEnqueueUSMMemcpy(queue, false, host_dst, usm_ptr, allocSize, useEvents, kernelSignalEventPtr, finalSignalEventPtr));
157+
158+
if (useBarrier) {
159+
EXPECT_UR_RESULT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, useEvents, finalSignalEventPtr, useEvents ? &events[i][3] : nullptr));
160+
}
148161
}
149162

150163
if (!measureCompletionTime)
@@ -206,18 +219,9 @@ static TestResult run(const MemcpyExecuteArguments &arguments, Statistics &stati
206219
auto avgTime = aggregatedTime / arguments.numThreads;
207220

208221
#ifndef NDEBUG
209-
// verify the results
210-
for (size_t t = 0; t < numThreads; t++) {
211-
for (size_t i = 0; i < numOpsPerThread; i++) {
212-
for (size_t j = 0; j < allocSize / sizeof(int); j++) {
213-
auto v = *(((char *)dst_buffers[t]) + i * allocSize + j * sizeof(int));
214-
if (v != 1) {
215-
std::cerr << "dst_buffers at: " << t << " " << i << " " << j << " , is: " << (int)v << std::endl;
216-
return TestResult::Error;
217-
}
218-
}
219-
}
220-
}
222+
auto res = verifyResults(numThreads, numOpsPerThread, allocSize, dst_buffers, 1);
223+
if (res != TestResult::Success)
224+
return res;
221225
#endif
222226

223227
statistics.pushValue(avgTime, typeSelector.getUnit(), typeSelector.getType());

0 commit comments

Comments
 (0)