Skip to content

[SYCL][NATIVECPU][UR] performance improvements in NativeCPU adapter #17102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 18 additions & 15 deletions unified-runtime/source/adapters/native_cpu/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,19 +217,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
auto numGroups = groups.size();
auto groupsPerThread = numGroups / numParallelThreads;
auto remainder = numGroups % numParallelThreads;
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
futures.emplace_back(
tp.schedule_task([groups, thread, groupsPerThread,
&kernel = *kernel](size_t threadId) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, kernel);
}
}));
if (groupsPerThread) {
for (unsigned thread = 0; thread < numParallelThreads; thread++) {
futures.emplace_back(
tp.schedule_task([groups, thread, groupsPerThread,
&kernel = *kernel](size_t threadId) {
for (unsigned i = 0; i < groupsPerThread; i++) {
auto index = thread * groupsPerThread + i;
groups[index](threadId, kernel);
}
}));
}
}

// schedule the remaining tasks
auto remainder = numGroups % numParallelThreads;
if (remainder) {
futures.emplace_back(
tp.schedule_task([groups, remainder,
Expand Down Expand Up @@ -263,11 +265,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
return UR_RESULT_SUCCESS;
}

ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent,
const std::function<ur_result_t()> &f) {
template <class T>
static inline ur_result_t
withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent, T &&f) {
urEventWait(numEventsInWaitList, phEventWaitList);
ur_event_handle_t event = nullptr;
if (phEvent) {
Expand Down
21 changes: 10 additions & 11 deletions unified-runtime/source/adapters/native_cpu/threadpool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#include <condition_variable>
#include <cstdlib>
#include <forward_list>
#include <functional>
#include <future>
#include <iterator>
#include <mutex>
Expand All @@ -24,7 +23,7 @@

namespace native_cpu {

using worker_task_t = std::function<void(size_t)>;
using worker_task_t = std::packaged_task<void(size_t)>;

namespace detail {

Expand Down Expand Up @@ -63,11 +62,11 @@ class worker_thread {
m_isRunning.store(true, std::memory_order_release);
}

inline void schedule(const worker_task_t &task) {
inline void schedule(worker_task_t &&task) {
{
std::lock_guard<std::mutex> lock(m_workMutex);
// Add the task to the queue
m_tasks.push(task);
m_tasks.emplace(std::move(task));
++m_numTasks;
}
m_startWorkCondition.notify_one();
Expand Down Expand Up @@ -135,9 +134,9 @@ class simple_thread_pool {
m_isRunning.store(false, std::memory_order_release);
}

inline void schedule(const worker_task_t &task) {
inline void schedule(worker_task_t &&task) {
// Schedule the task on the best available worker thread
this->best_worker().schedule(task);
this->best_worker().schedule(std::move(task));
}

inline bool is_running() const noexcept {
Expand Down Expand Up @@ -201,11 +200,11 @@ template <typename ThreadPoolT> class threadpool_interface {

threadpool_interface() : threadpool() {}

auto schedule_task(worker_task_t &&task) {
auto workerTask = std::make_shared<std::packaged_task<void(size_t)>>(
[task](auto &&PH1) { return task(std::forward<decltype(PH1)>(PH1)); });
threadpool.schedule([=](size_t threadId) { (*workerTask)(threadId); });
return workerTask->get_future();
template <class T> std::future<void> schedule_task(T &&task) {
auto workerTask = std::packaged_task<void(size_t)>(std::forward<T>(task));
auto ret = workerTask.get_future();
threadpool.schedule(std::move(workerTask));
return ret;
}
};

Expand Down