Skip to content

[SYCL] Fix a race condition when enqueueing an interop kernel #8111

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2106,7 +2106,12 @@ pi_int32 enqueueImpKernel(
auto ContextImpl = Queue->getContextImplPtr();
auto DeviceImpl = Queue->getDeviceImplPtr();
RT::PiKernel Kernel = nullptr;
std::mutex *KernelMutex = nullptr;
// Cacheable kernels use per-kernel mutexes that will be fetched from the
// cache, others (e.g. interoperability kernels) share a single mutex.
// TODO consider adding a PiKernel -> mutex map for allowing to enqueue
// different PiKernel's in parallel.
static std::mutex NoncacheableEnqueueMutex;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be probably better to have an std::mutexstored in kernel_impl. So, we do not serialize submission of different sycl::kernels.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@romanovvlad I tend to agree, although wouldn't this mutex need to be tied to pi_kernel like described in the comment above? Otherwise we will run into issues if two interop sycl::kernel are created with the same native kernel.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable.

std::mutex *KernelMutex = &NoncacheableEnqueueMutex;
RT::PiProgram Program = nullptr;

std::shared_ptr<kernel_impl> SyclKernelImpl;
Expand Down Expand Up @@ -2179,18 +2184,13 @@ pi_int32 enqueueImpKernel(
detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
OSModuleHandle, Program, KernelName);
}
if (KernelMutex != nullptr) {
// For cacheable kernels, we use per-kernel mutex
{
assert(KernelMutex);
std::lock_guard<std::mutex> Lock(*KernelMutex);
Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
NDRDesc, EventsWaitList, OutEvent,
EliminatedArgMask, getMemAllocationFunc);
} else {
Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
NDRDesc, EventsWaitList, OutEvent,
EliminatedArgMask, getMemAllocationFunc);
}

if (PI_SUCCESS != Error) {
// If we have got non-success error code, let's analyze it to emit nice
// exception explaining what was wrong
Expand Down
133 changes: 14 additions & 119 deletions sycl/unittests/handler/SetArgForLocalAccessor.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
//==------- SetArgForLocalAccessor.cpp --- Handler unit tests --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <gtest/gtest.h>
#include <helpers/KernelInteropCommon.hpp>
#include <helpers/PiMock.hpp>

#include <sycl/sycl.hpp>
Expand All @@ -9,135 +18,20 @@

namespace {

struct TestContext {
size_t localBufferArgSize = 0;

// SYCL RT has number of checks that all devices and contexts are consistent
// between kernel, kernel_bundle and other objects.
//
// To ensure that those checks pass, we intercept some PI calls to extract
// the exact PI handles of device and context used in queue creation to later
// return them when program/context/kernel info is requested.
pi_device deviceHandle;
pi_context contextHandle;

pi_program programHandle = createDummyHandle<pi_program>();

~TestContext() { releaseDummyHandle<pi_program>(programHandle); }
};

TestContext GlobalContext;

} // namespace
size_t LocalBufferArgSize = 0;

pi_result redefined_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index,
size_t arg_size, const void *arg_value) {
GlobalContext.localBufferArgSize = arg_size;

return PI_SUCCESS;
}

pi_result after_piContextGetInfo(pi_context context, pi_context_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
switch (param_name) {
case PI_CONTEXT_INFO_DEVICES:
if (param_value)
*static_cast<pi_device *>(param_value) = GlobalContext.deviceHandle;
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.deviceHandle);
break;
default:;
}

return PI_SUCCESS;
}

pi_result after_piProgramGetInfo(pi_program program, pi_program_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {

switch (param_name) {
case PI_PROGRAM_INFO_DEVICES:
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.deviceHandle);
if (param_value)
*static_cast<pi_device *>(param_value) = GlobalContext.deviceHandle;
break;
default:;
}

return PI_SUCCESS;
}

pi_result redefined_piProgramGetBuildInfo(pi_program program, pi_device device,
_pi_program_build_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
switch (param_name) {
case PI_PROGRAM_BUILD_INFO_BINARY_TYPE:
if (param_value_size_ret)
*param_value_size_ret = sizeof(pi_program_binary_type);
if (param_value)
*static_cast<pi_program_binary_type *>(param_value) =
PI_PROGRAM_BINARY_TYPE_EXECUTABLE;
break;
default:;
}

return PI_SUCCESS;
}

pi_result after_piContextCreate(const pi_context_properties *properties,
pi_uint32 num_devices, const pi_device *devices,
void (*pfn_notify)(const char *errinfo,
const void *private_info,
size_t cb, void *user_data),
void *user_data, pi_context *ret_context) {
if (ret_context)
GlobalContext.contextHandle = *ret_context;
GlobalContext.deviceHandle = *devices;
return PI_SUCCESS;
}

pi_result after_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
switch (param_name) {
case PI_KERNEL_INFO_CONTEXT:
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.contextHandle);
if (param_value)
*static_cast<pi_context *>(param_value) = GlobalContext.contextHandle;
break;
case PI_KERNEL_INFO_PROGRAM:
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.programHandle);
if (param_value)
*(pi_program *)param_value = GlobalContext.programHandle;
break;
default:;
}
LocalBufferArgSize = arg_size;

return PI_SUCCESS;
}

TEST(HandlerSetArg, LocalAccessor) {
sycl::unittest::PiMock Mock;

redefineMockForKernelInterop(Mock);
Mock.redefine<sycl::detail::PiApiKind::piKernelSetArg>(
redefined_piKernelSetArg);
Mock.redefineAfter<sycl::detail::PiApiKind::piContextCreate>(
after_piContextCreate);
Mock.redefineAfter<sycl::detail::PiApiKind::piProgramGetInfo>(
after_piProgramGetInfo);
Mock.redefineAfter<sycl::detail::PiApiKind::piContextGetInfo>(
after_piContextGetInfo);
Mock.redefineAfter<sycl::detail::PiApiKind::piKernelGetInfo>(
after_piKernelGetInfo);
Mock.redefine<sycl::detail::PiApiKind::piProgramGetBuildInfo>(
redefined_piProgramGetBuildInfo);

constexpr size_t Size = 128;
sycl::queue Q;
Expand All @@ -154,5 +48,6 @@ TEST(HandlerSetArg, LocalAccessor) {
CGH.single_task(Kernel);
}).wait();

ASSERT_EQ(GlobalContext.localBufferArgSize, Size * sizeof(float));
ASSERT_EQ(LocalBufferArgSize, Size * sizeof(float));
}
} // namespace
126 changes: 126 additions & 0 deletions sycl/unittests/helpers/KernelInteropCommon.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
//==-- KernelInteropCommon.hpp --- Common kernel interop redefinitions -----==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <helpers/PiMock.hpp>

struct TestContext {

// SYCL RT has number of checks that all devices and contexts are consistent
// between kernel, kernel_bundle and other objects.
//
// To ensure that those checks pass, we intercept some PI calls to extract
// the exact PI handles of device and context used in queue creation to later
// return them when program/context/kernel info is requested.
pi_device deviceHandle;
pi_context contextHandle;

pi_program programHandle = createDummyHandle<pi_program>();

~TestContext() { releaseDummyHandle<pi_program>(programHandle); }
};

TestContext GlobalContext;

pi_result after_piContextGetInfo(pi_context context, pi_context_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
switch (param_name) {
case PI_CONTEXT_INFO_DEVICES:
if (param_value)
*static_cast<pi_device *>(param_value) = GlobalContext.deviceHandle;
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.deviceHandle);
break;
default:;
}

return PI_SUCCESS;
}

pi_result after_piProgramGetInfo(pi_program program, pi_program_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {

switch (param_name) {
case PI_PROGRAM_INFO_DEVICES:
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.deviceHandle);
if (param_value)
*static_cast<pi_device *>(param_value) = GlobalContext.deviceHandle;
break;
default:;
}

return PI_SUCCESS;
}

pi_result redefined_piProgramGetBuildInfo(pi_program program, pi_device device,
_pi_program_build_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
switch (param_name) {
case PI_PROGRAM_BUILD_INFO_BINARY_TYPE:
if (param_value_size_ret)
*param_value_size_ret = sizeof(pi_program_binary_type);
if (param_value)
*static_cast<pi_program_binary_type *>(param_value) =
PI_PROGRAM_BINARY_TYPE_EXECUTABLE;
break;
default:;
}

return PI_SUCCESS;
}

pi_result after_piContextCreate(const pi_context_properties *properties,
pi_uint32 num_devices, const pi_device *devices,
void (*pfn_notify)(const char *errinfo,
const void *private_info,
size_t cb, void *user_data),
void *user_data, pi_context *ret_context) {
if (ret_context)
GlobalContext.contextHandle = *ret_context;
GlobalContext.deviceHandle = *devices;
return PI_SUCCESS;
}

pi_result after_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
switch (param_name) {
case PI_KERNEL_INFO_CONTEXT:
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.contextHandle);
if (param_value)
*static_cast<pi_context *>(param_value) = GlobalContext.contextHandle;
break;
case PI_KERNEL_INFO_PROGRAM:
if (param_value_size_ret)
*param_value_size_ret = sizeof(GlobalContext.programHandle);
if (param_value)
*(pi_program *)param_value = GlobalContext.programHandle;
break;
default:;
}

return PI_SUCCESS;
}

void redefineMockForKernelInterop(sycl::unittest::PiMock &Mock) {
Mock.redefineAfter<sycl::detail::PiApiKind::piContextCreate>(
after_piContextCreate);
Mock.redefineAfter<sycl::detail::PiApiKind::piProgramGetInfo>(
after_piProgramGetInfo);
Mock.redefineAfter<sycl::detail::PiApiKind::piContextGetInfo>(
after_piContextGetInfo);
Mock.redefineAfter<sycl::detail::PiApiKind::piKernelGetInfo>(
after_piKernelGetInfo);
Mock.redefine<sycl::detail::PiApiKind::piProgramGetBuildInfo>(
redefined_piProgramGetBuildInfo);
}
1 change: 1 addition & 0 deletions sycl/unittests/thread_safety/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_sycl_unittest(ThreadSafetyTests OBJECT
HostAccessorDeadLock.cpp
InteropKernelEnqueue.cpp
)
65 changes: 65 additions & 0 deletions sycl/unittests/thread_safety/InteropKernelEnqueue.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
//==-------- InteropKernelEnqueue.cpp --- Thread safety unit tests ---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <cstddef>
#include <gtest/gtest.h>
#include <helpers/KernelInteropCommon.hpp>
#include <helpers/PiMock.hpp>
#include <sycl/sycl.hpp>

#include "ThreadUtils.h"

namespace {
using namespace sycl;

constexpr std::size_t NArgs = 16;
constexpr std::size_t ThreadCount = 4;
constexpr std::size_t LaunchCount = 8;

pi_uint32 LastArgSet = -1;
std::size_t LastThread = -1;
pi_result redefined_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index,
size_t arg_size, const void *arg_value) {
EXPECT_EQ((LastArgSet + 1) % NArgs, arg_index);
LastArgSet = arg_index;
std::size_t ArgValue = *static_cast<const std::size_t *>(arg_value);
if (arg_index == 0)
LastThread = ArgValue;
else
EXPECT_EQ(LastThread, ArgValue);
return PI_SUCCESS;
}

TEST(KernelEnqueue, InteropKernel) {
unittest::PiMock Mock;
redefineMockForKernelInterop(Mock);
Mock.redefine<sycl::detail::PiApiKind::piKernelSetArg>(
redefined_piKernelSetArg);

platform Plt = Mock.getPlatform();
queue Q;

DummyHandleT Handle;
auto KernelCL = reinterpret_cast<typename sycl::backend_traits<
sycl::backend::opencl>::template input_type<sycl::kernel>>(&Handle);
auto Kernel =
sycl::make_kernel<sycl::backend::opencl>(KernelCL, Q.get_context());

auto TestLambda = [&](std::size_t ThreadId) {
Q.submit([&](sycl::handler &CGH) {
for (std::size_t I = 0; I < NArgs; ++I)
CGH.set_arg(I, ThreadId);
CGH.single_task(Kernel);
}).wait();
};

for (std::size_t I = 0; I < LaunchCount; ++I) {
ThreadPool Pool(ThreadCount, TestLambda);
}
}
} // namespace