Skip to content

Commit a1edb96

Browse files
authored
[SYCL] Fix issue of acquring kernel twice (#11953)
In #11751, ref counting of kernels objects was changed to be more accurate in order to allow for in-memory caching to be disabled. When getting a kernel form the cache, the ref count the kernel handle is now incremented (when caching is enabled). Thus, a method like `ProgramManager::getOrCreateKernel` will increment the ref count of the kernel it gets. However, in `enqueueImpKernel`, when enqueuing a kernel with a kernel bundle,`ProgramManager::getOrCreateKernel` is called twice, first indirectly by: https://github.com/intel/llvm/blob/c43a90f28eebfcdf1bc1d55430485e2834790a60/sycl/source/detail/scheduler/commands.cpp#L2527-L2528 and second directly by: https://github.com/intel/llvm/blob/c43a90f28eebfcdf1bc1d55430485e2834790a60/sycl/source/detail/scheduler/commands.cpp#L2538-L2548 This means that the ref count of the acquired kernel is incremented twice, yet the rest of the function will only free once, which leads to a leak of the kernel. As the second comment and asserts say, the only need for the second call to `getOrCreateKernel` is to fetch the mutex associated to the cached kernel retrieved from the first call, so this PR adjusts `get_kernel` to save this mutex and forgo this extra `getOrCreateKernel` call and unintentional additional ref count.
1 parent 0351926 commit a1edb96

File tree

6 files changed

+24
-22
lines changed

6 files changed

+24
-22
lines changed

sycl/source/detail/kernel_bundle_impl.hpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -531,15 +531,14 @@ class kernel_bundle_impl {
531531
"The kernel bundle does not contain the kernel "
532532
"identified by kernelId.");
533533

534-
sycl::detail::pi::PiKernel Kernel = nullptr;
535-
const KernelArgMask *ArgMask = nullptr;
536-
std::tie(Kernel, std::ignore, ArgMask) =
534+
auto [Kernel, CacheMutex, ArgMask] =
537535
detail::ProgramManager::getInstance().getOrCreateKernel(
538536
MContext, KernelID.get_name(), /*PropList=*/{},
539537
SelectedImage->get_program_ref());
540538

541-
std::shared_ptr<kernel_impl> KernelImpl = std::make_shared<kernel_impl>(
542-
Kernel, detail::getSyclObjImpl(MContext), SelectedImage, Self, ArgMask);
539+
std::shared_ptr<kernel_impl> KernelImpl =
540+
std::make_shared<kernel_impl>(Kernel, detail::getSyclObjImpl(MContext),
541+
SelectedImage, Self, ArgMask, CacheMutex);
543542

544543
return detail::createSyclObjFromImpl<kernel>(KernelImpl);
545544
}

sycl/source/detail/kernel_impl.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,11 @@ kernel_impl::kernel_impl(sycl::detail::pi::PiKernel Kernel,
6161
ContextImplPtr ContextImpl,
6262
DeviceImageImplPtr DeviceImageImpl,
6363
KernelBundleImplPtr KernelBundleImpl,
64-
const KernelArgMask *ArgMask)
64+
const KernelArgMask *ArgMask, std::mutex *CacheMutex)
6565
: MKernel(Kernel), MContext(std::move(ContextImpl)), MProgramImpl(nullptr),
6666
MCreatedFromSource(false), MDeviceImageImpl(std::move(DeviceImageImpl)),
6767
MKernelBundleImpl(std::move(KernelBundleImpl)),
68-
MKernelArgMaskPtr{ArgMask} {
68+
MKernelArgMaskPtr{ArgMask}, MCacheMutex{CacheMutex} {
6969
MIsInterop = MKernelBundleImpl->isInterop();
7070
}
7171

sycl/source/detail/kernel_impl.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ class kernel_impl {
7373
kernel_impl(sycl::detail::pi::PiKernel Kernel, ContextImplPtr ContextImpl,
7474
DeviceImageImplPtr DeviceImageImpl,
7575
KernelBundleImplPtr KernelBundleImpl,
76-
const KernelArgMask *ArgMask);
76+
const KernelArgMask *ArgMask, std::mutex *CacheMutex);
7777

7878
/// Constructs a SYCL kernel for host device
7979
///
@@ -183,6 +183,7 @@ class kernel_impl {
183183
}
184184

185185
const KernelArgMask *getKernelArgMask() const { return MKernelArgMaskPtr; }
186+
std::mutex *getCacheMutex() const { return MCacheMutex; }
186187

187188
private:
188189
sycl::detail::pi::PiKernel MKernel;
@@ -194,6 +195,7 @@ class kernel_impl {
194195
bool MIsInterop = false;
195196
std::mutex MNoncacheableEnqueueMutex;
196197
const KernelArgMask *MKernelArgMaskPtr;
198+
std::mutex *MCacheMutex;
197199

198200
bool isBuiltInKernel(const device &Device) const;
199201
void checkIfValidForNumArgsInfoQuery() const;

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,8 @@ static void emitBuiltProgramInfo(const pi_program &Prog,
622622
}
623623
}
624624

625+
// When caching is enabled, the returned PiProgram will already have
626+
// its ref count incremented.
625627
sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram(
626628
const ContextImplPtr &ContextImpl, const DeviceImplPtr &DeviceImpl,
627629
const std::string &KernelName, bool JITCompilationIsRequired) {
@@ -739,6 +741,8 @@ sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram(
739741
return *BuildResult->Ptr.load();
740742
}
741743

744+
// When caching is enabled, the returned PiProgram and PiKernel will
745+
// already have their ref count incremented.
742746
std::tuple<sycl::detail::pi::PiKernel, std::mutex *, const KernelArgMask *,
743747
sycl::detail::pi::PiProgram>
744748
ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl,
@@ -2432,6 +2436,8 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage,
24322436
return createSyclObjFromImpl<device_image_plain>(ExecImpl);
24332437
}
24342438

2439+
// When caching is enabled, the returned PiKernel will already have
2440+
// its ref count incremented.
24352441
std::tuple<sycl::detail::pi::PiKernel, std::mutex *, const KernelArgMask *>
24362442
ProgramManager::getOrCreateKernel(const context &Context,
24372443
const std::string &KernelName,

sycl/source/detail/scheduler/commands.cpp

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2535,18 +2535,7 @@ pi_int32 enqueueImpKernel(
25352535
Program = DeviceImageImpl->get_program_ref();
25362536

25372537
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2538-
// When caching is enabled, kernel objects can be shared,
2539-
// so we need to retrieve the mutex associated to it via
2540-
// getOrCreateKernel
2541-
if (SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
2542-
auto [CachedKernel, CachedKernelMutex, CachedEliminatedArgMask] =
2543-
detail::ProgramManager::getInstance().getOrCreateKernel(
2544-
KernelBundleImplPtr->get_context(), KernelName,
2545-
/*PropList=*/{}, Program);
2546-
assert(CachedKernel == Kernel);
2547-
assert(CachedEliminatedArgMask == EliminatedArgMask);
2548-
KernelMutex = CachedKernelMutex;
2549-
}
2538+
KernelMutex = SyclKernelImpl->getCacheMutex();
25502539
} else if (nullptr != MSyclKernel) {
25512540
assert(MSyclKernel->get_info<info::kernel::context>() ==
25522541
Queue->get_context());

sycl/test-e2e/KernelAndProgram/disable-caching.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
// if and only if caching is disabled.
33

44
// RUN: %{build} -o %t.out
5-
// RUN: env SYCL_PI_TRACE=-1 SYCL_CACHE_IN_MEM=0 %{run} %t.out \
5+
// RUN: env ZE_DEBUG=-6 SYCL_PI_TRACE=-1 SYCL_CACHE_IN_MEM=0 %{run} %t.out \
66
// RUN: | FileCheck %s
7-
// RUN: env SYCL_PI_TRACE=-1 %{run} %t.out \
7+
// RUN: env ZE_DEBUG=-6 SYCL_PI_TRACE=-1 %{run} %t.out \
88
// RUN: | FileCheck %s --check-prefixes=CHECK-CACHE
99
#include <sycl/sycl.hpp>
1010

@@ -25,8 +25,10 @@ int main() {
2525

2626
// CHECK-CACHE: piProgramCreate
2727
// CHECK-CACHE: piProgramRetain
28+
// CHECK-CACHE-NOT: piProgramRetain
2829
// CHECK-CACHE: piKernelCreate
2930
// CHECK-CACHE: piKernelRetain
31+
// CHECK-CACHE-NOT: piKernelCreate
3032
// CHECK-CACHE: piEnqueueKernelLaunch
3133
// CHECK-CACHE: piKernelRelease
3234
// CHECK-CACHE: piProgramRelease
@@ -44,8 +46,10 @@ int main() {
4446

4547
// CHECK-CACHE: piProgramCreate
4648
// CHECK-CACHE: piProgramRetain
49+
// CHECK-CACHE-NOT: piProgramRetain
4750
// CHECK-CACHE: piKernelCreate
4851
// CHECK-CACHE: piKernelRetain
52+
// CHECK-CACHE-NOT: piKernelCreate
4953
// CHECK-CACHE: piEnqueueKernelLaunch
5054
// CHECK-CACHE: piKernelRelease
5155
// CHECK-CACHE: piProgramRelease
@@ -62,8 +66,10 @@ int main() {
6266

6367
// CHECK-CACHE: piProgramCreate
6468
// CHECK-CACHE: piProgramRetain
69+
// CHECK-CACHE-NOT: piProgramRetain
6570
// CHECK-CACHE: piKernelCreate
6671
// CHECK-CACHE: piKernelRetain
72+
// CHECK-CACHE-NOT: piKernelCreate
6773
// CHECK-CACHE: piEnqueueKernelLaunch
6874
// CHECK-CACHE: piKernelRelease
6975
// CHECK-CACHE: piProgramRelease

0 commit comments

Comments
 (0)