Skip to content

Commit 9322d14

Browse files
authored
[SYCL] Add environment variable to disable in-memory program caching (#11751)
This PR adds an environment variable `SYCL_CACHE_IN_MEM` to control the in-memory caching of programs. Currently, every program/kernel is saved in the global `KernelProgramCache`, which means that every program/kernel will not be released until end of the program when the destructor of `KernelProgramCache` is ran. By enabling this environment variables, caching is not performed and the resources program and kernels use are freed after use.
1 parent c0ebe4f commit 9322d14

File tree

10 files changed

+222
-27
lines changed

10 files changed

+222
-27
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ compiler and runtime.
1414
| `SYCL_CACHE_DIR` | Path | Path to persistent cache root directory. Default values are `%AppData%\libsycl_cache` for Windows and `$XDG_CACHE_HOME/libsycl_cache` on Linux, if `XDG_CACHE_HOME` is not set then `$HOME/.cache/libsycl_cache`. When none of the environment variables are set SYCL persistent cache is disabled. |
1515
| `SYCL_CACHE_DISABLE_PERSISTENT (deprecated)` | Any(\*) | Has no effect. |
1616
| `SYCL_CACHE_PERSISTENT` | Integer | Controls persistent device compiled code cache. Turns it on if set to '1' and turns it off if set to '0'. When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries. Default is off. |
17+
| `SYCL_CACHE_IN_MEM` | '1' or '0' | Enable ('1') or disable ('0') in-memory caching of device compiled code. When cache is enabled SYCL runtime will try to cache and reuse JIT-compiled binaries. Default is '1'. |
1718
| `SYCL_CACHE_EVICTION_DISABLE` | Any(\*) | Switches cache eviction off when the variable is set. |
1819
| `SYCL_CACHE_MAX_SIZE` | Positive integer | Cache eviction is triggered once total size of cached images exceeds the value in megabytes (default - 8 192 for 8 GB). Set to 0 to disable size-based cache eviction. |
1920
| `SYCL_CACHE_THRESHOLD` | Positive integer | Cache eviction threshold in days (default value is 7 for 1 week). Set to 0 for disabling time-based cache eviction. |

sycl/source/detail/config.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,4 @@ CONFIG(SYCL_RT_WARNING_LEVEL, 4, __SYCL_RT_WARNING_LEVEL)
4040
CONFIG(SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE, 16, __SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE)
4141
CONFIG(ONEAPI_DEVICE_SELECTOR, 1024, __ONEAPI_DEVICE_SELECTOR)
4242
CONFIG(SYCL_ENABLE_FUSION_CACHING, 1, __SYCL_ENABLE_FUSION_CACHING)
43+
CONFIG(SYCL_CACHE_IN_MEM, 1, __SYCL_CACHE_IN_MEM)

sycl/source/detail/config.hpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,34 @@ template <> class SYCLConfig<SYCL_ENABLE_FUSION_CACHING> {
614614
}
615615
};
616616

617+
template <> class SYCLConfig<SYCL_CACHE_IN_MEM> {
618+
using BaseT = SYCLConfigBase<SYCL_CACHE_IN_MEM>;
619+
620+
public:
621+
static constexpr bool Default = true; // default is true
622+
static bool get() { return getCachedValue(); }
623+
static const char *getName() { return BaseT::MConfigName; }
624+
625+
private:
626+
static bool parseValue() {
627+
const char *ValStr = BaseT::getRawValue();
628+
if (!ValStr)
629+
return Default;
630+
if (strlen(ValStr) != 1 || (ValStr[0] != '0' && ValStr[0] != '1')) {
631+
std::string Msg =
632+
std::string{"Invalid value for bool configuration variable "} +
633+
getName() + std::string{": "} + ValStr;
634+
throw runtime_error(Msg, PI_ERROR_INVALID_OPERATION);
635+
}
636+
return ValStr[0] == '1';
637+
}
638+
639+
static bool getCachedValue() {
640+
static bool Val = parseValue();
641+
return Val;
642+
}
643+
};
644+
617645
#undef INVALID_CONFIG_EXCEPTION
618646

619647
} // namespace detail

sycl/source/detail/kernel_impl.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,6 @@ kernel_impl::kernel_impl(sycl::detail::pi::PiKernel Kernel,
6666
MCreatedFromSource(false), MDeviceImageImpl(std::move(DeviceImageImpl)),
6767
MKernelBundleImpl(std::move(KernelBundleImpl)),
6868
MKernelArgMaskPtr{ArgMask} {
69-
70-
// kernel_impl shared ownership of kernel handle
71-
if (!is_host()) {
72-
getPlugin()->call<PiApiKind::piKernelRetain>(MKernel);
73-
}
74-
7569
MIsInterop = MKernelBundleImpl->isInterop();
7670
}
7771

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -699,11 +699,21 @@ sycl::detail::pi::PiProgram ProgramManager::getBuiltPIProgram(
699699
return Cache.getOrInsertProgram(CacheKey);
700700
};
701701

702+
if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get())
703+
return BuildF();
704+
702705
auto BuildResult =
703706
getOrBuild<sycl::detail::pi::PiProgram, compile_program_error>(
704707
Cache, GetCachedBuildF, BuildF);
705708
// getOrBuild is not supposed to return nullptr
706709
assert(BuildResult != nullptr && "Invalid build result");
710+
711+
// If caching is enabled, one copy of the program handle will be
712+
// stored in the cache, and one handle is returned to the
713+
// caller. In that case, we need to increase the ref count of the
714+
// program.
715+
ContextImpl->getPlugin()->call<PiApiKind::piProgramRetain>(
716+
*BuildResult->Ptr.load());
707717
return *BuildResult->Ptr.load();
708718
}
709719

@@ -728,9 +738,20 @@ ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl,
728738

729739
auto key = std::make_tuple(std::move(SpecConsts), PiDevice,
730740
CompileOpts + LinkOpts, KernelName);
731-
auto ret_tuple = Cache.tryToGetKernelFast(key);
732-
if (std::get<0>(ret_tuple))
733-
return ret_tuple;
741+
if (SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
742+
auto ret_tuple = Cache.tryToGetKernelFast(key);
743+
constexpr size_t Kernel = 0; // see KernelFastCacheValT tuple
744+
constexpr size_t Program = 3; // see KernelFastCacheValT tuple
745+
if (std::get<Kernel>(ret_tuple)) {
746+
// Pulling a copy of a kernel and program from the cache,
747+
// so we need to retain those resources.
748+
ContextImpl->getPlugin()->call<PiApiKind::piKernelRetain>(
749+
std::get<Kernel>(ret_tuple));
750+
ContextImpl->getPlugin()->call<PiApiKind::piProgramRetain>(
751+
std::get<Program>(ret_tuple));
752+
return ret_tuple;
753+
}
754+
}
734755

735756
sycl::detail::pi::PiProgram Program =
736757
getBuiltPIProgram(ContextImpl, DeviceImpl, KernelName);
@@ -757,6 +778,14 @@ ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl,
757778
return Cache.getOrInsertKernel(Program, KernelName);
758779
};
759780

781+
if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
782+
// The built kernel cannot be shared between multiple
783+
// threads when caching is disabled, so we can return
784+
// nullptr for the mutex.
785+
auto [Kernel, ArgMask] = BuildF();
786+
return make_tuple(Kernel, nullptr, ArgMask, Program);
787+
}
788+
760789
auto BuildResult = getOrBuild<KernelArgMaskPairT, invalid_object_error>(
761790
Cache, GetCachedBuildF, BuildF);
762791
// getOrBuild is not supposed to return nullptr
@@ -765,6 +794,12 @@ ProgramManager::getOrCreateKernel(const ContextImplPtr &ContextImpl,
765794
auto ret_val = std::make_tuple(KernelArgMaskPair.first,
766795
&(BuildResult->MBuildResultMutex),
767796
KernelArgMaskPair.second, Program);
797+
// If caching is enabled, one copy of the kernel handle will be
798+
// stored in the cache, and one handle is returned to the
799+
// caller. In that case, we need to increase the ref count of the
800+
// kernel.
801+
ContextImpl->getPlugin()->call<PiApiKind::piKernelRetain>(
802+
KernelArgMaskPair.first);
768803
Cache.saveKernel(key, ret_val);
769804
return ret_val;
770805
}
@@ -2297,6 +2332,17 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage,
22972332
return BuiltProgram.release();
22982333
};
22992334

2335+
if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
2336+
auto ResProgram = BuildF();
2337+
DeviceImageImplPtr ExecImpl = std::make_shared<detail::device_image_impl>(
2338+
InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable,
2339+
InputImpl->get_kernel_ids_ptr(), ResProgram,
2340+
InputImpl->get_spec_const_data_ref(),
2341+
InputImpl->get_spec_const_blob_ref());
2342+
2343+
return createSyclObjFromImpl<device_image_plain>(ExecImpl);
2344+
}
2345+
23002346
uint32_t ImgId = Img.getImageID();
23012347
const sycl::detail::pi::PiDevice PiDevice =
23022348
getRawSyclObjImpl(Devs[0])->getHandleRef();
@@ -2389,11 +2435,25 @@ ProgramManager::getOrCreateKernel(const context &Context,
23892435
return Cache.getOrInsertKernel(Program, KernelName);
23902436
};
23912437

2438+
if (!SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
2439+
// The built kernel cannot be shared between multiple
2440+
// threads when caching is disabled, so we can return
2441+
// nullptr for the mutex.
2442+
auto [Kernel, ArgMask] = BuildF();
2443+
return make_tuple(Kernel, nullptr, ArgMask);
2444+
}
2445+
23922446
auto BuildResult =
23932447
getOrBuild<KernelProgramCache::KernelArgMaskPairT, invalid_object_error>(
23942448
Cache, GetCachedBuildF, BuildF);
23952449
// getOrBuild is not supposed to return nullptr
23962450
assert(BuildResult != nullptr && "Invalid build result");
2451+
// If caching is enabled, one copy of the kernel handle will be
2452+
// stored in the cache, and one handle is returned to the
2453+
// caller. In that case, we need to increase the ref count of the
2454+
// kernel.
2455+
Ctx->getPlugin()->call<PiApiKind::piKernelRetain>(
2456+
BuildResult->Ptr.load()->first);
23972457
return std::make_tuple(BuildResult->Ptr.load()->first,
23982458
&(BuildResult->MBuildResultMutex),
23992459
BuildResult->Ptr.load()->second);

sycl/source/detail/scheduler/commands.cpp

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2403,8 +2403,8 @@ pi_int32 enqueueImpCommandBufferKernel(
24032403
auto ContextImpl = sycl::detail::getSyclObjImpl(Ctx);
24042404
const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
24052405
pi_kernel PiKernel = nullptr;
2406-
std::mutex *KernelMutex = nullptr;
24072406
pi_program PiProgram = nullptr;
2407+
std::shared_ptr<kernel_impl> SyclKernelImpl = nullptr;
24082408
std::shared_ptr<device_image_impl> DeviceImageImpl = nullptr;
24092409

24102410
auto Kernel = CommandGroup.MSyclKernel;
@@ -2417,7 +2417,6 @@ pi_int32 enqueueImpCommandBufferKernel(
24172417
// and can therefore not be looked up, but since they are self-contained
24182418
// they can simply be launched directly.
24192419
if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
2420-
std::shared_ptr<kernel_impl> SyclKernelImpl;
24212420
auto KernelName = CommandGroup.MKernelName;
24222421
kernel_id KernelID =
24232422
detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
@@ -2427,14 +2426,14 @@ pi_int32 enqueueImpCommandBufferKernel(
24272426
PiKernel = SyclKernelImpl->getHandleRef();
24282427
DeviceImageImpl = SyclKernelImpl->getDeviceImage();
24292428
PiProgram = DeviceImageImpl->get_program_ref();
2430-
std::tie(PiKernel, KernelMutex, EliminatedArgMask) =
2431-
detail::ProgramManager::getInstance().getOrCreateKernel(
2432-
KernelBundleImplPtr->get_context(), KernelName,
2433-
/*PropList=*/{}, PiProgram);
2429+
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
24342430
} else if (Kernel != nullptr) {
24352431
PiKernel = Kernel->getHandleRef();
2432+
auto SyclProg = Kernel->getProgramImpl();
2433+
PiProgram = SyclProg->getHandleRef();
2434+
EliminatedArgMask = Kernel->getKernelArgMask();
24362435
} else {
2437-
std::tie(PiKernel, KernelMutex, EliminatedArgMask, PiProgram) =
2436+
std::tie(PiKernel, std::ignore, EliminatedArgMask, PiProgram) =
24382437
sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
24392438
ContextImpl, DeviceImpl, CommandGroup.MKernelName);
24402439
}
@@ -2483,6 +2482,11 @@ pi_int32 enqueueImpCommandBufferKernel(
24832482
&NDRDesc.GlobalSize[0], LocalSize, SyncPoints.size(),
24842483
SyncPoints.size() ? SyncPoints.data() : nullptr, OutSyncPoint);
24852484

2485+
if (!SyclKernelImpl && !Kernel) {
2486+
Plugin->call<PiApiKind::piKernelRelease>(PiKernel);
2487+
Plugin->call<PiApiKind::piProgramRelease>(PiProgram);
2488+
}
2489+
24862490
if (Res != pi_result::PI_SUCCESS) {
24872491
throw sycl::exception(errc::invalid,
24882492
"Failed to add kernel to PI command-buffer");
@@ -2530,10 +2534,19 @@ pi_int32 enqueueImpKernel(
25302534

25312535
Program = DeviceImageImpl->get_program_ref();
25322536

2533-
std::tie(Kernel, KernelMutex, EliminatedArgMask) =
2534-
detail::ProgramManager::getInstance().getOrCreateKernel(
2535-
KernelBundleImplPtr->get_context(), KernelName,
2536-
/*PropList=*/{}, Program);
2537+
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2538+
// When caching is enabled, kernel objects can be shared,
2539+
// so we need to retrieve the mutex associated to it via
2540+
// getOrCreateKernel
2541+
if (SYCLConfig<SYCL_CACHE_IN_MEM>::get()) {
2542+
auto [CachedKernel, CachedKernelMutex, CachedEliminatedArgMask] =
2543+
detail::ProgramManager::getInstance().getOrCreateKernel(
2544+
KernelBundleImplPtr->get_context(), KernelName,
2545+
/*PropList=*/{}, Program);
2546+
assert(CachedKernel == Kernel);
2547+
assert(CachedEliminatedArgMask == EliminatedArgMask);
2548+
KernelMutex = CachedKernelMutex;
2549+
}
25372550
} else if (nullptr != MSyclKernel) {
25382551
assert(MSyclKernel->get_info<info::kernel::context>() ==
25392552
Queue->get_context());
@@ -2574,8 +2587,11 @@ pi_int32 enqueueImpKernel(
25742587

25752588
pi_result Error = PI_SUCCESS;
25762589
{
2577-
assert(KernelMutex);
2578-
std::lock_guard<std::mutex> Lock(*KernelMutex);
2590+
// When KernelMutex is null, this means that in-memory caching is
2591+
// disabled, which means that kernel object is not shared, so no locking
2592+
// is necessary.
2593+
using LockT = std::unique_lock<std::mutex>;
2594+
auto Lock = KernelMutex ? LockT(*KernelMutex) : LockT();
25792595

25802596
// Set SLM/Cache configuration for the kernel if non-default value is
25812597
// provided.
@@ -2590,6 +2606,12 @@ pi_int32 enqueueImpKernel(
25902606
Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
25912607
NDRDesc, EventsWaitList, OutEventImpl,
25922608
EliminatedArgMask, getMemAllocationFunc);
2609+
2610+
const PluginPtr &Plugin = Queue->getPlugin();
2611+
if (!SyclKernelImpl && !MSyclKernel) {
2612+
Plugin->call<PiApiKind::piKernelRelease>(Kernel);
2613+
Plugin->call<PiApiKind::piProgramRelease>(Program);
2614+
}
25932615
}
25942616
if (PI_SUCCESS != Error) {
25952617
// If we have got non-success error code, let's analyze it to emit nice
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// This test ensures created program/kernels are not retained
2+
// if and only if caching is disabled.
3+
4+
// RUN: %{build} -o %t.out
5+
// RUN: env SYCL_PI_TRACE=-1 SYCL_CACHE_IN_MEM=0 %{run} %t.out \
6+
// RUN: | FileCheck %s
7+
// RUN: env SYCL_PI_TRACE=-1 %{run} %t.out \
8+
// RUN: | FileCheck %s --check-prefixes=CHECK-CACHE
9+
#include <sycl/sycl.hpp>
10+
11+
using namespace sycl;
12+
13+
constexpr specialization_id<int> spec_id;
14+
15+
int main() {
16+
queue q;
17+
// CHECK: piProgramCreate
18+
// CHECK-NOT: piProgramRetain
19+
// CHECK: piKernelCreate
20+
// CHECK-NOT: piKernelRetain
21+
// CHECK: piEnqueueKernelLaunch
22+
// CHECK: piKernelRelease
23+
// CHECK: piProgramRelease
24+
// CHECK: piEventsWait
25+
26+
// CHECK-CACHE: piProgramCreate
27+
// CHECK-CACHE: piProgramRetain
28+
// CHECK-CACHE: piKernelCreate
29+
// CHECK-CACHE: piKernelRetain
30+
// CHECK-CACHE: piEnqueueKernelLaunch
31+
// CHECK-CACHE: piKernelRelease
32+
// CHECK-CACHE: piProgramRelease
33+
// CHECK-CACHE: piEventsWait
34+
q.single_task([] {}).wait();
35+
36+
// CHECK: piProgramCreate
37+
// CHECK-NOT: piProgramRetain
38+
// CHECK: piKernelCreate
39+
// CHECK-NOT: piKernelRetain
40+
// CHECK: piEnqueueKernelLaunch
41+
// CHECK: piKernelRelease
42+
// CHECK: piProgramRelease
43+
// CHECK: piEventsWait
44+
45+
// CHECK-CACHE: piProgramCreate
46+
// CHECK-CACHE: piProgramRetain
47+
// CHECK-CACHE: piKernelCreate
48+
// CHECK-CACHE: piKernelRetain
49+
// CHECK-CACHE: piEnqueueKernelLaunch
50+
// CHECK-CACHE: piKernelRelease
51+
// CHECK-CACHE: piProgramRelease
52+
// CHECK-CACHE: piEventsWait
53+
54+
// CHECK: piProgramCreate
55+
// CHECK-NOT: piProgramRetain
56+
// CHECK: piKernelCreate
57+
// CHECK-NOT: piKernelRetain
58+
// CHECK: piEnqueueKernelLaunch
59+
// CHECK: piKernelRelease
60+
// CHECK: piProgramRelease
61+
// CHECK: piEventsWait
62+
63+
// CHECK-CACHE: piProgramCreate
64+
// CHECK-CACHE: piProgramRetain
65+
// CHECK-CACHE: piKernelCreate
66+
// CHECK-CACHE: piKernelRetain
67+
// CHECK-CACHE: piEnqueueKernelLaunch
68+
// CHECK-CACHE: piKernelRelease
69+
// CHECK-CACHE: piProgramRelease
70+
// CHECK-CACHE: piEventsWait
71+
auto *p = malloc_shared<int>(1, q);
72+
for (int i = 0; i < 2; ++i)
73+
q.submit([&](handler &cgh) {
74+
cgh.set_specialization_constant<spec_id>(i);
75+
cgh.parallel_for(1, [=](auto, kernel_handler kh) {
76+
*p = kh.get_specialization_constant<spec_id>();
77+
});
78+
}).wait();
79+
80+
free(p, q);
81+
}
82+
83+
// (Program cache releases)
84+
// CHECK-CACHE: piKernelRelease
85+
// CHECK-CACHE: piProgramRelease
86+
// CHECK-CACHE: piKernelRelease
87+
// CHECK-CACHE: piProgramRelease

sycl/test-e2e/XPTI/basic_event_collection_linux.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
// CHECK: PI Call Begin : piextKernelSetArgPointer
4545
// CHECK-NEXT: PI Call Begin : piKernelGetGroupInfo
4646
// CHECK-NEXT: PI Call Begin : piEnqueueKernelLaunch
47+
// CHECK-NEXT: PI Call Begin : piKernelRelease
48+
// CHECK-NEXT: PI Call Begin : piProgramRelease
4749
// CHECK-NEXT: Signal
4850
// CHECK-DAG: sym_line_no : {{.*}}
4951
// CHECK-DAG: sym_source_file_name : {{.*}}

sycl/unittests/SYCL2020/GetNativeOpenCL.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ TEST(GetNative, GetNativeHandle) {
123123
get_native<backend::opencl>(Buffer);
124124

125125
// Depending on global caches state, piDeviceRetain is called either once or
126-
// twice, so there'll be 5 or 6 calls.
127-
ASSERT_EQ(TestCounter, 5 + DeviceRetainCounter - 1)
126+
// twice, so there'll be 6 or 7 calls.
127+
ASSERT_EQ(TestCounter, 6 + DeviceRetainCounter - 1)
128128
<< "Not all the retain methods were called";
129129
}

0 commit comments

Comments
 (0)