Skip to content

Commit 12584c2

Browse files
authored
[SYCL] Separate host instantiation from HostKernel (#18534)
The HostKernel class is currently used for two different things: 1) To extend the lifetime of the kernel function object/lambda; and 2) To force the kernel to be instantiated on the host. Requiring a single solution to solve both problems prevents optimization. Providing separate solutions will enable us to use the fastest solution for each problem. --- A few notes that might help with review: - I would like us to eventually reach a point where we can remove the `HostKernel` class entirely and avoid extending the lifetime of the kernel function object. We can't do that if it's being relied upon for debugging. - I had to change the `runKernelWithoutArg` signature to remove SFINAE because MSVC complained about creating a pointer to an overloaded function. I would have expected that the compiler could figure out the function pointer _after_ SFINAE, but it failed. - Removing the old mechanism of instantiating on the host is an ABI break because it removes a function from the vtable. - With the new approach, we can avoid instantiating (and compiling!) the kernel on the host under `NDEBUG`. I've verified that `-DNDEBUG -fpreview-breaking-changes` removes the kernel symbols from the binary. `-DNDEBUG` alone isn't currently sufficient, because the old instantiation mechanism is still triggered. I wasn't sure how much of the old implementation to leave in tact. --------- Signed-off-by: John Pennycook <[email protected]>
1 parent ebc8528 commit 12584c2

File tree

5 files changed

+73
-32
lines changed

5 files changed

+73
-32
lines changed

sycl/include/sycl/detail/cg_types.hpp

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -124,32 +124,25 @@ struct KernelLambdaHasKernelHandlerArgT {
124124

125125
// Helpers for running kernel lambda on the host device
126126

127-
template <typename KernelType>
128-
typename std::enable_if_t<KernelLambdaHasKernelHandlerArgT<KernelType>::value>
129-
runKernelWithoutArg(KernelType KernelName) {
130-
kernel_handler KH;
131-
KernelName(KH);
132-
}
133-
134-
template <typename KernelType>
135-
typename std::enable_if_t<!KernelLambdaHasKernelHandlerArgT<KernelType>::value>
136-
runKernelWithoutArg(KernelType KernelName) {
137-
KernelName();
138-
}
139-
140-
template <typename ArgType, typename KernelType>
141-
typename std::enable_if_t<
142-
KernelLambdaHasKernelHandlerArgT<KernelType, ArgType>::value>
143-
runKernelWithArg(KernelType KernelName, ArgType Arg) {
144-
kernel_handler KH;
145-
KernelName(Arg, KH);
127+
template <typename KernelType, bool HasKernelHandlerArg>
128+
void runKernelWithoutArg(KernelType KernelName,
129+
const std::bool_constant<HasKernelHandlerArg> &) {
130+
if constexpr (HasKernelHandlerArg) {
131+
kernel_handler KH;
132+
KernelName(KH);
133+
} else {
134+
KernelName();
135+
}
146136
}
147-
148-
template <typename ArgType, typename KernelType>
149-
typename std::enable_if_t<
150-
!KernelLambdaHasKernelHandlerArgT<KernelType, ArgType>::value>
151-
runKernelWithArg(KernelType KernelName, ArgType Arg) {
152-
KernelName(Arg);
137+
template <typename ArgType, typename KernelType, bool HasKernelHandlerArg>
138+
void runKernelWithArg(KernelType KernelName, ArgType Arg,
139+
const std::bool_constant<HasKernelHandlerArg> &) {
140+
if constexpr (HasKernelHandlerArg) {
141+
kernel_handler KH;
142+
KernelName(Arg, KH);
143+
} else {
144+
KernelName(Arg);
145+
}
153146
}
154147

155148
// The pure virtual class aimed to store lambda/functors of any type.
@@ -159,8 +152,10 @@ class HostKernelBase {
159152
// Used to extract captured variables.
160153
virtual char *getPtr() = 0;
161154
virtual ~HostKernelBase() = default;
155+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
162156
// NOTE: InstatiateKernelOnHost() should not be called.
163157
virtual void InstantiateKernelOnHost() = 0;
158+
#endif
164159
};
165160

166161
// Class which stores specific lambda object.
@@ -176,17 +171,21 @@ class HostKernel : public HostKernelBase {
176171

177172
~HostKernel() = default;
178173

174+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
179175
// This function is needed for host-side compilation to keep kernels
180176
// instantitated. This is important for debuggers to be able to associate
181177
// kernel code instructions with source code lines.
182178
// NOTE: InstatiateKernelOnHost() should not be called.
183179
void InstantiateKernelOnHost() override {
184180
using IDBuilder = sycl::detail::Builder;
181+
constexpr bool HasKernelHandlerArg =
182+
KernelLambdaHasKernelHandlerArgT<KernelType, KernelArgType>::value;
185183
if constexpr (std::is_same_v<KernelArgType, void>) {
186-
runKernelWithoutArg(MKernel);
184+
runKernelWithoutArg(MKernel, std::bool_constant<HasKernelHandlerArg>());
187185
} else if constexpr (std::is_same_v<KernelArgType, sycl::id<Dims>>) {
188186
sycl::id ID = InitializedVal<Dims, id>::template get<0>();
189-
runKernelWithArg<const KernelArgType &>(MKernel, ID);
187+
runKernelWithArg<const KernelArgType &>(
188+
MKernel, ID, std::bool_constant<HasKernelHandlerArg>());
190189
} else if constexpr (std::is_same_v<KernelArgType, item<Dims, true>> ||
191190
std::is_same_v<KernelArgType, item<Dims, false>>) {
192191
constexpr bool HasOffset =
@@ -195,13 +194,15 @@ class HostKernel : public HostKernelBase {
195194
KernelArgType Item = IDBuilder::createItem<Dims, HasOffset>(
196195
InitializedVal<Dims, range>::template get<1>(),
197196
InitializedVal<Dims, id>::template get<0>());
198-
runKernelWithArg<KernelArgType>(MKernel, Item);
197+
runKernelWithArg<KernelArgType>(
198+
MKernel, Item, std::bool_constant<HasKernelHandlerArg>());
199199
} else {
200200
KernelArgType Item = IDBuilder::createItem<Dims, HasOffset>(
201201
InitializedVal<Dims, range>::template get<1>(),
202202
InitializedVal<Dims, id>::template get<0>(),
203203
InitializedVal<Dims, id>::template get<0>());
204-
runKernelWithArg<KernelArgType>(MKernel, Item);
204+
runKernelWithArg<KernelArgType>(
205+
MKernel, Item, std::bool_constant<HasKernelHandlerArg>());
205206
}
206207
} else if constexpr (std::is_same_v<KernelArgType, nd_item<Dims>>) {
207208
sycl::range<Dims> Range = InitializedVal<Dims, range>::template get<1>();
@@ -214,22 +215,44 @@ class HostKernel : public HostKernelBase {
214215
IDBuilder::createItem<Dims, false>(Range, ID);
215216
KernelArgType NDItem =
216217
IDBuilder::createNDItem<Dims>(GlobalItem, LocalItem, Group);
217-
runKernelWithArg<const KernelArgType>(MKernel, NDItem);
218+
runKernelWithArg<const KernelArgType>(
219+
MKernel, NDItem, std::bool_constant<HasKernelHandlerArg>());
218220
} else if constexpr (std::is_same_v<KernelArgType, sycl::group<Dims>>) {
219221
sycl::range<Dims> Range = InitializedVal<Dims, range>::template get<1>();
220222
sycl::id<Dims> ID = InitializedVal<Dims, id>::template get<0>();
221223
KernelArgType Group =
222224
IDBuilder::createGroup<Dims>(Range, Range, Range, ID);
223-
runKernelWithArg<KernelArgType>(MKernel, Group);
225+
runKernelWithArg<KernelArgType>(
226+
MKernel, Group, std::bool_constant<HasKernelHandlerArg>());
224227
} else {
225228
// Assume that anything else can be default-constructed. If not, this
226229
// should fail to compile and the implementor should implement a generic
227230
// case for the new argument type.
228-
runKernelWithArg<KernelArgType>(MKernel, KernelArgType{});
231+
runKernelWithArg<KernelArgType>(
232+
MKernel, KernelArgType{}, std::bool_constant<HasKernelHandlerArg>());
229233
}
230234
}
235+
#endif
231236
};
232237

238+
// This function is needed for host-side compilation to keep kernels
239+
// instantitated. This is important for debuggers to be able to associate
240+
// kernel code instructions with source code lines.
241+
template <class KernelType, class KernelArgType, int Dims>
242+
constexpr void *GetInstantiateKernelOnHostPtr() {
243+
if constexpr (std::is_same_v<KernelArgType, void>) {
244+
constexpr bool HasKernelHandlerArg =
245+
KernelLambdaHasKernelHandlerArgT<KernelType>::value;
246+
return reinterpret_cast<void *>(
247+
&runKernelWithoutArg<KernelType, HasKernelHandlerArg>);
248+
} else {
249+
constexpr bool HasKernelHandlerArg =
250+
KernelLambdaHasKernelHandlerArgT<KernelType, KernelArgType>::value;
251+
return reinterpret_cast<void *>(
252+
&runKernelWithArg<KernelArgType, KernelType, HasKernelHandlerArg>);
253+
}
254+
}
255+
233256
} // namespace detail
234257
} // namespace _V1
235258
} // namespace sycl

sycl/include/sycl/handler.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,14 @@ class __SYCL_EXPORT handler {
821821
MHostKernel.reset(new detail::HostKernel<KernelType, LambdaArgType, Dims>(
822822
std::forward<KernelTypeUniversalRef>(KernelFunc)));
823823

824+
// Instantiating the kernel on the host improves debugging.
825+
// Passing this pointer to another translation unit prevents optimization.
826+
#ifndef NDEBUG
827+
instantiateKernelOnHost(
828+
detail::GetInstantiateKernelOnHostPtr<KernelType, LambdaArgType,
829+
Dims>());
830+
#endif
831+
824832
constexpr bool KernelHasName =
825833
detail::getKernelName<KernelName>() != nullptr &&
826834
detail::getKernelName<KernelName>()[0] != '\0';
@@ -3834,6 +3842,8 @@ class __SYCL_EXPORT handler {
38343842
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),
38353843
bool KernelIsESIMD, bool KernelHasSpecialCaptures);
38363844

3845+
void instantiateKernelOnHost(void *InstantiateKernelOnHostPtr);
3846+
38373847
friend class detail::HandlerAccess;
38383848

38393849
#ifdef __INTEL_PREVIEW_BREAKING_CHANGES

sycl/source/handler.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2386,6 +2386,12 @@ void handler::setKernelInfo(
23862386
impl->MKernelHasSpecialCaptures = KernelHasSpecialCaptures;
23872387
}
23882388

2389+
void handler::instantiateKernelOnHost(void *InstantiateKernelOnHostPtr) {
2390+
// Passing the pointer to the runtime is enough to prevent optimization.
2391+
// We don't need to use the pointer for anything.
2392+
(void)InstantiateKernelOnHostPtr;
2393+
}
2394+
23892395
void handler::saveCodeLoc(detail::code_location CodeLoc, bool IsTopCodeLoc) {
23902396
MCodeLoc = CodeLoc;
23912397
impl->MIsTopCodeLoc = IsTopCodeLoc;

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3575,6 +3575,7 @@ _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE
35753575
_ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE
35763576
_ZN4sycl3_V17handler22setKernelClusterLaunchENS0_5rangeILi3EEEi
35773577
_ZN4sycl3_V17handler22setKernelIsCooperativeEb
3578+
_ZN4sycl3_V17handler23instantiateKernelOnHostEPv
35783579
_ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_
35793580
_ZN4sycl3_V17handler24ext_intel_read_host_pipeENS0_6detail11string_viewEPvmb
35803581
_ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm

sycl/test/abi/sycl_symbols_windows.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4230,6 +4230,7 @@
42304230
?has_kernel_bundle_impl@detail@_V1@sycl@@YA_NAEBVcontext@23@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4bundle_state@23@@Z
42314231
?has_specialization_constant_impl@kernel_bundle_plain@detail@_V1@sycl@@IEBA_NPEBD@Z
42324232
?increase_threshold_to@memory_pool@experimental@oneapi@ext@_V1@sycl@@QEAAX_K@Z
4233+
?instantiateKernelOnHost@handler@_V1@sycl@@AEAAXPEAX@Z
42334234
?internalProfilingTagImpl@handler@_V1@sycl@@AEAAXXZ
42344235
?isBackendSupportedFillSize@handler@_V1@sycl@@CA_N_K@Z
42354236
?isConstOrGlobal@handler@_V1@sycl@@CA_NW4target@access@23@@Z

0 commit comments

Comments
 (0)