Commit f59a17b

[SYCL] Pass handler & instead of queue across ABI for reduction utils
Queue might be `nullptr` in the graph case, but the information these utils query is device-specific. By passing the entire `handler &` and having access to graph information we'd be able to return more precise results. Another positive side effect is that we eliminate the explicit `std::shared_ptr<queue_impl>`, which is a small step forward in the ongoing refactoring effort to prefer passing `*_impl` by raw ptr/ref, with an explicit `shared_from_this` whenever lifetimes need to be extended.
1 parent 0dff0ff commit f59a17b
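
For context, a minimal sketch of the `*_impl`-by-reference convention the message describes; the names (`Impl`, `useImpl`, `keepAlive`) are illustrative stand-ins, not actual SYCL runtime classes:

#include <memory>

struct Impl : std::enable_shared_from_this<Impl> {
  int query() const { return 42; }
};

// Preferred: take the implementation by reference; no refcount traffic,
// and the signature documents that the callee does not own the object.
int useImpl(Impl &I) { return I.query(); }

// When a lifetime genuinely needs extending (e.g. stashing the object in
// an async task), the callee makes that explicit:
std::shared_ptr<Impl> keepAlive(Impl &I) { return I.shared_from_this(); }

int main() {
  auto I = std::make_shared<Impl>();
  int V = useImpl(*I);        // borrow only
  auto Owned = keepAlive(*I); // explicit ownership extension
  return (V == 42 && Owned) ? 0 : 1;
}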

File tree

4 files changed: +78 -56 lines changed


sycl/include/sycl/reduction.hpp

Lines changed: 13 additions & 23 deletions
@@ -140,17 +140,10 @@ template <typename... Ts> ReduTupleT<Ts...> makeReduTupleT(Ts... Elements) {
   return sycl::detail::make_tuple(Elements...);
 }
 
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetMaxWGSize(const std::shared_ptr<queue_impl> &Queue,
-                                      size_t LocalMemBytesPerWorkItem);
-__SYCL_EXPORT size_t reduGetPreferredWGSize(
-    const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem);
-#else
-__SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
                                       size_t LocalMemBytesPerWorkItem);
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
                                             size_t LocalMemBytesPerWorkItem);
-#endif
 __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                        size_t &NWorkGroups);
 
@@ -1708,8 +1701,7 @@ struct NDRangeReduction<
     reduction::strategy::group_reduce_and_multiple_kernels> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
+  static void run(handler &CGH, const std::shared_ptr<detail::queue_impl> &,
                   nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     static_assert(Reduction::has_identity,
@@ -1729,7 +1721,7 @@ struct NDRangeReduction<
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -1847,8 +1839,7 @@ struct NDRangeReduction<
 template <> struct NDRangeReduction<reduction::strategy::basic> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename KernelType, typename Reduction>
-  static void run(handler &CGH,
-                  const std::shared_ptr<detail::queue_impl> &Queue,
+  static void run(handler &CGH, const std::shared_ptr<detail::queue_impl> &,
                   nd_range<Dims> NDRange, PropertiesT &Properties,
                   Reduction &Redu, KernelType &KernelFunc) {
     using element_type = typename Reduction::reducer_element_type;
@@ -1858,7 +1849,7 @@ template <> struct NDRangeReduction<reduction::strategy::basic> {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel
     // compiled for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, OneElemSize);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, OneElemSize);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -2623,9 +2614,9 @@ tuple_select_elements(TupleT Tuple, std::index_sequence<Is...>) {
 template <> struct NDRangeReduction<reduction::strategy::multi> {
   template <typename KernelName, int Dims, typename PropertiesT,
             typename... RestT>
-  static void
-  run(handler &CGH, const std::shared_ptr<detail::queue_impl> &Queue,
-      nd_range<Dims> NDRange, PropertiesT &Properties, RestT... Rest) {
+  static void run(handler &CGH, const std::shared_ptr<detail::queue_impl> &,
+                  nd_range<Dims> NDRange, PropertiesT &Properties,
+                  RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);
     auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
@@ -2636,7 +2627,7 @@ template <> struct NDRangeReduction<reduction::strategy::multi> {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize = reduGetMaxWGSize(Queue, LocalMemPerWorkItem);
+    size_t MaxWGSize = reduGetMaxWGSize(CGH, LocalMemPerWorkItem);
     if (NDRange.get_local_range().size() > MaxWGSize)
       throw sycl::exception(make_error_code(errc::nd_range),
                             "The implementation handling parallel_for with"
@@ -2731,8 +2722,7 @@ void reduction_parallel_for(handler &CGH, nd_range<Dims> NDRange,
                             Properties, Rest...);
 }
 
-__SYCL_EXPORT uint32_t
-reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh);
 
 template <typename KernelName, reduction::strategy Strategy, int Dims,
           typename PropertiesT, typename... RestT>
@@ -2763,13 +2753,13 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
       __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
 #else
-      reduGetMaxNumConcurrentWorkGroups(CGH.MQueue);
+      reduGetMaxNumConcurrentWorkGroups(CGH);
 #endif
 
   // TODO: currently the preferred work group size is determined for the given
   // queue/device, while it is safer to use queries to the kernel pre-compiled
   // for the device.
-  size_t PrefWGSize = reduGetPreferredWGSize(CGH.MQueue, OneElemSize);
+  size_t PrefWGSize = reduGetPreferredWGSize(CGH, OneElemSize);
 
   size_t NWorkItems = Range.size();
   size_t WGSize = std::min(NWorkItems, PrefWGSize);
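
The declarations above back the public `sycl::reduction` API. For context, a standard SYCL 2020 reduction that exercises this path (assuming a device with USM shared-allocation support):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;
  Q.submit([&](sycl::handler &CGH) {
     // While processing this CGF, the runtime picks the kernel's work-group
     // geometry via reduGetPreferredWGSize()/reduGetMaxWGSize() -- now
     // queried through the handler itself, per this commit.
     CGH.parallel_for(sycl::range<1>(1024),
                      sycl::reduction(Sum, sycl::plus<int>()),
                      [=](sycl::item<1> It, auto &R) {
                        R += static_cast<int>(It.get_linear_id());
                      });
   }).wait();
  bool Ok = (*Sum == 523776); // 0 + 1 + ... + 1023
  sycl::free(Sum, Q);
  return Ok ? 0 : 1;
}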

sycl/source/detail/reduction.cpp

Lines changed: 59 additions & 33 deletions
@@ -49,8 +49,24 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
   return WGSize;
 }
 
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+// Inline this helper:
+#endif
+uint32_t reduGetMaxNumConcurrentWorkGroups(device_impl &Dev) {
+  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
+  // TODO: The heuristics here require additional tuning for various devices
+  // and vendors. Also, it would be better to check vendor/generation/etc.
+  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
+    NumThreads *= 8;
+  return NumThreads;
+}
 // Returns the estimated number of physical threads on the device associated
 // with the given queue.
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(handler &cgh) {
+  return reduGetMaxNumConcurrentWorkGroups(getSyclObjImpl(cgh)->get_device());
+}
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
     std::shared_ptr<sycl::detail::queue_impl> Queue) {
   // TODO: Graphs extension explicit API uses a handler with no queue attached,
@@ -63,25 +79,14 @@ __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
   if (Queue == nullptr) {
     return 8;
   }
-  device Dev = Queue->get_device();
-  uint32_t NumThreads = Dev.get_info<sycl::info::device::max_compute_units>();
-  // TODO: The heuristics here require additional tuning for various devices
-  // and vendors. Also, it would be better to check vendor/generation/etc.
-  if (Dev.is_gpu() && Dev.get_info<sycl::info::device::host_unified_memory>())
-    NumThreads *= 8;
-  return NumThreads;
+  return reduGetMaxNumConcurrentWorkGroups(Queue->getDeviceImpl());
 }
+#endif
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t
-reduGetMaxWGSize(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
-                 size_t LocalMemBytesPerWorkItem) {
-#else
-__SYCL_EXPORT size_t
-reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
-                 size_t LocalMemBytesPerWorkItem) {
+// Inline this helper:
 #endif
-  device Dev = Queue->get_device();
+size_t reduGetMaxWGSize(device_impl &Dev, size_t LocalMemBytesPerWorkItem) {
   size_t MaxWGSize = Dev.get_info<sycl::info::device::max_work_group_size>();
 
   size_t WGSizePerMem = MaxWGSize * 2;
@@ -118,26 +123,24 @@ reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
 
   return WGSize;
 }
+__SYCL_EXPORT size_t reduGetMaxWGSize(handler &cgh,
+                                      size_t LocalMemBytesPerWorkItem) {
+  return reduGetMaxWGSize(getSyclObjImpl(cgh)->get_device(),
+                          LocalMemBytesPerWorkItem);
+}
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+__SYCL_EXPORT
+size_t reduGetMaxWGSize(std::shared_ptr<sycl::detail::queue_impl> Queue,
+                        size_t LocalMemBytesPerWorkItem) {
+  return reduGetMaxWGSize(Queue->getDeviceImpl(), LocalMemBytesPerWorkItem);
+}
+#endif
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-__SYCL_EXPORT size_t reduGetPreferredWGSize(
-    const std::shared_ptr<queue_impl> &Queue, size_t LocalMemBytesPerWorkItem) {
-#else
-__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
-                                            size_t LocalMemBytesPerWorkItem) {
+// Inline this helper:
 #endif
-  // TODO: Graphs extension explicit API uses a handler with a null queue to
-  // process CGFs, in future we should have access to the device so we can
-  // correctly calculate this.
-  //
-  // The 32 value was chosen as the hardcoded value as it is the returned
-  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
-  // Intel HD Graphics devices used as a L0 backend during development.
-  if (Queue == nullptr) {
-    return 32;
-  }
-  device Dev = Queue->get_device();
-
+size_t reduGetPreferredWGSize(device_impl &Dev,
+                              size_t LocalMemBytesPerWorkItem) {
   // The maximum WGSize returned by CPU devices is very large and does not
   // help the reduction implementation: since all work associated with a
   // work-group is typically assigned to one CPU thread, selecting a large
@@ -174,8 +177,31 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
   }
 
   // Use the maximum work-group size otherwise.
-  return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
+  return reduGetMaxWGSize(Dev, LocalMemBytesPerWorkItem);
+}
+__SYCL_EXPORT size_t reduGetPreferredWGSize(handler &cgh,
+                                            size_t LocalMemBytesPerWorkItem) {
+  return reduGetPreferredWGSize(getSyclObjImpl(cgh)->get_device(),
+                                LocalMemBytesPerWorkItem);
 }
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+__SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
+                                            size_t LocalMemBytesPerWorkItem) {
+  // TODO: Graphs extension explicit API uses a handler with a null queue to
+  // process CGFs, in future we should have access to the device so we can
+  // correctly calculate this.
+  //
+  // The 32 value was chosen as the hardcoded value as it is the returned
+  // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on
+  // Intel HD Graphics devices used as a L0 backend during development.
+  if (Queue == nullptr) {
+    return 32;
+  }
+  device_impl &Dev = Queue->getDeviceImpl();
+
+  return reduGetPreferredWGSize(Dev, LocalMemBytesPerWorkItem);
+}
+#endif
 
 __SYCL_EXPORT void
 addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
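
The layout above follows a common ABI-transition pattern: the `device_impl`-based helper holds the logic, the new exported overload is a thin wrapper over it, and the old exported symbol is kept only so already-linked binaries keep resolving until the next ABI-breaking window. A generic sketch of the same shape (`LibDevice`, `entryPointImpl`, `LIB_PREVIEW_BREAKING_CHANGES` are illustrative stand-ins, not real SYCL identifiers):

#include <memory>

struct LibDevice {
  int maxUnits() const { return 8; }
};

// Shared implementation; becomes the only copy once the old symbol goes.
static int entryPointImpl(LibDevice &Dev) { return Dev.maxUnits(); }

// New ABI surface: thin wrapper over the helper.
int newEntryPoint(LibDevice &Dev) { return entryPointImpl(Dev); }

#ifndef LIB_PREVIEW_BREAKING_CHANGES
// Old ABI surface, preserved for existing binaries; compiled out when the
// preview-breaking-changes build finally drops it.
int oldEntryPoint(std::shared_ptr<LibDevice> Dev) {
  return entryPointImpl(*Dev);
}
#endif

int main() {
  LibDevice Dev;
  return newEntryPoint(Dev) == 8 ? 0 : 1;
}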

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 3 additions & 0 deletions
@@ -3290,6 +3290,7 @@ _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6a
 _ZN4sycl3_V16detail16AccessorBaseHostC1ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE
 _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviibmbRKNS0_13property_listE
 _ZN4sycl3_V16detail16AccessorBaseHostC2ENS0_2idILi3EEENS0_5rangeILi3EEES6_NS0_6access4modeEPviimbRKNS0_13property_listE
+_ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm
 _ZN4sycl3_V16detail16reduGetMaxWGSizeESt10shared_ptrINS1_10queue_implEEm
 _ZN4sycl3_V16detail17HostProfilingInfo3endEv
 _ZN4sycl3_V16detail17HostProfilingInfo5startEv
@@ -3324,6 +3325,7 @@ _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6devi
 _ZN4sycl3_V16detail22get_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE
 _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EENS0_12bundle_stateE
 _ZN4sycl3_V16detail22has_kernel_bundle_implERKNS0_7contextERKSt6vectorINS0_6deviceESaIS6_EERKS5_INS0_9kernel_idESaISB_EENS0_12bundle_stateE
+_ZN4sycl3_V16detail22reduGetPreferredWGSizeERNS0_7handlerEm
 _ZN4sycl3_V16detail22reduGetPreferredWGSizeERSt10shared_ptrINS1_10queue_implEEm
 _ZN4sycl3_V16detail22removeDuplicateDevicesERKSt6vectorINS0_6deviceESaIS3_EE
 _ZN4sycl3_V16detail23constructorNotificationEPvS2_NS0_6access6targetENS3_4modeERKNS1_13code_locationE
@@ -3343,6 +3345,7 @@ _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHost6getPtrEv
 _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC1ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE
 _ZN4sycl3_V16detail30UnsampledImageAccessorBaseHostC2ENS0_5rangeILi3EEENS0_6access4modeEPviiNS0_2idILi3EEENS0_18image_channel_typeENS0_19image_channel_orderERKNS0_13property_listE
 _ZN4sycl3_V16detail33enable_ext_oneapi_default_contextEb
+_ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsERNS0_7handlerE
 _ZN4sycl3_V16detail33reduGetMaxNumConcurrentWorkGroupsESt10shared_ptrINS1_10queue_implEE
 _ZN4sycl3_V16detail34addHostSampledImageAccessorAndWaitEPNS1_28SampledImageAccessorImplHostE
 _ZN4sycl3_V16detail35sampledImageConstructorNotificationEPvS2_RKSt8optionalINS0_12image_targetEEPKvjRKNS1_13code_locationE
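
The new Itanium-mangled entries can be sanity-checked with the toolchain's demangler; for example, a small host-only program using `abi::__cxa_demangle` from `<cxxabi.h>` (the expected output in the comment is my reading of the mangling, not part of the dump):

#include <cstdio>
#include <cstdlib>
#include <cxxabi.h>

int main() {
  const char *Sym = "_ZN4sycl3_V16detail16reduGetMaxWGSizeERNS0_7handlerEm";
  int Status = 0;
  // Demangle the first added Linux symbol; Status == 0 means success.
  char *Name = abi::__cxa_demangle(Sym, nullptr, nullptr, &Status);
  if (Status == 0)
    std::puts(Name); // expected:
                     // sycl::_V1::detail::reduGetMaxWGSize(sycl::_V1::handler&, unsigned long)
  std::free(Name);
  return Status;
}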

sycl/test/abi/sycl_symbols_windows.dump

Lines changed: 3 additions & 0 deletions
@@ -4342,9 +4342,12 @@
 ?processArg@handler@_V1@sycl@@AEAAXPEAXAEBW4kernel_param_kind_t@detail@23@H_KAEA_K_N4@Z
 ?query@tls_code_loc_t@detail@_V1@sycl@@QEAAAEBUcode_location@234@XZ
 ?reduComputeWGSize@detail@_V1@sycl@@YA_K_K0AEA_K@Z
+?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIAEAVhandler@23@@Z
 ?reduGetMaxNumConcurrentWorkGroups@detail@_V1@sycl@@YAIV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@@Z
+?reduGetMaxWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z
 ?reduGetMaxWGSize@detail@_V1@sycl@@YA_KV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z
 ?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K@Z
+?reduGetPreferredWGSize@detail@_V1@sycl@@YA_KAEAVhandler@23@_K@Z
 ?registerDynamicParameter@handler@_V1@sycl@@AEAAXAEAVdynamic_parameter_base@detail@experimental@oneapi@ext@23@H@Z
 ?registerDynamicParameter@handler@_V1@sycl@@AEAAXPEAVdynamic_parameter_impl@detail@experimental@oneapi@ext@23@H@Z
 ?release_external_memory@experimental@oneapi@ext@_V1@sycl@@YAXUexternal_mem@12345@AEBVdevice@45@AEBVcontext@45@@Z
