Skip to content

Commit 81aacfa

Browse files
authored
[SYCL] Implement max_num_work_groups from the launch queries extension (#14333)
This PR implements the `max_num_work_groups ` query from the `sycl_ext_oneapi_launch_queries` extension. Additionally, this PR introduces changes that overload `ext_oneapi_get_info` for another kernel-queue-specific query - `max_num_work_group_sync` to take extra parameters for local work-group size and dynamic local memory size (in bytes) in order to allow users to pass those runtime resource limiting factors to the query, so they are taken into account in the final group count suggestion.
1 parent 729d6f6 commit 81aacfa

File tree

16 files changed

+424
-37
lines changed

16 files changed

+424
-37
lines changed

sycl/cmake/modules/FetchUnifiedRuntime.cmake

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
117117
endfunction()
118118

119119
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
120-
# commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5
121-
# Merge: a99dbcee 3abe18cf
122-
# Author: Piotr Balcer <piotr.balcer@intel.com>
123-
# Date: Fri Sep 6 17:21:17 2024 +0200
124-
# Merge pull request #1820 from pbalcer/static-linking
125-
# Add support for static linking of the L0 adapter
126-
set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5)
120+
# commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
121+
# Merge: e26bba51 45a781f4
122+
# Author: Omar Ahmed <omar.ahmed@codeplay.com>
123+
# Date: Tue Sep 10 12:08:57 2024 +0100
124+
# Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
125+
# [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
126+
set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)
127127

128128
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
129129
# Due to the use of dependentloadflag and no installer for UMF and hwloc we need

sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,11 @@ otherwise it is 0.
204204
|Returns the maximum number of work-groups, when the kernel is submitted to the
205205
specified queue with the specified work-group size and the specified amount of
206206
dynamic work-group local memory (in bytes), accounting for any kernel
207-
properties or features. If the kernel can be submitted to the specified queue
208-
without an error, the minimum value returned by this query is 1, otherwise it
209-
is 0.
207+
properties or features. If the specified work-group size is 0, which is
208+
invalid, then the implementation will throw a synchronous exception with the
209+
`errc::invalid` error code. If the kernel can be submitted to the specified
210+
queue without an error, the minimum value returned by this query is 1,
211+
otherwise it is 0.
210212

211213
|===
212214

sycl/include/sycl/detail/info_desc_helpers.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
3131
template <typename T> struct is_kernel_info_desc : std::false_type {};
3232
template <typename T>
3333
struct is_kernel_device_specific_info_desc : std::false_type {};
34+
template <typename T>
35+
struct is_kernel_queue_specific_info_desc : std::false_type {};
3436
template <typename T> struct is_event_info_desc : std::false_type {};
3537
template <typename T> struct is_event_profiling_info_desc : std::false_type {};
3638
// Normally we would just use std::enable_if to limit valid get_info template
@@ -134,6 +136,16 @@ struct IsKernelInfo<info::kernel_device_specific::ext_codeplay_num_regs>
134136
#include <sycl/info/ext_intel_device_traits.def>
135137
#include <sycl/info/ext_oneapi_device_traits.def>
136138
#undef __SYCL_PARAM_TRAITS_SPEC
139+
140+
#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode) \
141+
template <> \
142+
struct is_##DescType##_info_desc<Namespace::info::DescType::Desc> \
143+
: std::true_type { \
144+
using return_type = Namespace::info::DescType::Desc::return_type; \
145+
};
146+
#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
147+
#undef __SYCL_PARAM_TRAITS_SPEC
148+
137149
#define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode) \
138150
template <> \
139151
struct is_backend_info_desc<info::DescType::Desc> : std::true_type { \

sycl/include/sycl/ext/oneapi/experimental/root_group.hpp

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,8 @@ namespace sycl {
2424
inline namespace _V1 {
2525
namespace ext::oneapi::experimental {
2626

27-
namespace info::kernel_queue_specific {
28-
// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
29-
// #7598 is merged.
30-
struct max_num_work_group_sync {
31-
using return_type = size_t;
32-
};
33-
} // namespace info::kernel_queue_specific
27+
// See 'sycl/info/kernel_device_specific_traits.def' for the kernel
28+
// device-specific properties that relate to 'root_group'.
3429

3530
template <int Dimensions> class root_group {
3631
public:
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// TODO: Revisit 'max_num_work_group_sync' and align it with the
2+
// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
3+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
4+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)

sycl/include/sycl/info/info_desc.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,8 @@ struct work_item_progress_capabilities;
247247
#include <sycl/info/ext_codeplay_device_traits.def>
248248
#include <sycl/info/ext_intel_device_traits.def>
249249
#include <sycl/info/ext_oneapi_device_traits.def>
250+
#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
251+
250252
#undef __SYCL_PARAM_TRAITS_SPEC
251253
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
252254
} // namespace _V1

sycl/include/sycl/kernel.hpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
159159
get_info(const device &Device, const range<3> &WGSize) const;
160160

161161
// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
162-
// once #7598 is merged.
162+
// once #7598 is merged. (regarding the 'max_num_work_group_sync' query)
163+
164+
/// Query queue/launch-specific information from a kernel using the
165+
/// info::kernel_queue_specific descriptor for a specific Queue.
166+
///
167+
/// \param Queue is a valid SYCL queue.
168+
/// \return depends on information being queried.
169+
template <typename Param>
170+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
171+
ext_oneapi_get_info(queue Queue) const;
172+
173+
/// Query queue/launch-specific information from a kernel using the
174+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
175+
/// max_num_work_groups is the only valid descriptor for this function.
176+
///
177+
/// \param Queue is a valid SYCL queue.
178+
/// \param WorkGroupSize is the work-group size the number of work-groups is
179+
/// requested for.
180+
/// \return depends on information being queried.
163181
template <typename Param>
164-
typename Param::return_type ext_oneapi_get_info(const queue &q) const;
182+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
183+
ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
184+
size_t DynamicLocalMemorySize) const;
165185

166186
private:
167187
/// Constructs a SYCL kernel object from a valid kernel_impl instance.

sycl/source/detail/kernel_impl.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
106106
"interoperability function or to query a device built-in kernel");
107107
}
108108

109+
bool kernel_impl::exceedsOccupancyResourceLimits(
110+
const device &Device, const range<3> &WorkGroupSize,
111+
size_t DynamicLocalMemorySize) const {
112+
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
113+
// Generally, exceeding hardware resource limits will yield in an error when
114+
// the kernel is launched.
115+
const size_t MaxWorkGroupSize =
116+
get_info<info::kernel_device_specific::work_group_size>(Device);
117+
const size_t MaxLocalMemorySizeInBytes =
118+
Device.get_info<info::device::local_mem_size>();
119+
120+
if (WorkGroupSize.size() > MaxWorkGroupSize)
121+
return true;
122+
123+
if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
124+
return true;
125+
126+
// It will be impossible to launch a kernel for Cuda when the hardware limit
127+
// for the 32-bit registers page file size is exceeded.
128+
if (Device.get_backend() == backend::ext_oneapi_cuda) {
129+
const uint32_t RegsPerWorkItem =
130+
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
131+
const uint32_t MaxRegsPerWorkGroup =
132+
Device.get_info<ext::codeplay::experimental::info::device::
133+
max_registers_per_work_group>();
134+
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
135+
return true;
136+
}
137+
138+
return false;
139+
}
140+
109141
template <>
110142
typename info::platform::version::return_type
111143
kernel_impl::get_backend_info<info::platform::version>() const {

sycl/source/detail/kernel_impl.hpp

Lines changed: 80 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,26 @@ class kernel_impl {
114114
typename Param::return_type get_info(const device &Device,
115115
const range<3> &WGSize) const;
116116

117+
/// Query queue/launch-specific information from a kernel using the
118+
/// info::kernel_queue_specific descriptor for a specific Queue.
119+
///
120+
/// \param Queue is a valid SYCL queue.
121+
/// \return depends on information being queried.
122+
template <typename Param>
123+
typename Param::return_type ext_oneapi_get_info(queue Queue) const;
124+
125+
/// Query queue/launch-specific information from a kernel using the
126+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
127+
/// max_num_work_groups is the only valid descriptor for this function.
128+
///
129+
/// \param Queue is a valid SYCL queue.
130+
/// \param WorkGroupSize is the work-group size the number of work-groups is
131+
/// requested for.
132+
/// \return depends on information being queried.
117133
template <typename Param>
118-
typename Param::return_type ext_oneapi_get_info(const queue &q) const;
134+
typename Param::return_type
135+
ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize,
136+
size_t DynamicLocalMemorySize) const;
119137

120138
/// Get a constant reference to a raw kernel object.
121139
///
@@ -171,6 +189,12 @@ class kernel_impl {
171189

172190
bool isBuiltInKernel(const device &Device) const;
173191
void checkIfValidForNumArgsInfoQuery() const;
192+
193+
/// Check if the occupancy limits are exceeded for the given kernel launch
194+
/// configuration.
195+
bool exceedsOccupancyResourceLimits(const device &Device,
196+
const range<3> &WorkGroupSize,
197+
size_t DynamicLocalMemorySize) const;
174198
};
175199

176200
template <typename Param>
@@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device,
217241
getPlugin());
218242
}
219243

244+
namespace syclex = ext::oneapi::experimental;
245+
220246
template <>
221-
inline typename ext::oneapi::experimental::info::kernel_queue_specific::
222-
max_num_work_group_sync::return_type
247+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
248+
return_type
223249
kernel_impl::ext_oneapi_get_info<
224-
ext::oneapi::experimental::info::kernel_queue_specific::
225-
max_num_work_group_sync>(const queue &Queue) const {
250+
syclex::info::kernel_queue_specific::max_num_work_groups>(
251+
queue Queue, const range<3> &WorkGroupSize,
252+
size_t DynamicLocalMemorySize) const {
253+
if (WorkGroupSize.size() == 0)
254+
throw exception(sycl::make_error_code(errc::invalid),
255+
"The launch work-group size cannot be zero.");
256+
226257
const auto &Plugin = getPlugin();
227258
const auto &Handle = getHandleRef();
259+
auto Device = Queue.get_device();
260+
261+
uint32_t GroupCount{0};
262+
if (auto Result = Plugin->call_nocheck<
263+
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
264+
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
265+
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
266+
// The feature is supported. Check for other errors and throw if any.
267+
Plugin->checkUrResult(Result);
268+
return GroupCount;
269+
}
270+
271+
// Fallback. If the backend API is unsupported, this query will return either
272+
// 0 or 1 based on the kernel resource usage and the user-requested resources.
273+
return exceedsOccupancyResourceLimits(Device, WorkGroupSize,
274+
DynamicLocalMemorySize)
275+
? 0
276+
: 1;
277+
}
278+
279+
template <>
280+
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
281+
return_type
282+
kernel_impl::ext_oneapi_get_info<
283+
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
284+
queue Queue, const range<3> &WorkGroupSize,
285+
size_t DynamicLocalMemorySize) const {
286+
return ext_oneapi_get_info<
287+
syclex::info::kernel_queue_specific::max_num_work_groups>(
288+
Queue, WorkGroupSize, DynamicLocalMemorySize);
289+
}
290+
291+
template <>
292+
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
293+
return_type
294+
kernel_impl::ext_oneapi_get_info<
295+
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
296+
queue Queue) const {
297+
auto Device = Queue.get_device();
228298
const auto MaxWorkGroupSize =
229-
Queue.get_device().get_info<info::device::max_work_group_size>();
230-
uint32_t GroupCount = 0;
231-
Plugin->call<UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
232-
Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
233-
return GroupCount;
299+
get_info<info::kernel_device_specific::work_group_size>(Device);
300+
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
301+
return ext_oneapi_get_info<
302+
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
303+
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
234304
}
235305

236306
} // namespace detail

sycl/source/kernel.cpp

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,16 +106,36 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
106106
const device &, const sycl::range<3> &) const;
107107

108108
template <typename Param>
109-
typename Param::return_type
110-
kernel::ext_oneapi_get_info(const queue &Queue) const {
109+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
110+
kernel::ext_oneapi_get_info(queue Queue) const {
111111
return impl->ext_oneapi_get_info<Param>(Queue);
112112
}
113113

114+
template <typename Param>
115+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
116+
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
117+
size_t DynamicLocalMemorySize) const {
118+
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
119+
DynamicLocalMemorySize);
120+
}
121+
114122
template __SYCL_EXPORT typename ext::oneapi::experimental::info::
115123
kernel_queue_specific::max_num_work_group_sync::return_type
116124
kernel::ext_oneapi_get_info<
117125
ext::oneapi::experimental::info::kernel_queue_specific::
118-
max_num_work_group_sync>(const queue &Queue) const;
126+
max_num_work_group_sync>(queue Queue) const;
127+
128+
#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
129+
template __SYCL_EXPORT ReturnT \
130+
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
131+
queue, const range<3> &, size_t) const;
132+
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
133+
// kernel_queue_specific queries require the above-defined get_info interface.
134+
// clang-format off
135+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
136+
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
137+
// clang-format on
138+
#undef __SYCL_PARAM_TRAITS_SPEC
119139

120140
kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}
121141

0 commit comments

Comments
 (0)