Skip to content

Commit 336b89e

Browse files
0x12CCmartygrant
andauthored
Update the kernel_queue_specific::max_num_work_group query (#16051)
Update calls to `urKernelSuggestMaxCooperativeGroupCountExp` to match the changes in oneapi-src/unified-runtime#2316. This PR also implements the `range<1>` and `range<2>` overloads of the `ext_oneapi_get_info` API from [sycl_ext_oneapi_launch_queries](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc). --------- Signed-off-by: Michael Aziz <[email protected]> Co-authored-by: Martin Morrison-Grant <[email protected]>
1 parent 95a858d commit 336b89e

File tree

10 files changed

+233
-96
lines changed

10 files changed

+233
-96
lines changed
Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
# commit 27398080349f1d8d21d6a8680e234d29dcd14734 (HEAD, origin/main, origin/HEAD)
2-
# Merge: 572355db942d dc971af72a31
1+
# commit 72e80a42cc8e5b11d43dd9d34b40d470e1476181 (HEAD, origin/main, origin/HEAD)
2+
# Merge: 6e5d0e6b9a47 9c7e56cc765b
33
# Author: Martin Grant <[email protected]>
4-
# Date: Thu Dec 5 14:57:07 2024 +0000
5-
# Merge pull request #2293 from yingcong-wu/yc-PR/241107-misc-minor-fix
6-
# [DeviceAsan] Serval bug fixes
7-
set(UNIFIED_RUNTIME_TAG 27398080349f1d8d21d6a8680e234d29dcd14734)
4+
# Date: Fri Dec 6 10:11:15 2024 +0000
5+
# Merge pull request #2316 from 0x12CC/coop_kernel_query
6+
# Change `urSuggestMaxCooperativeGroupCountExp` to accept ND size parameter
7+
set(UNIFIED_RUNTIME_TAG 72e80a42cc8e5b11d43dd9d34b40d470e1476181)
Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1 @@
1-
// TODO: Revisit 'max_num_work_group_sync' and align it with the
2-
// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
3-
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
41
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)

sycl/include/sycl/kernel.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,32 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
168168
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
169169
ext_oneapi_get_info(queue Queue) const;
170170

171+
/// Query queue/launch-specific information from a kernel using the
172+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
173+
/// max_num_work_groups is the only valid descriptor for this function.
174+
///
175+
/// \param Queue is a valid SYCL queue.
176+
/// \param WorkGroupSize is the work-group size the number of work-groups is
177+
/// requested for.
178+
/// \return depends on information being queried.
179+
template <typename Param>
180+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
181+
ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
182+
size_t DynamicLocalMemorySize) const;
183+
184+
/// Query queue/launch-specific information from a kernel using the
185+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
186+
/// max_num_work_groups is the only valid descriptor for this function.
187+
///
188+
/// \param Queue is a valid SYCL queue.
189+
/// \param WorkGroupSize is the work-group size the number of work-groups is
190+
/// requested for.
191+
/// \return depends on information being queried.
192+
template <typename Param>
193+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
194+
ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
195+
size_t DynamicLocalMemorySize) const;
196+
171197
/// Query queue/launch-specific information from a kernel using the
172198
/// info::kernel_queue_specific descriptor for a specific Queue and values.
173199
/// max_num_work_groups is the only valid descriptor for this function.

sycl/source/detail/kernel_impl.cpp

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -106,38 +106,6 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
106106
"interoperability function or to query a device built-in kernel");
107107
}
108108

109-
bool kernel_impl::exceedsOccupancyResourceLimits(
110-
const device &Device, const range<3> &WorkGroupSize,
111-
size_t DynamicLocalMemorySize) const {
112-
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
113-
// Generally, exceeding hardware resource limits will yield in an error when
114-
// the kernel is launched.
115-
const size_t MaxWorkGroupSize =
116-
get_info<info::kernel_device_specific::work_group_size>(Device);
117-
const size_t MaxLocalMemorySizeInBytes =
118-
Device.get_info<info::device::local_mem_size>();
119-
120-
if (WorkGroupSize.size() > MaxWorkGroupSize)
121-
return true;
122-
123-
if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
124-
return true;
125-
126-
// It will be impossible to launch a kernel for Cuda when the hardware limit
127-
// for the 32-bit registers page file size is exceeded.
128-
if (Device.get_backend() == backend::ext_oneapi_cuda) {
129-
const uint32_t RegsPerWorkItem =
130-
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
131-
const uint32_t MaxRegsPerWorkGroup =
132-
Device.get_info<ext::codeplay::experimental::info::device::
133-
max_registers_per_work_group>();
134-
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
135-
return true;
136-
}
137-
138-
return false;
139-
}
140-
141109
template <>
142110
typename info::platform::version::return_type
143111
kernel_impl::get_backend_info<info::platform::version>() const {

sycl/source/detail/kernel_impl.hpp

Lines changed: 101 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,32 @@ class kernel_impl {
123123
template <typename Param>
124124
typename Param::return_type ext_oneapi_get_info(queue Queue) const;
125125

126+
/// Query queue/launch-specific information from a kernel using the
127+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
128+
/// max_num_work_groups is the only valid descriptor for this function.
129+
///
130+
/// \param Queue is a valid SYCL queue.
131+
/// \param WorkGroupSize is the work-group size the number of work-groups is
132+
/// requested for.
133+
/// \return depends on information being queried.
134+
template <typename Param>
135+
typename Param::return_type
136+
ext_oneapi_get_info(queue Queue, const range<1> &MaxWorkGroupSize,
137+
size_t DynamicLocalMemorySize) const;
138+
139+
/// Query queue/launch-specific information from a kernel using the
140+
/// info::kernel_queue_specific descriptor for a specific Queue and values.
141+
/// max_num_work_groups is the only valid descriptor for this function.
142+
///
143+
/// \param Queue is a valid SYCL queue.
144+
/// \param WorkGroupSize is the work-group size the number of work-groups is
145+
/// requested for.
146+
/// \return depends on information being queried.
147+
template <typename Param>
148+
typename Param::return_type
149+
ext_oneapi_get_info(queue Queue, const range<2> &MaxWorkGroupSize,
150+
size_t DynamicLocalMemorySize) const;
151+
126152
/// Query queue/launch-specific information from a kernel using the
127153
/// info::kernel_queue_specific descriptor for a specific Queue and values.
128154
/// max_num_work_groups is the only valid descriptor for this function.
@@ -193,11 +219,49 @@ class kernel_impl {
193219

194220
/// Check if the occupancy limits are exceeded for the given kernel launch
195221
/// configuration.
222+
template <int Dimensions>
196223
bool exceedsOccupancyResourceLimits(const device &Device,
197-
const range<3> &WorkGroupSize,
224+
const range<Dimensions> &WorkGroupSize,
198225
size_t DynamicLocalMemorySize) const;
226+
template <int Dimensions>
227+
size_t queryMaxNumWorkGroups(queue Queue,
228+
const range<Dimensions> &WorkGroupSize,
229+
size_t DynamicLocalMemorySize) const;
199230
};
200231

232+
template <int Dimensions>
233+
bool kernel_impl::exceedsOccupancyResourceLimits(
234+
const device &Device, const range<Dimensions> &WorkGroupSize,
235+
size_t DynamicLocalMemorySize) const {
236+
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
237+
// Generally, exceeding hardware resource limits will yield in an error when
238+
// the kernel is launched.
239+
const size_t MaxWorkGroupSize =
240+
get_info<info::kernel_device_specific::work_group_size>(Device);
241+
const size_t MaxLocalMemorySizeInBytes =
242+
Device.get_info<info::device::local_mem_size>();
243+
244+
if (WorkGroupSize.size() > MaxWorkGroupSize)
245+
return true;
246+
247+
if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
248+
return true;
249+
250+
// It will be impossible to launch a kernel for Cuda when the hardware limit
251+
// for the 32-bit registers page file size is exceeded.
252+
if (Device.get_backend() == backend::ext_oneapi_cuda) {
253+
const uint32_t RegsPerWorkItem =
254+
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
255+
const uint32_t MaxRegsPerWorkGroup =
256+
Device.get_info<ext::codeplay::experimental::info::device::
257+
max_registers_per_work_group>();
258+
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
259+
return true;
260+
}
261+
262+
return false;
263+
}
264+
201265
template <typename Param>
202266
inline typename Param::return_type kernel_impl::get_info() const {
203267
static_assert(is_kernel_info_desc<Param>::value,
@@ -244,13 +308,11 @@ kernel_impl::get_info(const device &Device,
244308

245309
namespace syclex = ext::oneapi::experimental;
246310

247-
template <>
248-
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
249-
return_type
250-
kernel_impl::ext_oneapi_get_info<
251-
syclex::info::kernel_queue_specific::max_num_work_groups>(
252-
queue Queue, const range<3> &WorkGroupSize,
253-
size_t DynamicLocalMemorySize) const {
311+
template <int Dimensions>
312+
size_t
313+
kernel_impl::queryMaxNumWorkGroups(queue Queue,
314+
const range<Dimensions> &WorkGroupSize,
315+
size_t DynamicLocalMemorySize) const {
254316
if (WorkGroupSize.size() == 0)
255317
throw exception(sycl::make_error_code(errc::invalid),
256318
"The launch work-group size cannot be zero.");
@@ -259,12 +321,21 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
259321
const auto &Handle = getHandleRef();
260322
auto Device = Queue.get_device();
261323

324+
size_t WG[Dimensions];
325+
WG[0] = WorkGroupSize[0];
326+
if constexpr (Dimensions >= 2)
327+
WG[1] = WorkGroupSize[1];
328+
if constexpr (Dimensions == 3)
329+
WG[2] = WorkGroupSize[2];
330+
262331
uint32_t GroupCount{0};
263332
if (auto Result = Adapter->call_nocheck<
264333
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
265-
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
266-
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
267-
// The feature is supported. Check for other errors and throw if any.
334+
Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount);
335+
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&
336+
Result != UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE) {
337+
// The feature is supported and the group size is valid. Check for other
338+
// errors and throw if any.
268339
Adapter->checkUrResult(Result);
269340
return GroupCount;
270341
}
@@ -278,30 +349,33 @@ inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
278349
}
279350

280351
template <>
281-
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
352+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
282353
return_type
283354
kernel_impl::ext_oneapi_get_info<
284-
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
285-
queue Queue, const range<3> &WorkGroupSize,
355+
syclex::info::kernel_queue_specific::max_num_work_groups>(
356+
queue Queue, const range<1> &WorkGroupSize,
286357
size_t DynamicLocalMemorySize) const {
287-
return ext_oneapi_get_info<
288-
syclex::info::kernel_queue_specific::max_num_work_groups>(
289-
Queue, WorkGroupSize, DynamicLocalMemorySize);
358+
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
290359
}
291360

292361
template <>
293-
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
362+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
294363
return_type
295364
kernel_impl::ext_oneapi_get_info<
296-
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
297-
queue Queue) const {
298-
auto Device = Queue.get_device();
299-
const auto MaxWorkGroupSize =
300-
get_info<info::kernel_device_specific::work_group_size>(Device);
301-
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
302-
return ext_oneapi_get_info<
303-
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
304-
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
365+
syclex::info::kernel_queue_specific::max_num_work_groups>(
366+
queue Queue, const range<2> &WorkGroupSize,
367+
size_t DynamicLocalMemorySize) const {
368+
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
369+
}
370+
371+
template <>
372+
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
373+
return_type
374+
kernel_impl::ext_oneapi_get_info<
375+
syclex::info::kernel_queue_specific::max_num_work_groups>(
376+
queue Queue, const range<3> &WorkGroupSize,
377+
size_t DynamicLocalMemorySize) const {
378+
return queryMaxNumWorkGroups(Queue, WorkGroupSize, DynamicLocalMemorySize);
305379
}
306380

307381
} // namespace detail

sycl/source/kernel.cpp

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,26 +113,41 @@ kernel::ext_oneapi_get_info(queue Queue) const {
113113

114114
template <typename Param>
115115
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
116-
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
116+
kernel::ext_oneapi_get_info(queue Queue, const range<1> &WorkGroupSize,
117117
size_t DynamicLocalMemorySize) const {
118118
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
119119
DynamicLocalMemorySize);
120120
}
121121

122-
template __SYCL_EXPORT typename ext::oneapi::experimental::info::
123-
kernel_queue_specific::max_num_work_group_sync::return_type
124-
kernel::ext_oneapi_get_info<
125-
ext::oneapi::experimental::info::kernel_queue_specific::
126-
max_num_work_group_sync>(queue Queue) const;
122+
template <typename Param>
123+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
124+
kernel::ext_oneapi_get_info(queue Queue, const range<2> &WorkGroupSize,
125+
size_t DynamicLocalMemorySize) const {
126+
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
127+
DynamicLocalMemorySize);
128+
}
129+
130+
template <typename Param>
131+
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
132+
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
133+
size_t DynamicLocalMemorySize) const {
134+
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
135+
DynamicLocalMemorySize);
136+
}
127137

128138
#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
129139
template __SYCL_EXPORT ReturnT \
140+
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
141+
queue, const range<1> &, size_t) const; \
142+
template __SYCL_EXPORT ReturnT \
143+
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
144+
queue, const range<2> &, size_t) const; \
145+
template __SYCL_EXPORT ReturnT \
130146
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
131147
queue, const range<3> &, size_t) const;
132148
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
133149
// kernel_queue_specific queries require the above-defined get_info interface.
134150
// clang-format off
135-
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
136151
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
137152
// clang-format on
138153
#undef __SYCL_PARAM_TRAITS_SPEC
@@ -143,5 +158,53 @@ ur_native_handle_t kernel::getNative() const { return impl->getNative(); }
143158

144159
ur_native_handle_t kernel::getNativeImpl() const { return impl->getNative(); }
145160

161+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
162+
// The following query was deprecated since it doesn't include a way to specify
163+
// the invdividual dimensions of the work group. All of the contents of this
164+
// #ifndef block should be removed during the next ABI breaking window.
165+
namespace ext::oneapi::experimental::info::kernel_queue_specific {
166+
struct max_num_work_group_sync {
167+
using return_type = size_t;
168+
};
169+
} // namespace ext::oneapi::experimental::info::kernel_queue_specific
170+
template <>
171+
struct detail::is_kernel_queue_specific_info_desc<
172+
ext::oneapi::experimental::info::kernel_queue_specific::
173+
max_num_work_group_sync> : std::true_type {
174+
using return_type = ext::oneapi::experimental::info::kernel_queue_specific::
175+
max_num_work_group_sync::return_type;
176+
};
177+
template <>
178+
__SYCL2020_DEPRECATED(
179+
"The 'max_num_work_group_sync' query is deprecated. See "
180+
"'sycl_ext_oneapi_launch_queries' for the new 'max_num_work_groups' query.")
181+
__SYCL_EXPORT typename ext::oneapi::experimental::info::kernel_queue_specific::
182+
max_num_work_group_sync::return_type kernel::ext_oneapi_get_info<
183+
ext::oneapi::experimental::info::kernel_queue_specific::
184+
max_num_work_group_sync>(queue Queue, const range<3> &WorkGroupSize,
185+
size_t DynamicLocalMemorySize) const {
186+
return ext_oneapi_get_info<ext::oneapi::experimental::info::
187+
kernel_queue_specific::max_num_work_groups>(
188+
Queue, WorkGroupSize, DynamicLocalMemorySize);
189+
}
190+
template <>
191+
__SYCL2020_DEPRECATED(
192+
"The 'max_num_work_group_sync' query is deprecated. See "
193+
"'sycl_ext_oneapi_launch_queries' for the new 'max_num_work_groups' query.")
194+
__SYCL_EXPORT typename ext::oneapi::experimental::info::kernel_queue_specific::
195+
max_num_work_group_sync::return_type kernel::ext_oneapi_get_info<
196+
ext::oneapi::experimental::info::kernel_queue_specific::
197+
max_num_work_group_sync>(queue Queue) const {
198+
auto Device = Queue.get_device();
199+
const auto MaxWorkGroupSize =
200+
get_info<info::kernel_device_specific::work_group_size>(Device);
201+
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
202+
return ext_oneapi_get_info<ext::oneapi::experimental::info::
203+
kernel_queue_specific::max_num_work_groups>(
204+
Queue, WorkGroupSize,
205+
/* DynamicLocalMemorySize */ 0);
206+
}
207+
#endif
208+
146209
} // namespace _V1
147210
} // namespace sycl

sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -162,14 +162,10 @@ int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
162162
// It cannot be possible to launch a kernel successfully with a configuration
163163
// that exceeds the available resources as in the above defined workGroupSize.
164164
// workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
165-
// Note: Level-Zero currently always returns a non-zero value.
166-
// TODO: Remove the backend condition once the Level-Zero API issue is fixed.
167-
if (dev.get_backend() != sycl::backend::ext_oneapi_level_zero) {
168-
assert(maxWGs == 0 &&
169-
"max_num_work_groups query failed.\n"
170-
"It should return 0 possible groups when the requested resources "
171-
"by the lanuch config exceed those available in the hardware.");
172-
}
165+
assert(maxWGs == 0 &&
166+
"max_num_work_groups query failed.\n"
167+
"It should return 0 possible groups when the requested resources "
168+
"by the lanuch config exceed those available in the hardware.");
173169

174170
// As we ensured that the 'max_num_work_groups' query correctly
175171
// returns 0 possible work-groups, test that the kernel launch will fail.

0 commit comments

Comments
 (0)