Skip to content

Commit af8361c

Browse files
authored
[SYCL] Add max work-group size kernel properties (#14518)
This patch adds two kernel properties that allow users to specify the maximum work-group size that a kernel will be invoked with. The `max_work_group_size` property corresponds to the `intel::max_work_group_size` function attribute, but can be specified with 1, 2, or 3 dimensions (unlike the attribute, which accepts only 3). The `max_linear_work_group_size` property is similar, but is always a single value which denotes the combined linear (total) work-group size. It can be used when the user cannot guarantee a maximum bound in each individual dimension in which they wish to run the kernel, but can guarantee a bound on the total. This acts similarly to CUDA's `maxThreadsPerBlock` launch bounds property. This patch also wires up the `max_work_group_size` property to the equivalent SPIR-V execution mode, which should hopefully improve certain use cases.
1 parent 2f0abc6 commit af8361c

File tree

15 files changed

+806
-51
lines changed

15 files changed

+806
-51
lines changed

llvm/lib/SYCLLowerIR/CompileTimePropertiesPass.cpp

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -362,19 +362,24 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
362362
AddFPControlMetadataForWidth(SPIRV_DENORM_PRESERVE, 64);
363363
}
364364

365-
if (AttrKindStr == "sycl-work-group-size" ||
366-
AttrKindStr == "sycl-work-group-size-hint") {
365+
static constexpr std::tuple<const char *, const char *> SimpleWGAttrs[] = {
366+
{"sycl-work-group-size", "reqd_work_group_size"},
367+
{"sycl-work-group-size-hint", "work_group_size_hint"},
368+
{"sycl-max-work-group-size", "max_work_group_size"},
369+
};
370+
371+
for (auto &[AttrKind, MDStr] : SimpleWGAttrs) {
372+
if (AttrKindStr != AttrKind)
373+
continue;
367374
// Split values in the comma-separated list integers.
368-
SmallVector<StringRef, 3> ValStrs;
369-
Attr.getValueAsString().split(ValStrs, ',');
375+
SmallVector<StringRef, 3> AttrValStrs;
376+
Attr.getValueAsString().split(AttrValStrs, ',');
370377

371-
size_t NumDims = ValStrs.size();
372-
assert(NumDims <= 3 &&
373-
"sycl-work-group-size and sycl-work-group-size-hint currently only "
374-
"support up to three values");
378+
size_t NumDims = AttrValStrs.size();
379+
assert(NumDims <= 3 && "Incorrect number of values for kernel property");
375380

376381
// SYCL work-group sizes must be reversed for SPIR-V.
377-
std::reverse(ValStrs.begin(), ValStrs.end());
382+
std::reverse(AttrValStrs.begin(), AttrValStrs.end());
378383

379384
// Use integer pointer size as closest analogue to size_t.
380385
IntegerType *IntPtrTy = DLayout.getIntPtrType(Ctx);
@@ -383,7 +388,7 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
383388

384389
// Get the integers from the strings.
385390
SmallVector<Metadata *, 3> MDVals;
386-
for (StringRef ValStr : ValStrs)
391+
for (StringRef ValStr : AttrValStrs)
387392
MDVals.push_back(ConstantAsMetadata::get(
388393
Constant::getIntegerValue(SizeTTy, APInt(SizeTBitSize, ValStr, 10))));
389394
while (MDVals.size() < 3)
@@ -397,10 +402,7 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
397402
Type::getInt32Ty(Ctx), NumDims))));
398403
}
399404

400-
const char *MDName = (AttrKindStr == "sycl-work-group-size")
401-
? "reqd_work_group_size"
402-
: "work_group_size_hint";
403-
return std::pair<std::string, MDNode *>(MDName, MDNode::get(Ctx, MDVals));
405+
return std::pair<std::string, MDNode *>(MDStr, MDNode::get(Ctx, MDVals));
404406
}
405407

406408
if (AttrKindStr == "sycl-sub-group-size") {
@@ -413,6 +415,21 @@ attributeToExecModeMetadata(const Attribute &Attr, Function &F) {
413415
MDNode::get(Ctx, MD));
414416
}
415417

418+
if (AttrKindStr == "sycl-max-linear-work-group-size") {
419+
auto MaxLinearSize = getAttributeAsInteger<uint64_t>(Attr);
420+
// Use integer pointer size as closest analogue to size_t.
421+
IntegerType *IntPtrTy = DLayout.getIntPtrType(Ctx);
422+
IntegerType *SizeTTy = Type::getIntNTy(Ctx, IntPtrTy->getBitWidth());
423+
unsigned SizeTBitSize = SizeTTy->getBitWidth();
424+
425+
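// Wrap the already-parsed integer value in a metadata constant.
// NOTE(review): MaxLinearSize is presumably an integer here, not a string, so
// the trailing `10` passed to APInt below would bind to its isSigned
// parameter rather than acting as a radix - confirm and drop it if so.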
426+
Metadata *MD = ConstantAsMetadata::get(Constant::getIntegerValue(
427+
SizeTTy, APInt(SizeTBitSize, MaxLinearSize, 10)));
428+
429+
return std::pair<std::string, MDNode *>("max_linear_work_group_size",
430+
MDNode::get(Ctx, MD));
431+
}
432+
416433
// The sycl-single-task attribute currently only has an effect when targeting
417434
// SPIR FPGAs, in which case it will generate a "max_global_work_dim" MD node
418435
// with a 0 value, similar to applying [[intel::max_global_work_dim(0)]] to

llvm/lib/SYCLLowerIR/ComputeModuleRuntimeInfo.cpp

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -112,28 +112,35 @@ std::vector<StringRef> getKernelNamesUsingAssert(const Module &M) {
112112
return SPIRKernelNames;
113113
}
114114

115-
// Gets reqd_work_group_size information for function Func.
116-
std::vector<uint32_t> getKernelReqdWorkGroupSizeMetadata(const Function &Func) {
117-
MDNode *ReqdWorkGroupSizeMD = Func.getMetadata("reqd_work_group_size");
118-
if (!ReqdWorkGroupSizeMD)
115+
// Gets 1- to 3-dimension work-group related information for function Func.
116+
// Returns an empty vector if not present.
117+
template <typename T>
118+
std::vector<T> getKernelWorkGroupMetadata(const Function &Func,
119+
const char *MDName) {
120+
MDNode *WorkGroupMD = Func.getMetadata(MDName);
121+
if (!WorkGroupMD)
119122
return {};
120-
size_t NumOperands = ReqdWorkGroupSizeMD->getNumOperands();
123+
size_t NumOperands = WorkGroupMD->getNumOperands();
121124
assert(NumOperands >= 1 && NumOperands <= 3 &&
122-
"reqd_work_group_size does not have between 1 and 3 operands.");
123-
std::vector<uint32_t> OutVals;
125+
"work-group metadata does not have between 1 and 3 operands.");
126+
std::vector<T> OutVals;
124127
OutVals.reserve(NumOperands);
125-
for (const MDOperand &MDOp : ReqdWorkGroupSizeMD->operands())
128+
for (const MDOperand &MDOp : WorkGroupMD->operands())
126129
OutVals.push_back(mdconst::extract<ConstantInt>(MDOp)->getZExtValue());
127130
return OutVals;
128131
}
129-
// Gets work_group_num_dim information for function Func, conveniently 0 if
130-
// metadata is not present.
131-
uint32_t getKernelWorkGroupNumDim(const Function &Func) {
132-
MDNode *MaxDimMD = Func.getMetadata("work_group_num_dim");
133-
if (!MaxDimMD)
134-
return 0;
135-
assert(MaxDimMD->getNumOperands() == 1 && "Malformed node.");
136-
return mdconst::extract<ConstantInt>(MaxDimMD->getOperand(0))->getZExtValue();
132+
133+
// Gets a single-dimensional piece of information for function Func.
134+
// Returns std::nullopt if metadata is not present.
135+
template <typename T>
136+
std::optional<T> getKernelSingleEltMetadata(const Function &Func,
137+
const char *MDName) {
138+
if (MDNode *MaxDimMD = Func.getMetadata(MDName)) {
139+
assert(MaxDimMD->getNumOperands() == 1 && "Malformed node.");
140+
return mdconst::extract<ConstantInt>(MaxDimMD->getOperand(0))
141+
->getZExtValue();
142+
}
143+
return std::nullopt;
137144
}
138145

139146
PropSetRegTy computeModuleProperties(const Module &M,
@@ -249,22 +256,40 @@ PropSetRegTy computeModuleProperties(const Module &M,
249256
SmallVector<std::string, 4> MetadataNames;
250257

251258
if (GlobProps.EmitProgramMetadata) {
252-
// Add reqd_work_group_size and work_group_num_dim information to
253-
// program metadata.
259+
// Add various pieces of function metadata to program metadata.
254260
for (const Function &Func : M.functions()) {
255-
std::vector<uint32_t> KernelReqdWorkGroupSize =
256-
getKernelReqdWorkGroupSizeMetadata(Func);
257-
if (!KernelReqdWorkGroupSize.empty()) {
261+
// Note - we're implicitly truncating 64-bit work-group data to 32 bits in
262+
// all work-group related metadata. All current consumers of this program
263+
// metadata format only support SYCL ID queries that fit within MAX_INT.
264+
if (auto KernelReqdWorkGroupSize = getKernelWorkGroupMetadata<uint32_t>(
265+
Func, "reqd_work_group_size");
266+
!KernelReqdWorkGroupSize.empty()) {
258267
MetadataNames.push_back(Func.getName().str() + "@reqd_work_group_size");
259268
PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(),
260269
KernelReqdWorkGroupSize);
261270
}
262271

263-
uint32_t WorkGroupNumDim = getKernelWorkGroupNumDim(Func);
264-
if (WorkGroupNumDim) {
272+
if (auto WorkGroupNumDim = getKernelSingleEltMetadata<uint32_t>(
273+
Func, "work_group_num_dim")) {
265274
MetadataNames.push_back(Func.getName().str() + "@work_group_num_dim");
266275
PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(),
267-
WorkGroupNumDim);
276+
*WorkGroupNumDim);
277+
}
278+
279+
if (auto KernelMaxWorkGroupSize =
280+
getKernelWorkGroupMetadata<uint32_t>(Func, "max_work_group_size");
281+
!KernelMaxWorkGroupSize.empty()) {
282+
MetadataNames.push_back(Func.getName().str() + "@max_work_group_size");
283+
PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(),
284+
KernelMaxWorkGroupSize);
285+
}
286+
287+
if (auto MaxLinearWGSize = getKernelSingleEltMetadata<uint64_t>(
288+
Func, "max_linear_work_group_size")) {
289+
MetadataNames.push_back(Func.getName().str() +
290+
"@max_linear_work_group_size");
291+
PropSet.add(PropSetRegTy::SYCL_PROGRAM_METADATA, MetadataNames.back(),
292+
*MaxLinearWGSize);
268293
}
269294
}
270295

llvm/lib/SYCLLowerIR/SYCLCreateNVVMAnnotations.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ SYCLCreateNVVMAnnotationsPass::run(Module &M, ModuleAnalysisManager &MAM) {
9797

9898
constexpr static std::pair<const char *, const char *>
9999
SingleValAnnotations[] = {{"min_work_groups_per_cu", "minctasm"},
100-
{"max_work_groups_per_mp", "maxclusterrank"}};
100+
{"max_work_groups_per_mp", "maxclusterrank"},
101+
{"max_linear_work_group_size", "maxntidx"}};
101102

102103
for (auto &[MDName, AnnotationName] : SingleValAnnotations) {
103104
if (MDNode *Node = F.getMetadata(MDName)) {

sycl/cmake/modules/FetchUnifiedRuntime.cmake

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
117117
endfunction()
118118

119119
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
120-
# commit 45ad7c52a75e6d3e52f658e38e796563744914c7
121-
# Merge: 7a2caca5 3bf2becb
120+
# commit 7ecf64d60c31cd72bd88588498536d067bad59d6
121+
# Merge: 17aa04d3 6eb5208b
122122
# Author: aarongreig <[email protected]>
123-
# Date: Tue Sep 24 08:04:54 2024 -0700
124-
# Merge pull request #2116 from RossBrunton/ross/morewarn
125-
# More warning squishing
126-
set(UNIFIED_RUNTIME_TAG 45ad7c52a75e6d3e52f658e38e796563744914c7)
123+
# Date: Wed Sep 25 11:14:47 2024 +0100
124+
# Merge pull request #1996 from frasercrmck/ur-max-wg-size-props
125+
# Add two new properties to ur_kernel_group_info_t
126+
set(UNIFIED_RUNTIME_TAG 7ecf64d60c31cd72bd88588498536d067bad59d6)
127127

128128
set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
129129
# Due to the use of dependentloadflag and no installer for UMF and hwloc we need

sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ Jessica Davies, Intel +
5858
Joe Garvey, Intel +
5959
Greg Lueck, Intel +
6060
John Pennycook, Intel +
61-
Roland Schulz, Intel
61+
Roland Schulz, Intel +
62+
Fraser Cormack, Codeplay
6263

6364
== Overview
6465

@@ -232,6 +233,68 @@ SYCL implementations may introduce additional kernel properties. If any
232233
combinations of kernel attributes are invalid, this must be clearly documented
233234
as part of the new kernel property definition.
234235

236+
=== Kernel Properties for the CUDA backend
237+
238+
The kernel properties specified in this section may only be used to decorate
239+
kernels that are submitted to the CUDA backend. Attempting to submit a kernel
240+
with these properties to another backend has undefined behavior.
241+
242+
```c++
243+
namespace sycl {
244+
namespace ext {
245+
namespace oneapi {
246+
namespace experimental {
247+
248+
struct max_work_group_size_key {
249+
template <size_t... Dims>
250+
using value_t = property_value<max_work_group_size_key, std::integral_constant<size_t, Dims>...>;
251+
}; // max_work_group_size_key
252+
253+
struct max_linear_work_group_size_key {
254+
template <size_t Size>
255+
using value_t = property_value<max_linear_work_group_size_key, std::integral_constant<size_t, Size>>;
256+
}; // max_linear_work_group_size_key
257+
258+
template <size_t... Dims>
259+
inline constexpr max_work_group_size_key::value_t<Dims...> max_work_group_size;
260+
261+
template <size_t Size>
262+
inline constexpr max_linear_work_group_size_key::value_t<Size> max_linear_work_group_size;
263+
264+
template <> struct is_property_key<max_work_group_size_key> : std::true_type {};
265+
template <> struct is_property_key<max_linear_work_group_size_key> : std::true_type {};
266+
267+
} // namespace experimental
268+
} // namespace oneapi
269+
} // namespace ext
270+
} // namespace sycl
271+
```
272+
273+
|===
274+
|Property|Description
275+
276+
|`max_work_group_size`
277+
|The `max_work_group_size` property provides a promise to the compiler
278+
that the kernel will never be launched with a larger work-group than the
279+
specified size. The number of template arguments in the `Dims` parameter pack
280+
must match the dimensionality of the work-group used to invoke the kernel. The
281+
order of the template arguments matches the constructor of the `range` class.
282+
283+
If the kernel is submitted with an `nd_range` whose work-group size exceeds the size specified
284+
by the property, the implementation must throw a synchronous exception with the
285+
`errc::nd_range` error code.
286+
287+
|`max_linear_work_group_size`
288+
|The `max_linear_work_group_size` property provides a promise to the compiler
289+
that the kernel will never be launched with a work-group for which the return
290+
value of `group::get_local_linear_range()` exceeds the specified amount.
291+
292+
If the kernel is submitted with an `nd_range` whose linear work-group size exceeds the size specified
293+
by the property, the implementation must throw a synchronous exception with the
294+
`errc::nd_range` error code.
295+
296+
|===
297+
235298
=== Adding a Property List to a Kernel Launch
236299

237300
To enable properties to be associated with kernels, this extension adds

sycl/include/sycl/ext/oneapi/kernel_properties/properties.hpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,21 @@ struct single_task_kernel_key {
7171
using value_t = property_value<single_task_kernel_key>;
7272
};
7373

74+
struct max_work_group_size_key
75+
: detail::compile_time_property_key<detail::PropKind::MaxWorkGroupSize> {
76+
template <size_t... Dims>
77+
using value_t = property_value<max_work_group_size_key,
78+
std::integral_constant<size_t, Dims>...>;
79+
};
80+
81+
struct max_linear_work_group_size_key
82+
: detail::compile_time_property_key<
83+
detail::PropKind::MaxLinearWorkGroupSize> {
84+
template <size_t Size>
85+
using value_t = property_value<max_linear_work_group_size_key,
86+
std::integral_constant<size_t, Size>>;
87+
};
88+
7489
template <size_t Dim0, size_t... Dims>
7590
struct property_value<work_group_size_key, std::integral_constant<size_t, Dim0>,
7691
std::integral_constant<size_t, Dims>...> {
@@ -138,6 +153,28 @@ template <> struct property_value<single_task_kernel_key> {
138153
using key_t = single_task_kernel_key;
139154
};
140155

156+
template <size_t Dim0, size_t... Dims>
157+
struct property_value<max_work_group_size_key,
158+
std::integral_constant<size_t, Dim0>,
159+
std::integral_constant<size_t, Dims>...> {
160+
static_assert(sizeof...(Dims) + 1 <= 3,
161+
"max_work_group_size property currently "
162+
"only supports up to three values.");
163+
static_assert(
164+
detail::AllNonZero<Dim0, Dims...>::value,
165+
"max_work_group_size property must only contain non-zero values.");
166+
167+
using key_t = max_work_group_size_key;
168+
169+
constexpr size_t operator[](int Dim) const {
170+
return std::array<size_t, sizeof...(Dims) + 1>{Dim0, Dims...}[Dim];
171+
}
172+
};
173+
174+
template <> struct property_value<max_linear_work_group_size_key> {
175+
using key_t = max_linear_work_group_size_key;
176+
};
177+
141178
template <size_t Dim0, size_t... Dims>
142179
inline constexpr work_group_size_key::value_t<Dim0, Dims...> work_group_size;
143180

@@ -156,6 +193,14 @@ inline constexpr nd_range_kernel_key::value_t<Dims> nd_range_kernel;
156193

157194
inline constexpr single_task_kernel_key::value_t single_task_kernel;
158195

196+
template <size_t Dim0, size_t... Dims>
197+
inline constexpr max_work_group_size_key::value_t<Dim0, Dims...>
198+
max_work_group_size;
199+
200+
template <size_t Size>
201+
inline constexpr max_linear_work_group_size_key::value_t<Size>
202+
max_linear_work_group_size;
203+
159204
struct work_group_progress_key
160205
: detail::compile_time_property_key<detail::PropKind::WorkGroupProgress> {
161206
template <forward_progress_guarantee Guarantee,
@@ -283,6 +328,16 @@ template <> struct PropertyMetaInfo<single_task_kernel_key::value_t> {
283328
static constexpr const char *name = "sycl-single-task-kernel";
284329
static constexpr int value = 0;
285330
};
331+
template <size_t Dim0, size_t... Dims>
332+
struct PropertyMetaInfo<max_work_group_size_key::value_t<Dim0, Dims...>> {
333+
static constexpr const char *name = "sycl-max-work-group-size";
334+
static constexpr const char *value = SizeListToStr<Dim0, Dims...>::value;
335+
};
336+
template <size_t Size>
337+
struct PropertyMetaInfo<max_linear_work_group_size_key::value_t<Size>> {
338+
static constexpr const char *name = "sycl-max-linear-work-group-size";
339+
static constexpr size_t value = Size;
340+
};
286341

287342
template <typename T, typename = void>
288343
struct HasKernelPropertiesGetMethod : std::false_type {};

sycl/include/sycl/ext/oneapi/properties/property.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,10 @@ enum PropKind : uint32_t {
212212
Balanced = 71,
213213
InvocationCapacity = 72,
214214
ResponseCapacity = 73,
215+
MaxWorkGroupSize = 74,
216+
MaxLinearWorkGroupSize = 75,
215217
// PropKindSize must always be the last value.
216-
PropKindSize = 74,
218+
PropKindSize = 76,
217219
};
218220

219221
struct property_key_base_tag {};

0 commit comments

Comments
 (0)