Skip to content

Commit 52556d8

Browse files
authored
[SYCL] Introduce min_work_groups_per_cu and max_work_groups_per_mp (#11192)
The attributes correspond to CUDA's launch bounds `minBlocksPerMultiprocessor` and `maxBlocksPerCluster` respectively. See: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-dimension-directives-maxclusterrank and https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#launch-bounds for details.
1 parent 72a1024 commit 52556d8

17 files changed

+793
-11
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,6 +1587,22 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
15871587
let SupportsNonconformingLambdaSyntax = 1;
15881588
}
15891589

1590+
def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr {
1591+
let Spellings = [CXX11<"intel", "min_work_groups_per_cu">];
1592+
let Args = [ExprArgument<"Value">];
1593+
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
1594+
let Subjects = SubjectList<[Function], ErrorDiag>;
1595+
let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs];
1596+
}
1597+
1598+
def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr {
1599+
let Spellings = [CXX11<"intel", "max_work_groups_per_mp">];
1600+
let Args = [ExprArgument<"Value">];
1601+
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
1602+
let Subjects = SubjectList<[Function], ErrorDiag>;
1603+
let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs];
1604+
}
1605+
15901606
def SYCLIntelMaxGlobalWorkDim : InheritableAttr {
15911607
let Spellings = [CXX11<"intel", "max_global_work_dim">];
15921608
let Args = [ExprArgument<"Value">];

clang/include/clang/Basic/AttrDocs.td

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel.
30173017
}];
30183018
}
30193019

3020+
def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation {
3021+
let Category = DocCatFunction;
3022+
let Heading = "intel::min_work_groups_per_cu";
3023+
let Content = [{
3024+
Applies to a device function/lambda function. Indicates the desired minimum
3025+
number of resident work-groups per multiprocessor. It corresponds to the
3026+
.minnctapersm PTX directive.
3027+
3028+
.. code-block:: c++
3029+
3030+
[[intel::min_work_groups_per_cu(2)]] void foo() {}
3031+
3032+
class Foo {
3033+
public:
3034+
[[intel::min_work_groups_per_cu(2)]] void operator()() const {}
3035+
};
3036+
3037+
template <int N>
3038+
class Functor {
3039+
public:
3040+
[[intel::min_work_groups_per_cu(N)]] void operator()() const {}
3041+
};
3042+
3043+
template <int N>
3044+
[[intel::min_work_groups_per_cu(N)]] void func() {}
3045+
3046+
}];
3047+
}
3048+
3049+
def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation {
3050+
let Category = DocCatFunction;
3051+
let Heading = "intel::max_work_groups_per_mp";
3052+
let Content = [{
3053+
Applies to a device function/lambda function. Indicates the desired maximum
3054+
number of work-groups per cluster with which the application will ever launch. It
3055+
corresponds to the .maxclusterrank PTX directive. Note that the feature requires
3056+
SM_90 or higher.
3057+
3058+
.. code-block:: c++
3059+
3060+
[[intel::max_work_groups_per_mp(2)]] void foo() {}
3061+
3062+
class Foo {
3063+
public:
3064+
[[intel::max_work_groups_per_mp(2)]] void operator()() const {}
3065+
};
3066+
3067+
template <int N>
3068+
class Functor {
3069+
public:
3070+
[[intel::max_work_groups_per_mp(N)]] void operator()() const {}
3071+
};
3072+
3073+
template <int N>
3074+
[[intel::max_work_groups_per_mp(N)]] void func() {}
3075+
3076+
}];
3077+
}
3078+
30203079
def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation {
30213080
let Category = DocCatFunction;
30223081
let Heading = "intel::max_global_work_dim";

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12016,9 +12016,12 @@ def warn_sycl_kernel_return_type : Warning<
1201612016
def err_sycl_special_type_num_init_method : Error<
1201712017
"types with 'sycl_special_class' attribute must have one and only one '__init' "
1201812018
"method defined">;
12019+
def warn_launch_bounds_is_cuda_specific : Warning<
12020+
"%0 attribute ignored, only applicable when targeting Nvidia devices">,
12021+
InGroup<IgnoredAttributes>;
1201912022

1202012023
def warn_cuda_maxclusterrank_sm_90 : Warning<
12021-
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
12024+
"'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring "
1202212025
"%1 attribute">, InGroup<IgnoredAttributes>;
1202312026

1202412027
def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must "

clang/include/clang/Sema/Sema.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11432,6 +11432,16 @@ class Sema final {
1143211432
SYCLIntelMaxGlobalWorkDimAttr *
1143311433
MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D,
1143411434
const SYCLIntelMaxGlobalWorkDimAttr &A);
11435+
void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
11436+
Decl *D, const AttributeCommonInfo &CI, Expr *E);
11437+
SYCLIntelMinWorkGroupsPerComputeUnitAttr *
11438+
MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(
11439+
Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A);
11440+
void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
11441+
Decl *D, const AttributeCommonInfo &CI, Expr *E);
11442+
SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *
11443+
MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
11444+
Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A);
1143511445
void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI,
1143611446
Expr *E);
1143711447
SYCLIntelBankWidthAttr *

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
758758
llvm::MDNode::get(Context, AttrMDArgs));
759759
}
760760

761+
auto attrAsMDArg = [&](Expr *E) {
762+
const auto *CE = cast<ConstantExpr>(E);
763+
std::optional<llvm::APSInt> ArgVal = CE->getResultAsAPSInt();
764+
return llvm::ConstantAsMetadata::get(
765+
Builder.getInt32(ArgVal->getSExtValue()));
766+
};
767+
768+
if (const auto *A = FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
769+
Fn->setMetadata("min_work_groups_per_cu",
770+
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
771+
}
772+
773+
if (const auto *A =
774+
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
775+
Fn->setMetadata("max_work_groups_per_mp",
776+
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
777+
}
778+
761779
if (const SYCLIntelMaxWorkGroupSizeAttr *A =
762780
FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
763781

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
245245
// And kernel functions are not subject to inlining
246246
F->addFnAttr(llvm::Attribute::NoInline);
247247
}
248+
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
249+
auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
250+
(*MWGS->getYDimVal()).getExtValue() *
251+
(*MWGS->getXDimVal()).getExtValue();
252+
if (MaxThreads > 0)
253+
addNVVMMetadata(F, "maxntidx", MaxThreads);
254+
255+
auto attrValue = [&](Expr *E) {
256+
const auto *CE = cast<ConstantExpr>(E);
257+
std::optional<llvm::APInt> Val = CE->getResultAsAPSInt();
258+
return Val->getZExtValue();
259+
};
260+
261+
if (const auto *MWGPCU =
262+
FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
263+
// The value is guaranteed to be > 0, pass it to the metadata.
264+
addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue()));
265+
266+
if (const auto *MWGPMP =
267+
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
268+
// The value is guaranteed to be > 0, pass it to the metadata.
269+
addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
270+
}
271+
}
272+
}
248273
}
249274

250275
// Perform special handling in CUDA mode.

clang/lib/Sema/SemaDecl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
29992999
NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A);
30003000
else if (const auto *A = dyn_cast<SYCLWorkGroupSizeHintAttr>(Attr))
30013001
NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A);
3002+
else if (const auto *A =
3003+
dyn_cast<SYCLIntelMinWorkGroupsPerComputeUnitAttr>(Attr))
3004+
NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A);
3005+
else if (const auto *A =
3006+
dyn_cast<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>(Attr))
3007+
NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A);
30023008
else if (const auto *A = dyn_cast<SYCLIntelMaxGlobalWorkDimAttr>(Attr))
30033009
NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A);
30043010
else if (const auto *BTFA = dyn_cast<BTFDeclTagAttr>(Attr))

0 commit comments

Comments
 (0)