Skip to content

Commit bbe994d

Browse files
committed
[SYCL] Introduce min_work_groups_per_cu and max_work_groups_per_mp
The attributes correspond to CUDA's launch-bounds parameters minBlocksPerMultiprocessor and maxBlocksPerCluster, respectively.
1 parent 77243d2 commit bbe994d

File tree

13 files changed

+504
-8
lines changed

13 files changed

+504
-8
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,24 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
15831583
let SupportsNonconformingLambdaSyntax = 1;
15841584
}
15851585

1586+
// Declares the [[intel::min_work_groups_per_cu(Value)]] attribute: a C++11
// spelling in the 'intel' namespace taking one expression argument. It is
// restricted to function subjects (ErrorDiag for anything else) and is tied
// to the SYCLIsDevice / SilentlyIgnoreSYCLIsHost language options.
// NOTE(review): SupportsNonconformingLambdaSyntax presumably lets the
// attribute be written on a lambda in the non-standard position -- confirm.
def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr {
1587+
let Spellings = [CXX11<"intel", "min_work_groups_per_cu">];
1588+
let Args = [ExprArgument<"Value">];
1589+
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
1590+
let Subjects = SubjectList<[Function], ErrorDiag>;
1591+
let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs];
1592+
let SupportsNonconformingLambdaSyntax = 1;
1593+
}
1594+
1595+
// Declares the [[intel::max_work_groups_per_mp(Value)]] attribute: a C++11
// spelling in the 'intel' namespace taking one expression argument. It is
// restricted to function subjects (ErrorDiag for anything else) and is tied
// to the SYCLIsDevice / SilentlyIgnoreSYCLIsHost language options.
// NOTE(review): the Documentation record is named '...MultiprocessorDocs',
// without the 'Attr' infix used by SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs;
// consider aligning the names in a coordinated change.
def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr {
1596+
let Spellings = [CXX11<"intel", "max_work_groups_per_mp">];
1597+
let Args = [ExprArgument<"Value">];
1598+
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
1599+
let Subjects = SubjectList<[Function], ErrorDiag>;
1600+
let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs];
1601+
let SupportsNonconformingLambdaSyntax = 1;
1602+
}
1603+
15861604
def SYCLIntelMaxGlobalWorkDim : InheritableAttr {
15871605
let Spellings = [CXX11<"intel", "max_global_work_dim">];
15881606
let Args = [ExprArgument<"Value">];

clang/include/clang/Basic/AttrDocs.td

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel.
30173017
}];
30183018
}
30193019

3020+
def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation {
3021+
let Category = DocCatFunction;
3022+
let Heading = "intel::min_work_groups_per_cu";
3023+
let Content = [{
3024+
Applies to a device function/lambda function. Indicates the desired minimum
3025+
number of resident work-groups per multiprocessor. It corresponds to the
3026+
.minnctapersm PTX directive.
3027+
3028+
.. code-block:: c++
3029+
3030+
[[intel::min_work_groups_per_cu(2)]] void foo() {}
3031+
3032+
class Foo {
3033+
public:
3034+
[[intel::min_work_groups_per_cu(2)]] void operator()() const {}
3035+
};
3036+
3037+
template <int N>
3038+
class Functor {
3039+
public:
3040+
[[intel::min_work_groups_per_cu(N)]] void operator()() const {}
3041+
};
3042+
3043+
template <int N>
3044+
[[intel::min_work_groups_per_cu(N)]] void func() {}
3045+
3046+
}];
3047+
}
3048+
3049+
def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation {
3050+
let Category = DocCatFunction;
3051+
let Heading = "intel::max_work_groups_per_mp";
3052+
let Content = [{
3053+
Applies to a device function/lambda function. Indicates the desired maximum
3054+
number of work-groups per cluster with which the application will ever launch.
3055+
It corresponds to the .maxclusterrank PTX directive. Note that this feature
3056+
requires SM_90 or higher.
3057+
3058+
.. code-block:: c++
3059+
3060+
[[intel::max_work_groups_per_mp(2)]] void foo() {}
3061+
3062+
class Foo {
3063+
public:
3064+
[[intel::max_work_groups_per_mp(2)]] void operator()() const {}
3065+
};
3066+
3067+
template <int N>
3068+
class Functor {
3069+
public:
3070+
[[intel::max_work_groups_per_mp(N)]] void operator()() const {}
3071+
};
3072+
3073+
template <int N>
3074+
[[intel::max_work_groups_per_mp(N)]] void func() {}
3075+
3076+
}];
3077+
}
3078+
30203079
def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation {
30213080
let Category = DocCatFunction;
30223081
let Heading = "intel::max_global_work_dim";

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11986,6 +11986,9 @@ def warn_sycl_kernel_return_type : Warning<
1198611986
def err_sycl_special_type_num_init_method : Error<
1198711987
"types with 'sycl_special_class' attribute must have one and only one '__init' "
1198811988
"method defined">;
11989+
// Warning in the IgnoredAttributes group reporting that an attribute (named
// by %0) is being ignored because it only applies to Nvidia targets.
// NOTE(review): "targetting" in the message text is misspelled ("targeting");
// correcting it also requires updating any test that matches this diagnostic.
def warn_launch_bounds_is_cuda_specific : Warning<
11990+
"%0 attribute ignored, only applicable when targetting Nvidia devices">,
11991+
InGroup<IgnoredAttributes>;
1198911992

1199011993
def warn_cuda_maxclusterrank_sm_90 : Warning<
1199111994
"maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "

clang/include/clang/Sema/Sema.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11392,6 +11392,16 @@ class Sema final {
1139211392
SYCLIntelMaxGlobalWorkDimAttr *
1139311393
MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D,
1139411394
const SYCLIntelMaxGlobalWorkDimAttr &A);
11395+
// Sema entry points for [[intel::min_work_groups_per_cu]].
// NOTE(review): the definitions are not visible here. By analogy with the
// neighbouring Add*/Merge* pairs, Add* presumably validates E and attaches the
// attribute to D, and Merge* presumably reconciles A with an attribute already
// present on D -- confirm against the definitions.
void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
11396+
Decl *D, const AttributeCommonInfo &CI, Expr *E);
11397+
SYCLIntelMinWorkGroupsPerComputeUnitAttr *
11398+
MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(
11399+
Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A);
11400+
// Sema entry points for [[intel::max_work_groups_per_mp]]; same signature
// shape as the min_work_groups_per_cu pair directly above.
void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
11401+
Decl *D, const AttributeCommonInfo &CI, Expr *E);
11402+
SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *
11403+
MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
11404+
Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A);
1139511405
void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI,
1139611406
Expr *E);
1139711407
SYCLIntelBankWidthAttr *

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,25 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
758758
llvm::MDNode::get(Context, AttrMDArgs));
759759
}
760760

761+
// If the function carries SYCLIntelMinWorkGroupsPerComputeUnitAttr, attach
// "min_work_groups_per_cu" metadata to Fn whose single operand is the
// attribute's constant value emitted through Builder.getInt32.
if (const auto *A = FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
762+
// cast<> (not dyn_cast<>): the value is expected to already be a
// ConstantExpr at this point.
const auto *CE = cast<ConstantExpr>(A->getValue());
763+
std::optional<llvm::APSInt> ArgVal = CE->getResultAsAPSInt();
764+
llvm::Metadata *AttrMDArgs[] = {llvm::ConstantAsMetadata::get(
765+
Builder.getInt32(ArgVal->getSExtValue()))};
766+
Fn->setMetadata("min_work_groups_per_cu",
767+
llvm::MDNode::get(Context, AttrMDArgs));
768+
}
769+
770+
// If the function carries SYCLIntelMaxWorkGroupsPerMultiprocessorAttr, attach
// "max_work_groups_per_mp" metadata to Fn whose single operand is the
// attribute's constant value emitted through Builder.getInt32.
if (const auto *A =
771+
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
772+
// cast<> (not dyn_cast<>): the value is expected to already be a
// ConstantExpr at this point.
const auto *CE = cast<ConstantExpr>(A->getValue());
773+
std::optional<llvm::APSInt> ArgVal = CE->getResultAsAPSInt();
774+
llvm::Metadata *AttrMDArgs[] = {llvm::ConstantAsMetadata::get(
775+
Builder.getInt32(ArgVal->getSExtValue()))};
776+
Fn->setMetadata("max_work_groups_per_mp",
777+
llvm::MDNode::get(Context, AttrMDArgs));
778+
}
779+
761780
if (const SYCLIntelMaxWorkGroupSizeAttr *A =
762781
FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
763782

clang/lib/CodeGen/Targets/NVPTX.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
245245
// And kernel functions are not subject to inlining
246246
F->addFnAttr(llvm::Attribute::NoInline);
247247
}
248+
// Map SYCLIntelMaxWorkGroupSizeAttr onto the NVVM "maxntidx" annotation: the
// operand is the product of the Z, Y and X dimension values, and the
// annotation is only emitted when that product is positive.
// NOTE(review): the three dimension values are dereferenced without a check,
// so this assumes all of them are set; also confirm the product cannot exceed
// the operand type accepted by addNVVMMetadata.
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
249+
auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
250+
(*MWGS->getYDimVal()).getExtValue() *
251+
(*MWGS->getXDimVal()).getExtValue();
252+
if (MaxThreads > 0)
253+
addNVVMMetadata(F, "maxntidx", MaxThreads);
254+
}
255+
// Map SYCLIntelMinWorkGroupsPerComputeUnitAttr onto the NVVM "minnctapersm"
// annotation. Nothing is emitted when the attribute value is not a
// ConstantExpr (the dyn_cast below fails).
if (const auto *MWGPCU =
256+
FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
257+
auto *MinWorkGroups = MWGPCU->getValue();
258+
if (const auto *CE = dyn_cast<ConstantExpr>(MinWorkGroups)) {
259+
auto MinVal = CE->getResultAsAPSInt();
260+
// The value is guaranteed to be > 0, pass it to the metadata.
// NOTE(review): that guarantee is presumably enforced during semantic
// analysis, which is not visible here -- confirm.
261+
addNVVMMetadata(F, "minnctapersm", MinVal.getExtValue());
262+
}
263+
}
264+
// Map SYCLIntelMaxWorkGroupsPerMultiprocessorAttr onto the NVVM
// "maxclusterrank" annotation. Nothing is emitted when the attribute value is
// not a ConstantExpr (the dyn_cast below fails).
// NOTE(review): no target-architecture check is visible in this hunk; confirm
// the sm_90 requirement is diagnosed elsewhere.
if (const auto *MWGPMP =
265+
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
266+
auto *MaxWorkGroups = MWGPMP->getValue();
267+
if (const auto *CE = dyn_cast<ConstantExpr>(MaxWorkGroups)) {
268+
auto MaxVal = CE->getResultAsAPSInt();
269+
// The value is guaranteed to be > 0, pass it to the metadata.
// NOTE(review): that guarantee is presumably enforced during semantic
// analysis, which is not visible here -- confirm.
270+
addNVVMMetadata(F, "maxclusterrank", MaxVal.getExtValue());
271+
}
272+
}
248273
}
249274

250275
// Perform special handling in CUDA mode.

clang/lib/Sema/SemaDecl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
29992999
NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A);
30003000
else if (const auto *A = dyn_cast<SYCLWorkGroupSizeHintAttr>(Attr))
30013001
NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A);
3002+
// Dispatch the two SYCL launch-bounds style attributes to their dedicated
// Sema merge routines; the result is stored in NewAttr like the other arms of
// this chain.
else if (const auto *A =
3003+
dyn_cast<SYCLIntelMinWorkGroupsPerComputeUnitAttr>(Attr))
3004+
NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A);
3005+
else if (const auto *A =
3006+
dyn_cast<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>(Attr))
3007+
NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A);
30023008
else if (const auto *A = dyn_cast<SYCLIntelMaxGlobalWorkDimAttr>(Attr))
30033009
NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A);
30043010
else if (const auto *BTFA = dyn_cast<BTFDeclTagAttr>(Attr))

0 commit comments

Comments
 (0)