Skip to content

[SYCL] Introduce min_work_groups_per_cu and max_work_groups_per_mp #11192

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,22 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
let SupportsNonconformingLambdaSyntax = 1;
}

// Attribute spelled [[intel::min_work_groups_per_cu(Value)]].
// Takes a single expression argument and is restricted to Function subjects.
def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr {
  // How the attribute is written and what it accepts.
  let Spellings = [CXX11<"intel", "min_work_groups_per_cu">];
  let Args = [ExprArgument<"Value">];

  // Where the attribute may appear.
  let Subjects = SubjectList<[Function], ErrorDiag>;
  let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];

  let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs];
}

// Attribute spelled [[intel::max_work_groups_per_mp(Value)]].
// Takes a single expression argument and is restricted to Function subjects.
def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr {
  // How the attribute is written and what it accepts.
  let Spellings = [CXX11<"intel", "max_work_groups_per_mp">];
  let Args = [ExprArgument<"Value">];

  // Where the attribute may appear.
  let Subjects = SubjectList<[Function], ErrorDiag>;
  let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];

  let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs];
}

def SYCLIntelMaxGlobalWorkDim : InheritableAttr {
let Spellings = [CXX11<"intel", "max_global_work_dim">];
let Args = [ExprArgument<"Value">];
Expand Down
59 changes: 59 additions & 0 deletions clang/include/clang/Basic/AttrDocs.td
Original file line number Diff line number Diff line change
Expand Up @@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel.
}];
}

// User-facing documentation for [[intel::min_work_groups_per_cu]].
// The Content body is reStructuredText; the code-block payload must stay
// indented relative to the directive for it to render as code.
def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation {
  let Category = DocCatFunction;
  let Heading = "intel::min_work_groups_per_cu";
  let Content = [{
Applies to a device function/lambda function. Indicates the desired minimum
number of resident work-groups per multiprocessor. It corresponds to the
.minnctapersm PTX directive.

.. code-block:: c++

  [[intel::min_work_groups_per_cu(2)]] void foo() {}

  class Foo {
  public:
    [[intel::min_work_groups_per_cu(2)]] void operator()() const {}
  };

  template <int N>
  class Functor {
  public:
    [[intel::min_work_groups_per_cu(N)]] void operator()() const {}
  };

  template <int N>
  [[intel::min_work_groups_per_cu(N)]] void func() {}

  }];
}

// User-facing documentation for [[intel::max_work_groups_per_mp]].
// The Content body is reStructuredText; the code-block payload must stay
// indented relative to the directive for it to render as code.
def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation {
  let Category = DocCatFunction;
  let Heading = "intel::max_work_groups_per_mp";
  let Content = [{
Applies to a device function/lambda function. Indicates the desired maximum
number of work-groups per cluster with which the application will ever launch.
It corresponds to the .maxclusterrank PTX directive. Note that the feature
requires SM_90 or higher.

.. code-block:: c++

  [[intel::max_work_groups_per_mp(2)]] void foo() {}

  class Foo {
  public:
    [[intel::max_work_groups_per_mp(2)]] void operator()() const {}
  };

  template <int N>
  class Functor {
  public:
    [[intel::max_work_groups_per_mp(N)]] void operator()() const {}
  };

  template <int N>
  [[intel::max_work_groups_per_mp(N)]] void func() {}

  }];
}

def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation {
let Category = DocCatFunction;
let Heading = "intel::max_global_work_dim";
Expand Down
5 changes: 4 additions & 1 deletion clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -12002,9 +12002,12 @@ def warn_sycl_kernel_return_type : Warning<
// Error text: a type carrying the 'sycl_special_class' attribute must define
// exactly one '__init' method. The message takes no format arguments.
def err_sycl_special_type_num_init_method : Error<
"types with 'sycl_special_class' attribute must have one and only one '__init' "
"method defined">;
// Warning in the IgnoredAttributes group. %0 is substituted in front of
// "attribute ignored", i.e. it names the attribute being dropped.
def warn_launch_bounds_is_cuda_specific : Warning<
"%0 attribute ignored, only applicable when targeting Nvidia devices">,
InGroup<IgnoredAttributes>;

// Warning in the IgnoredAttributes group.
//   %0 - the CUDA architecture that was provided
//   %1 - the attribute being ignored
// Only the quoted-'maxclusterrank' wording is kept: adjacent string literals
// are concatenated, so leaving the superseded unquoted line in place would
// duplicate the first half of the message.
def warn_cuda_maxclusterrank_sm_90 : Warning<
  "'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring "
  "%1 attribute">, InGroup<IgnoredAttributes>;

def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must "
Expand Down
10 changes: 10 additions & 0 deletions clang/include/clang/Sema/Sema.h
Original file line number Diff line number Diff line change
Expand Up @@ -11432,6 +11432,16 @@ class Sema final {
SYCLIntelMaxGlobalWorkDimAttr *
MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D,
const SYCLIntelMaxGlobalWorkDimAttr &A);
/// Entry point for [[intel::min_work_groups_per_cu]]: receives the target
/// declaration \p D, the attribute's common info \p CI, and the argument
/// expression \p E.
/// NOTE(review): presumably validates \p E and attaches the attribute to
/// \p D; the definition is not visible here, confirm against the .cpp.
void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
Decl *D, const AttributeCommonInfo &CI, Expr *E);
/// Merge hook for SYCLIntelMinWorkGroupsPerComputeUnitAttr. Called from
/// mergeDeclAttribute() with the declaration and the attribute \p A being
/// merged; the returned pointer is stored as the new attribute there.
/// NOTE(review): null-return semantics are not visible here, confirm.
SYCLIntelMinWorkGroupsPerComputeUnitAttr *
MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(
Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A);
/// Entry point for [[intel::max_work_groups_per_mp]]: receives the target
/// declaration \p D, the attribute's common info \p CI, and the argument
/// expression \p E.
/// NOTE(review): presumably validates \p E and attaches the attribute to
/// \p D; the definition is not visible here, confirm against the .cpp.
void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
Decl *D, const AttributeCommonInfo &CI, Expr *E);
/// Merge hook for SYCLIntelMaxWorkGroupsPerMultiprocessorAttr. Called from
/// mergeDeclAttribute() with the declaration and the attribute \p A being
/// merged; the returned pointer is stored as the new attribute there.
/// NOTE(review): null-return semantics are not visible here, confirm.
SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *
MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A);
void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *E);
SYCLIntelBankWidthAttr *
Expand Down
18 changes: 18 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
llvm::MDNode::get(Context, AttrMDArgs));
}

auto attrAsMDArg = [&](Expr *E) {
const auto *CE = cast<ConstantExpr>(E);
std::optional<llvm::APSInt> ArgVal = CE->getResultAsAPSInt();
return llvm::ConstantAsMetadata::get(
Builder.getInt32(ArgVal->getSExtValue()));
};

if (const auto *A = FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
Fn->setMetadata("min_work_groups_per_cu",
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
}

if (const auto *A =
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
Fn->setMetadata("max_work_groups_per_mp",
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
}

if (const SYCLIntelMaxWorkGroupSizeAttr *A =
FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {

Expand Down
25 changes: 25 additions & 0 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
// And kernel functions are not subject to inlining
F->addFnAttr(llvm::Attribute::NoInline);
}
// Translate the SYCL work-group attributes into NVVM annotations.
// The three annotations are deliberately nested: "minnctapersm" is only
// emitted when a max-work-group-size attribute is present, and
// "maxclusterrank" only when "minnctapersm" is emitted as well.
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
  // Total thread count is the product of the three dimension values.
  auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
                    (*MWGS->getYDimVal()).getExtValue() *
                    (*MWGS->getXDimVal()).getExtValue();
  if (MaxThreads > 0)
    addNVVMMetadata(F, "maxntidx", MaxThreads);

  // Reads the evaluated integer out of a ConstantExpr attribute argument
  // (cast<> asserts if E is anything else). getResultAsAPSInt() returns the
  // value directly, so it is queried in place instead of being copied into a
  // std::optional<APInt> that is then dereferenced unchecked.
  auto attrValue = [&](Expr *E) {
    return cast<ConstantExpr>(E)->getResultAsAPSInt().getZExtValue();
  };

  if (const auto *MWGPCU =
          FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
    // The value is guaranteed to be > 0, pass it to the metadata.
    addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue()));

    if (const auto *MWGPMP =
            FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
      // The value is guaranteed to be > 0, pass it to the metadata.
      addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
    }
  }
}
}

// Perform special handling in CUDA mode.
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Sema/SemaDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A);
else if (const auto *A = dyn_cast<SYCLWorkGroupSizeHintAttr>(Attr))
NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A);
// Work-groups-per-CU / per-multiprocessor attributes: hand each one to its
// dedicated Sema merge helper and keep the result as NewAttr.
else if (const auto *A =
dyn_cast<SYCLIntelMinWorkGroupsPerComputeUnitAttr>(Attr))
NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A);
else if (const auto *A =
dyn_cast<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>(Attr))
NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A);
else if (const auto *A = dyn_cast<SYCLIntelMaxGlobalWorkDimAttr>(Attr))
NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A);
else if (const auto *BTFA = dyn_cast<BTFDeclTagAttr>(Attr))
Expand Down
Loading