Skip to content

Commit c4e460b

Browse files
author
Jun Wang
committed
Support 2 attributes: one for min and one for max number of work groups.
1 parent 5c088a5 commit c4e460b

File tree

11 files changed

+444
-82
lines changed

11 files changed

+444
-82
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,10 +2031,17 @@ def AMDGPUNumVGPR : InheritableAttr {
20312031
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
20322032
}
20332033

2034-
def AMDGPUNumWorkGroups : InheritableAttr {
2035-
let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
2036-
let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
2037-
let Documentation = [AMDGPUNumWorkGroupsDocs];
2034+
def AMDGPUMinNumWorkGroups : InheritableAttr {
2035+
let Spellings = [Clang<"amdgpu_min_num_work_groups", 0>];
2036+
let Args = [UnsignedArgument<"MinNumWorkGroupsX">, UnsignedArgument<"MinNumWorkGroupsY">, UnsignedArgument<"MinNumWorkGroupsZ">];
2037+
let Documentation = [AMDGPUMinNumWorkGroupsDocs];
2038+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
2039+
}
2040+
2041+
def AMDGPUMaxNumWorkGroups : InheritableAttr {
2042+
let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
2043+
let Args = [UnsignedArgument<"MaxNumWorkGroupsX">, UnsignedArgument<"MaxNumWorkGroupsY">, UnsignedArgument<"MaxNumWorkGroupsZ">];
2044+
let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
20382045
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
20392046
}
20402047

clang/include/clang/Basic/AttrDocs.td

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2705,14 +2705,38 @@ An error will be given if:
27052705
}];
27062706
}
27072707

2708-
def AMDGPUNumWorkGroupsDocs : Documentation {
2708+
def AMDGPUMinNumWorkGroupsDocs : Documentation {
27092709
let Category = DocCatAMDGPUAttributes;
27102710
let Content = [{
2711-
The number of work groups specifies the number of work groups when the kernel
2711+
The min number of work groups specifies the min number of work groups when the kernel
27122712
is dispatched.
27132713

27142714
Clang supports the
2715-
``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
2715+
``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
2716+
AMDGPU target. This attribute may be attached to a kernel function definition
2717+
and is an optimization hint.
2718+
2719+
``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
2720+
Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
2721+
2722+
If specified, the AMDGPU target backend might be able to produce better machine
2723+
code.
2724+
2725+
An error will be given if:
2726+
- Specified values violate subtarget specifications;
2727+
- Specified values are not compatible with values provided through other
2728+
attributes.
2729+
}];
2730+
}
2731+
2732+
def AMDGPUMaxNumWorkGroupsDocs : Documentation {
2733+
let Category = DocCatAMDGPUAttributes;
2734+
let Content = [{
2735+
The max number of work groups specifies the max number of work groups when the kernel
2736+
is dispatched.
2737+
2738+
Clang supports the
2739+
``__attribute__((amdgpu_min_num_work_groups(<x>, <y>, <z>)))`` attribute for the
27162740
AMDGPU target. This attribute may be attached to a kernel function definition
27172741
and is an optimization hint.
27182742

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -357,18 +357,55 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
357357
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
358358
}
359359

360-
if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
361-
uint32_t X = Attr->getNumWorkGroupsX();
362-
uint32_t Y = Attr->getNumWorkGroupsY();
363-
uint32_t Z = Attr->getNumWorkGroupsZ();
364-
365-
if (X != 0 && Y != 0 && Z != 0) {
366-
std::string AttrVal = llvm::utostr(X) + std::string(", ") +
367-
llvm::utostr(Y) + std::string(", ") +
368-
llvm::utostr(Z);
369-
F->addFnAttr("amdgpu-num-work-groups", AttrVal);
360+
uint32_t MinWGX = 0;
361+
uint32_t MinWGY = 0;
362+
uint32_t MinWGZ = 0;
363+
364+
uint32_t MaxWGX = 0;
365+
uint32_t MaxWGY = 0;
366+
uint32_t MaxWGZ = 0;
367+
368+
bool IsMinNumWGValid = false;
369+
bool IsMaxNumWGValid = false;
370+
371+
if (const auto *Attr = FD->getAttr<AMDGPUMinNumWorkGroupsAttr>()) {
372+
MinWGX = Attr->getMinNumWorkGroupsX();
373+
MinWGY = Attr->getMinNumWorkGroupsY();
374+
MinWGZ = Attr->getMinNumWorkGroupsZ();
375+
376+
if (MinWGX != 0 && MinWGY != 0 && MinWGZ != 0)
377+
IsMinNumWGValid = true;
378+
}
379+
380+
if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
381+
MaxWGX = Attr->getMaxNumWorkGroupsX();
382+
MaxWGY = Attr->getMaxNumWorkGroupsY();
383+
MaxWGZ = Attr->getMaxNumWorkGroupsZ();
384+
385+
if (MaxWGX != 0 && MaxWGY != 0 && MaxWGZ != 0)
386+
IsMaxNumWGValid = true;
387+
}
388+
389+
if (IsMinNumWGValid && IsMaxNumWGValid) {
390+
if (MinWGX > MaxWGX || MinWGY > MaxWGY || MinWGZ > MaxWGZ) {
391+
IsMinNumWGValid = false;
392+
IsMaxNumWGValid = false;
370393
}
371394
}
395+
396+
if (IsMinNumWGValid) {
397+
std::string AttrVal = llvm::utostr(MinWGX) + std::string(", ") +
398+
llvm::utostr(MinWGY) + std::string(", ") +
399+
llvm::utostr(MinWGZ);
400+
F->addFnAttr("amdgpu-min-num-work-groups", AttrVal);
401+
}
402+
403+
if (IsMaxNumWGValid) {
404+
std::string AttrVal = llvm::utostr(MaxWGX) + std::string(", ") +
405+
llvm::utostr(MaxWGY) + std::string(", ") +
406+
llvm::utostr(MaxWGZ);
407+
F->addFnAttr("amdgpu-max-num-work-groups", AttrVal);
408+
}
372409
}
373410

374411
/// Emits control constants used to change per-architecture behaviour in the

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8069,23 +8069,42 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
80698069
D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
80708070
}
80718071

8072-
static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
8073-
const ParsedAttr &AL) {
8074-
uint32_t NumWGX = 0;
8075-
uint32_t NumWGY = 0;
8076-
uint32_t NumWGZ = 0;
8077-
Expr *NumWGXExpr = AL.getArgAsExpr(0);
8078-
Expr *NumWGYExpr = AL.getArgAsExpr(1);
8079-
Expr *NumWGZExpr = AL.getArgAsExpr(2);
8080-
if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
8072+
static void handleAMDGPUMinNumWorkGroupsAttr(Sema &S, Decl *D,
8073+
const ParsedAttr &AL) {
8074+
uint32_t MinNumWGX = 0;
8075+
uint32_t MinNumWGY = 0;
8076+
uint32_t MinNumWGZ = 0;
8077+
Expr *MinNumWGXExpr = AL.getArgAsExpr(0);
8078+
Expr *MinNumWGYExpr = AL.getArgAsExpr(1);
8079+
Expr *MinNumWGZExpr = AL.getArgAsExpr(2);
8080+
if (!checkUInt32Argument(S, AL, MinNumWGXExpr, MinNumWGX))
8081+
return;
8082+
if (!checkUInt32Argument(S, AL, MinNumWGYExpr, MinNumWGY))
8083+
return;
8084+
if (!checkUInt32Argument(S, AL, MinNumWGZExpr, MinNumWGZ))
8085+
return;
8086+
8087+
D->addAttr(::new (S.Context) AMDGPUMinNumWorkGroupsAttr(
8088+
S.Context, AL, MinNumWGX, MinNumWGY, MinNumWGZ));
8089+
}
8090+
8091+
static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
8092+
const ParsedAttr &AL) {
8093+
uint32_t MaxNumWGX = 0;
8094+
uint32_t MaxNumWGY = 0;
8095+
uint32_t MaxNumWGZ = 0;
8096+
Expr *MaxNumWGXExpr = AL.getArgAsExpr(0);
8097+
Expr *MaxNumWGYExpr = AL.getArgAsExpr(1);
8098+
Expr *MaxNumWGZExpr = AL.getArgAsExpr(2);
8099+
if (!checkUInt32Argument(S, AL, MaxNumWGXExpr, MaxNumWGX))
80818100
return;
8082-
if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
8101+
if (!checkUInt32Argument(S, AL, MaxNumWGYExpr, MaxNumWGY))
80838102
return;
8084-
if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
8103+
if (!checkUInt32Argument(S, AL, MaxNumWGZExpr, MaxNumWGZ))
80858104
return;
80868105

8087-
D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
8088-
NumWGY, NumWGZ));
8106+
D->addAttr(::new (S.Context) AMDGPUMaxNumWorkGroupsAttr(
8107+
S.Context, AL, MaxNumWGX, MaxNumWGY, MaxNumWGZ));
80898108
}
80908109

80918110
static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
@@ -9192,8 +9211,11 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
91929211
case ParsedAttr::AT_AMDGPUNumVGPR:
91939212
handleAMDGPUNumVGPRAttr(S, D, AL);
91949213
break;
9195-
case ParsedAttr::AT_AMDGPUNumWorkGroups:
9196-
handleAMDGPUNumWorkGroupsAttr(S, D, AL);
9214+
case ParsedAttr::AT_AMDGPUMinNumWorkGroups:
9215+
handleAMDGPUMinNumWorkGroupsAttr(S, D, AL);
9216+
break;
9217+
case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
9218+
handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
91979219
break;
91989220
case ParsedAttr::AT_AVRSignal:
91999221
handleAVRSignalAttr(S, D, AL);

clang/test/Misc/pragma-attribute-supported-attributes-list.test

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44

55
// CHECK: #pragma clang attribute supports the following attributes:
66
// CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
7+
// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
8+
// CHECK-NEXT: AMDGPUMinNumWorkGroups (SubjectMatchRule_function)
79
// CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
810
// CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
9-
// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
1011
// CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
1112
// CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
1213
// CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -494,13 +494,39 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
494494

495495
Kern[".max_flat_workgroup_size"] =
496496
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
497-
unsigned NumWGX = MFI.getNumWorkGroupsX();
498-
unsigned NumWGY = MFI.getNumWorkGroupsY();
499-
unsigned NumWGZ = MFI.getNumWorkGroupsZ();
500-
if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
501-
Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
502-
Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
503-
Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
497+
498+
unsigned MinNumWGX = MFI.getMinNumWorkGroupsX();
499+
unsigned MinNumWGY = MFI.getMinNumWorkGroupsY();
500+
unsigned MinNumWGZ = MFI.getMinNumWorkGroupsZ();
501+
502+
unsigned MaxNumWGX = MFI.getMaxNumWorkGroupsX();
503+
unsigned MaxNumWGY = MFI.getMaxNumWorkGroupsY();
504+
unsigned MaxNumWGZ = MFI.getMaxNumWorkGroupsZ();
505+
506+
bool IsMinNumWGValid = false;
507+
bool IsMaxNumWGValid = false;
508+
if (MinNumWGX != 0 && MinNumWGY != 0 && MinNumWGZ != 0)
509+
IsMinNumWGValid = true;
510+
if (MaxNumWGX != 0 && MaxNumWGY != 0 && MaxNumWGZ != 0)
511+
IsMaxNumWGValid = true;
512+
if (IsMinNumWGValid && IsMaxNumWGValid) {
513+
if (MaxNumWGX < MinNumWGX || MaxNumWGY < MinNumWGY ||
514+
MaxNumWGZ < MinNumWGZ) {
515+
IsMinNumWGValid = false;
516+
IsMaxNumWGValid = false;
517+
}
518+
}
519+
520+
if (IsMinNumWGValid) {
521+
Kern[".min_num_work_groups_x"] = Kern.getDocument()->getNode(MinNumWGX);
522+
Kern[".min_num_work_groups_y"] = Kern.getDocument()->getNode(MinNumWGY);
523+
Kern[".min_num_work_groups_z"] = Kern.getDocument()->getNode(MinNumWGZ);
524+
}
525+
526+
if (IsMaxNumWGValid) {
527+
Kern[".max_num_work_groups_x"] = Kern.getDocument()->getNode(MaxNumWGX);
528+
Kern[".max_num_work_groups_y"] = Kern.getDocument()->getNode(MaxNumWGY);
529+
Kern[".max_num_work_groups_z"] = Kern.getDocument()->getNode(MaxNumWGZ);
504530
}
505531
Kern[".sgpr_spill_count"] =
506532
Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,6 +1110,10 @@ unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
11101110
}
11111111

11121112
SmallVector<unsigned>
1113-
AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
1114-
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
1113+
AMDGPUSubtarget::getMinNumWorkGroups(const Function &F) const {
1114+
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-min-num-work-groups", 3);
1115+
}
1116+
SmallVector<unsigned>
1117+
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
1118+
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-work-groups", 3);
11151119
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,11 @@ class AMDGPUSubtarget {
288288
/// 2) dimension.
289289
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
290290

291-
/// Return the number of work groups for the function.
292-
SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
291+
/// Return the min number of work groups for the function.
292+
SmallVector<unsigned> getMinNumWorkGroups(const Function &F) const;
293+
294+
/// Return the max number of work groups for the function.
295+
SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
293296

294297
/// Return true if only a single workitem can be active in a wave.
295298
bool isSingleLaneExecution(const Function &Kernel) const;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
4646
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
4747
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
4848
WavesPerEU = ST.getWavesPerEU(F);
49-
NumWorkGroups = ST.getNumWorkGroups(F);
50-
assert(NumWorkGroups.size() == 3);
49+
MinNumWorkGroups = ST.getMinNumWorkGroups(F);
50+
assert(MinNumWorkGroups.size() == 3);
51+
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
52+
assert(MaxNumWorkGroups.size() == 3);
5153

5254
Occupancy = ST.computeOccupancy(F, getLDSSize());
5355
CallingConv::ID CC = F.getCallingConv();

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
427427
const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
428428

429429
// Default/requested number of work groups for the function.
430-
SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
430+
SmallVector<unsigned> MinNumWorkGroups = {0, 0, 0};
431+
SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
431432

432433
private:
433434
unsigned NumUserSGPRs = 0;
@@ -1077,11 +1078,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
10771078
bool usesAGPRs(const MachineFunction &MF) const;
10781079

10791080
/// \returns Default/requested number of work groups for this function.
1080-
SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
1081+
SmallVector<unsigned> getMinNumWorkGroups() const { return MinNumWorkGroups; }
1082+
SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
10811083

1082-
unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
1083-
unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
1084-
unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
1084+
unsigned getMinNumWorkGroupsX() const { return MinNumWorkGroups[0]; }
1085+
unsigned getMinNumWorkGroupsY() const { return MinNumWorkGroups[1]; }
1086+
unsigned getMinNumWorkGroupsZ() const { return MinNumWorkGroups[2]; }
1087+
1088+
unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
1089+
unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
1090+
unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
10851091
};
10861092

10871093
} // end namespace llvm

0 commit comments

Comments
 (0)