Skip to content

Commit 8ed74e1

Browse files
author
Jun Wang
committed
[AMDGPU] Adding the amdgpu-num-work-groups function attribute
A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information.
1 parent 1522333 commit 8ed74e1

File tree

13 files changed

+232
-0
lines changed

13 files changed

+232
-0
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr {
20312031
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
20322032
}
20332033

2034+
def AMDGPUNumWorkGroups : InheritableAttr {
2035+
let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
2036+
let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
2037+
let Documentation = [AMDGPUNumWorkGroupsDocs];
2038+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
2039+
}
2040+
20342041
def AMDGPUKernelCall : DeclOrTypeAttr {
20352042
let Spellings = [Clang<"amdgpu_kernel">];
20362043
let Documentation = [Undocumented];

clang/include/clang/Basic/AttrDocs.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2702,6 +2702,30 @@ An error will be given if:
27022702
}];
27032703
}
27042704

2705+
def AMDGPUNumWorkGroupsDocs : Documentation {
2706+
let Category = DocCatAMDGPUAttributes;
2707+
let Content = [{
2708+
The number of work groups specifies the number of work groups when the kernel
2709+
is dispatched.
2710+
2711+
Clang supports the
2712+
``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
2713+
AMDGPU target. This attribute may be attached to a kernel function definition
2714+
and is an optimization hint.
2715+
2716+
``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
2717+
Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
2718+
2719+
If specified, the AMDGPU target backend might be able to produce better machine
2720+
code.
2721+
2722+
An error will be given if:
2723+
- Specified values violate subtarget specifications;
2724+
- Specified values are not compatible with values provided through other
2725+
attributes.
2726+
}];
2727+
}
2728+
27052729
def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
27062730
let Content = [{
27072731
Clang supports several different calling conventions, depending on the target

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
356356
if (NumVGPR != 0)
357357
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
358358
}
359+
360+
if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
361+
uint32_t X = Attr->getNumWorkGroupsX();
362+
uint32_t Y = Attr->getNumWorkGroupsY();
363+
uint32_t Z = Attr->getNumWorkGroupsZ();
364+
365+
if (X != 0 && Y != 0 && Z != 0) {
366+
std::string AttrVal = llvm::utostr(X) + std::string(", ") +
367+
llvm::utostr(Y) + std::string(", ") +
368+
llvm::utostr(Z);
369+
F->addFnAttr("amdgpu-num-work-groups", AttrVal);
370+
}
371+
}
359372
}
360373

361374
/// Emits control constants used to change per-architecture behaviour in the

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8072,6 +8072,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
80728072
D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
80738073
}
80748074

8075+
static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
8076+
const ParsedAttr &AL) {
8077+
uint32_t NumWGX = 0;
8078+
uint32_t NumWGY = 0;
8079+
uint32_t NumWGZ = 0;
8080+
Expr *NumWGXExpr = AL.getArgAsExpr(0);
8081+
Expr *NumWGYExpr = AL.getArgAsExpr(1);
8082+
Expr *NumWGZExpr = AL.getArgAsExpr(2);
8083+
if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
8084+
return;
8085+
if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
8086+
return;
8087+
if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
8088+
return;
8089+
8090+
D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
8091+
NumWGY, NumWGZ));
8092+
}
8093+
80758094
static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
80768095
const ParsedAttr &AL) {
80778096
// If we try to apply it to a function pointer, don't warn, but don't
@@ -9170,6 +9189,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
91709189
case ParsedAttr::AT_AMDGPUNumVGPR:
91719190
handleAMDGPUNumVGPRAttr(S, D, AL);
91729191
break;
9192+
case ParsedAttr::AT_AMDGPUNumWorkGroups:
9193+
handleAMDGPUNumWorkGroupsAttr(S, D, AL);
9194+
break;
91739195
case ParsedAttr::AT_AVRSignal:
91749196
handleAVRSignalAttr(S, D, AL);
91759197
break;

clang/test/Misc/pragma-attribute-supported-attributes-list.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
// CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
77
// CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
88
// CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
9+
// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
910
// CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
1011
// CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
1112
// CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
494494

495495
Kern[".max_flat_workgroup_size"] =
496496
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
497+
unsigned NumWGX = MFI.getNumWorkGroupsX();
498+
unsigned NumWGY = MFI.getNumWorkGroupsY();
499+
unsigned NumWGZ = MFI.getNumWorkGroupsZ();
500+
if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
501+
Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
502+
Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
503+
Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
504+
}
497505
Kern[".sgpr_spill_count"] =
498506
Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
499507
Kern[".vgpr_spill_count"] =

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
11081108
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
11091109
return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
11101110
}
1111+
1112+
SmallVector<unsigned>
1113+
AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
1114+
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
1115+
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
288288
/// 2) dimension.
289289
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
290290

291+
/// Return the number of work groups for the function.
292+
SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
293+
291294
/// Return true if only a single workitem can be active in a wave.
292295
bool isSingleLaneExecution(const Function &Kernel) const;
293296

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
4646
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
4747
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
4848
WavesPerEU = ST.getWavesPerEU(F);
49+
NumWorkGroups = ST.getNumWorkGroups(F);
50+
assert(NumWorkGroups.size() == 3);
4951

5052
Occupancy = ST.computeOccupancy(F, getLDSSize());
5153
CallingConv::ID CC = F.getCallingConv();

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
426426

427427
const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
428428

429+
// Default/requested number of work groups for the function.
430+
SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
431+
429432
private:
430433
unsigned NumUserSGPRs = 0;
431434
unsigned NumSystemSGPRs = 0;
@@ -1095,6 +1098,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
10951098

10961099
// \returns true if a function needs or may need AGPRs.
10971100
bool usesAGPRs(const MachineFunction &MF) const;
1101+
1102+
/// \returns Default/requested number of work groups for this function.
1103+
SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
1104+
1105+
unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
1106+
unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
1107+
unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
10981108
};
10991109

11001110
} // end namespace llvm

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "AMDGPUAsmUtils.h"
1212
#include "AMDKernelCodeT.h"
1313
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14+
#include "llvm/ADT/StringExtras.h"
1415
#include "llvm/BinaryFormat/ELF.h"
1516
#include "llvm/IR/Attributes.h"
1617
#include "llvm/IR/Constants.h"
@@ -1261,6 +1262,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
12611262
return Ints;
12621263
}
12631264

1265+
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
1266+
unsigned Size) {
1267+
assert(Size > 2);
1268+
SmallVector<unsigned> Default(Size, 0);
1269+
1270+
Attribute A = F.getFnAttribute(Name);
1271+
if (!A.isStringAttribute())
1272+
return Default;
1273+
1274+
SmallVector<unsigned> Vals(Size, 0);
1275+
1276+
LLVMContext &Ctx = F.getContext();
1277+
1278+
StringRef S = A.getValueAsString();
1279+
unsigned i = 0;
1280+
for (; !S.empty() && i < Size; i++) {
1281+
std::pair<StringRef, StringRef> Strs = S.split(',');
1282+
unsigned IntVal;
1283+
if (Strs.first.trim().getAsInteger(0, IntVal)) {
1284+
Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
1285+
Name);
1286+
return Default;
1287+
}
1288+
Vals[i] = IntVal;
1289+
S = Strs.second;
1290+
}
1291+
1292+
if (!S.empty() || i < Size) {
1293+
Ctx.emitError("attribute " + Name +
1294+
" has incorrect number of integers; expected " +
1295+
llvm::utostr(Size));
1296+
return Default;
1297+
}
1298+
return Vals;
1299+
}
1300+
1301+
unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
1302+
unsigned Default) {
1303+
Attribute A = F.getFnAttribute(Name);
1304+
if (!A.isStringAttribute())
1305+
return Default;
1306+
1307+
LLVMContext &Ctx = F.getContext();
1308+
unsigned IntVal = Default;
1309+
StringRef Str = A.getValueAsString();
1310+
if (Str.trim().getAsInteger(0, IntVal)) {
1311+
Ctx.emitError("can't parse integer attribute " + Name);
1312+
return Default;
1313+
}
1314+
return IntVal;
1315+
}
1316+
12641317
unsigned getVmcntBitMask(const IsaVersion &Version) {
12651318
return (1 << (getVmcntBitWidthLo(Version.Major) +
12661319
getVmcntBitWidthHi(Version.Major))) -

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
818818
/// to integer.
819819
int getIntegerAttribute(const Function &F, StringRef Name, int Default);
820820

821+
/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
822+
///
823+
/// \returns \p Default if attribute is not present.
824+
///
825+
/// \returns \p Default and emits error if requested value cannot be converted
826+
/// to integer.
827+
unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
828+
unsigned Default);
829+
821830
/// \returns A pair of integer values requested using \p F's \p Name attribute
822831
/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
823832
/// is false).
@@ -832,6 +841,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
832841
std::pair<unsigned, unsigned> Default,
833842
bool OnlyFirstRequired = false);
834843

844+
/// \returns Generate a vector of integer values requested using \p F's \p Name
845+
/// attribute.
846+
///
847+
/// \returns true if exactly Size (>2) number of integers are found in the
848+
/// attribute.
849+
///
850+
/// \returns false if any error occurs.
851+
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
852+
unsigned Size);
853+
835854
/// Represents the counter values to wait for in an s_waitcnt instruction.
836855
///
837856
/// Large values (including the maximum possible integer) can be used to
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
2+
3+
; Attribute not specified.
4+
; CHECK-LABEL: {{^}}empty_no_attribute:
5+
define amdgpu_kernel void @empty_no_attribute() {
6+
entry:
7+
ret void
8+
}
9+
10+
; Ignore if number of work groups for x dimension is 0.
11+
; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
12+
define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
13+
entry:
14+
ret void
15+
}
16+
attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
17+
18+
; Ignore if number of work groups for y dimension is 0.
19+
; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
20+
define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
21+
entry:
22+
ret void
23+
}
24+
attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
25+
26+
; Ignore if number of work groups for z dimension is 0.
27+
; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
28+
define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
29+
entry:
30+
ret void
31+
}
32+
attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
33+
34+
; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
35+
define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
36+
entry:
37+
ret void
38+
}
39+
attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
40+
41+
; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
42+
define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
43+
entry:
44+
ret void
45+
}
46+
attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
47+
48+
49+
; CHECK: .amdgpu_metadata
50+
; CHECK: .name: empty_no_attribute
51+
; CHECK-NEXT: .private_segment_fixed_size: 0
52+
; CHECK: .name: empty_num_work_groups_x0
53+
; CHECK-NEXT: .private_segment_fixed_size: 0
54+
; CHECK: .name: empty_num_work_groups_y0
55+
; CHECK-NEXT: .private_segment_fixed_size: 0
56+
; CHECK: .name: empty_num_work_groups_z0
57+
; CHECK-NEXT: .private_segment_fixed_size: 0
58+
; CHECK: .name: empty_num_work_groups_1_2_3
59+
; CHECK-NEXT: .num_work_groups_x: 1
60+
; CHECK-NEXT: .num_work_groups_y: 2
61+
; CHECK-NEXT: .num_work_groups_z: 3
62+
; CHECK: .name: empty_num_work_groups_1024_1024_1024
63+
; CHECK-NEXT: .num_work_groups_x: 1024
64+
; CHECK-NEXT: .num_work_groups_y: 1024
65+
; CHECK-NEXT: .num_work_groups_z: 1024

0 commit comments

Comments
 (0)