Skip to content

Commit 5c088a5

Browse files
author
Jun Wang
committed
[AMDGPU] Adding the amdgpu-num-work-groups function attribute
A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information.
1 parent 09fc333 commit 5c088a5

File tree

13 files changed

+232
-0
lines changed

13 files changed

+232
-0
lines changed

clang/include/clang/Basic/Attr.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr {
20312031
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
20322032
}
20332033

2034+
def AMDGPUNumWorkGroups : InheritableAttr {
2035+
let Spellings = [Clang<"amdgpu_num_work_groups", 0>];
2036+
let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">];
2037+
let Documentation = [AMDGPUNumWorkGroupsDocs];
2038+
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
2039+
}
2040+
20342041
def AMDGPUKernelCall : DeclOrTypeAttr {
20352042
let Spellings = [Clang<"amdgpu_kernel">];
20362043
let Documentation = [Undocumented];

clang/include/clang/Basic/AttrDocs.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2705,6 +2705,30 @@ An error will be given if:
27052705
}];
27062706
}
27072707

2708+
def AMDGPUNumWorkGroupsDocs : Documentation {
2709+
let Category = DocCatAMDGPUAttributes;
2710+
let Content = [{
2711+
The number of work groups specifies the number of work groups when the kernel
2712+
is dispatched.
2713+
2714+
Clang supports the
2715+
``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the
2716+
AMDGPU target. This attribute may be attached to a kernel function definition
2717+
and is an optimization hint.
2718+
2719+
``<x>`` parameter specifies the maximum number of work groups in the x dimentsion.
2720+
Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
2721+
2722+
If specified, the AMDGPU target backend might be able to produce better machine
2723+
code.
2724+
2725+
An error will be given if:
2726+
- Specified values violate subtarget specifications;
2727+
- Specified values are not compatible with values provided through other
2728+
attributes.
2729+
}];
2730+
}
2731+
27082732
def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
27092733
let Content = [{
27102734
Clang supports several different calling conventions, depending on the target

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
356356
if (NumVGPR != 0)
357357
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
358358
}
359+
360+
if (const auto *Attr = FD->getAttr<AMDGPUNumWorkGroupsAttr>()) {
361+
uint32_t X = Attr->getNumWorkGroupsX();
362+
uint32_t Y = Attr->getNumWorkGroupsY();
363+
uint32_t Z = Attr->getNumWorkGroupsZ();
364+
365+
if (X != 0 && Y != 0 && Z != 0) {
366+
std::string AttrVal = llvm::utostr(X) + std::string(", ") +
367+
llvm::utostr(Y) + std::string(", ") +
368+
llvm::utostr(Z);
369+
F->addFnAttr("amdgpu-num-work-groups", AttrVal);
370+
}
371+
}
359372
}
360373

361374
/// Emits control constants used to change per-architecture behaviour in the

clang/lib/Sema/SemaDeclAttr.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8069,6 +8069,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
80698069
D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
80708070
}
80718071

8072+
static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D,
8073+
const ParsedAttr &AL) {
8074+
uint32_t NumWGX = 0;
8075+
uint32_t NumWGY = 0;
8076+
uint32_t NumWGZ = 0;
8077+
Expr *NumWGXExpr = AL.getArgAsExpr(0);
8078+
Expr *NumWGYExpr = AL.getArgAsExpr(1);
8079+
Expr *NumWGZExpr = AL.getArgAsExpr(2);
8080+
if (!checkUInt32Argument(S, AL, NumWGXExpr, NumWGX))
8081+
return;
8082+
if (!checkUInt32Argument(S, AL, NumWGYExpr, NumWGY))
8083+
return;
8084+
if (!checkUInt32Argument(S, AL, NumWGZExpr, NumWGZ))
8085+
return;
8086+
8087+
D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWGX,
8088+
NumWGY, NumWGZ));
8089+
}
8090+
80728091
static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
80738092
const ParsedAttr &AL) {
80748093
// If we try to apply it to a function pointer, don't warn, but don't
@@ -9173,6 +9192,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
91739192
case ParsedAttr::AT_AMDGPUNumVGPR:
91749193
handleAMDGPUNumVGPRAttr(S, D, AL);
91759194
break;
9195+
case ParsedAttr::AT_AMDGPUNumWorkGroups:
9196+
handleAMDGPUNumWorkGroupsAttr(S, D, AL);
9197+
break;
91769198
case ParsedAttr::AT_AVRSignal:
91779199
handleAVRSignalAttr(S, D, AL);
91789200
break;

clang/test/Misc/pragma-attribute-supported-attributes-list.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
// CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
77
// CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
88
// CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
9+
// CHECK-NEXT: AMDGPUNumWorkGroups (SubjectMatchRule_function)
910
// CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
1011
// CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
1112
// CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
494494

495495
Kern[".max_flat_workgroup_size"] =
496496
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
497+
unsigned NumWGX = MFI.getNumWorkGroupsX();
498+
unsigned NumWGY = MFI.getNumWorkGroupsY();
499+
unsigned NumWGZ = MFI.getNumWorkGroupsZ();
500+
if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
501+
Kern[".num_work_groups_x"] = Kern.getDocument()->getNode(NumWGX);
502+
Kern[".num_work_groups_y"] = Kern.getDocument()->getNode(NumWGY);
503+
Kern[".num_work_groups_z"] = Kern.getDocument()->getNode(NumWGZ);
504+
}
497505
Kern[".sgpr_spill_count"] =
498506
Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
499507
Kern[".vgpr_spill_count"] =

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
11081108
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
11091109
return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
11101110
}
1111+
1112+
SmallVector<unsigned>
1113+
AMDGPUSubtarget::getNumWorkGroups(const Function &F) const {
1114+
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-num-work-groups", 3);
1115+
}

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,9 @@ class AMDGPUSubtarget {
288288
/// 2) dimension.
289289
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
290290

291+
/// Return the number of work groups for the function.
292+
SmallVector<unsigned> getNumWorkGroups(const Function &F) const;
293+
291294
/// Return true if only a single workitem can be active in a wave.
292295
bool isSingleLaneExecution(const Function &Kernel) const;
293296

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
4646
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
4747
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
4848
WavesPerEU = ST.getWavesPerEU(F);
49+
NumWorkGroups = ST.getNumWorkGroups(F);
50+
assert(NumWorkGroups.size() == 3);
4951

5052
Occupancy = ST.computeOccupancy(F, getLDSSize());
5153
CallingConv::ID CC = F.getCallingConv();

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
426426

427427
const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
428428

429+
// Default/requested number of work groups for the function.
430+
SmallVector<unsigned> NumWorkGroups = {0, 0, 0};
431+
429432
private:
430433
unsigned NumUserSGPRs = 0;
431434
unsigned NumSystemSGPRs = 0;
@@ -1072,6 +1075,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
10721075

10731076
// \returns true if a function needs or may need AGPRs.
10741077
bool usesAGPRs(const MachineFunction &MF) const;
1078+
1079+
/// \returns Default/requested number of work groups for this function.
1080+
SmallVector<unsigned> getNumWorkGroups() const { return NumWorkGroups; }
1081+
1082+
unsigned getNumWorkGroupsX() const { return NumWorkGroups[0]; }
1083+
unsigned getNumWorkGroupsY() const { return NumWorkGroups[1]; }
1084+
unsigned getNumWorkGroupsZ() const { return NumWorkGroups[2]; }
10751085
};
10761086

10771087
} // end namespace llvm

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "AMDGPUAsmUtils.h"
1212
#include "AMDKernelCodeT.h"
1313
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
14+
#include "llvm/ADT/StringExtras.h"
1415
#include "llvm/BinaryFormat/ELF.h"
1516
#include "llvm/IR/Attributes.h"
1617
#include "llvm/IR/Constants.h"
@@ -1253,6 +1254,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
12531254
return Ints;
12541255
}
12551256

1257+
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
1258+
unsigned Size) {
1259+
assert(Size > 2);
1260+
SmallVector<unsigned> Default(Size, 0);
1261+
1262+
Attribute A = F.getFnAttribute(Name);
1263+
if (!A.isStringAttribute())
1264+
return Default;
1265+
1266+
SmallVector<unsigned> Vals(Size, 0);
1267+
1268+
LLVMContext &Ctx = F.getContext();
1269+
1270+
StringRef S = A.getValueAsString();
1271+
unsigned i = 0;
1272+
for (; !S.empty() && i < Size; i++) {
1273+
std::pair<StringRef, StringRef> Strs = S.split(',');
1274+
unsigned IntVal;
1275+
if (Strs.first.trim().getAsInteger(0, IntVal)) {
1276+
Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
1277+
Name);
1278+
return Default;
1279+
}
1280+
Vals[i] = IntVal;
1281+
S = Strs.second;
1282+
}
1283+
1284+
if (!S.empty() || i < Size) {
1285+
Ctx.emitError("attribute " + Name +
1286+
" has incorrect number of integers; expected " +
1287+
llvm::utostr(Size));
1288+
return Default;
1289+
}
1290+
return Vals;
1291+
}
1292+
1293+
unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
1294+
unsigned Default) {
1295+
Attribute A = F.getFnAttribute(Name);
1296+
if (!A.isStringAttribute())
1297+
return Default;
1298+
1299+
LLVMContext &Ctx = F.getContext();
1300+
unsigned IntVal = Default;
1301+
StringRef Str = A.getValueAsString();
1302+
if (Str.trim().getAsInteger(0, IntVal)) {
1303+
Ctx.emitError("can't parse integer attribute " + Name);
1304+
return Default;
1305+
}
1306+
return IntVal;
1307+
}
1308+
12561309
unsigned getVmcntBitMask(const IsaVersion &Version) {
12571310
return (1 << (getVmcntBitWidthLo(Version.Major) +
12581311
getVmcntBitWidthHi(Version.Major))) -

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT);
814814
/// to integer.
815815
int getIntegerAttribute(const Function &F, StringRef Name, int Default);
816816

817+
/// \returns Unsigned Integer value requested using \p F's \p Name attribute.
818+
///
819+
/// \returns \p Default if attribute is not present.
820+
///
821+
/// \returns \p Default and emits error if requested value cannot be converted
822+
/// to integer.
823+
unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name,
824+
unsigned Default);
825+
817826
/// \returns A pair of integer values requested using \p F's \p Name attribute
818827
/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
819828
/// is false).
@@ -828,6 +837,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
828837
std::pair<unsigned, unsigned> Default,
829838
bool OnlyFirstRequired = false);
830839

840+
/// \returns Generate a vector of integer values requested using \p F's \p Name
841+
/// attribute.
842+
///
843+
/// \returns true if exactly Size (>2) number of integers are found in the
844+
/// attribute.
845+
///
846+
/// \returns false if any error occurs.
847+
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
848+
unsigned Size);
849+
831850
/// Represents the counter values to wait for in an s_waitcnt instruction.
832851
///
833852
/// Large values (including the maximum possible integer) can be used to
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
2+
3+
; Attribute not specified.
4+
; CHECK-LABEL: {{^}}empty_no_attribute:
5+
define amdgpu_kernel void @empty_no_attribute() {
6+
entry:
7+
ret void
8+
}
9+
10+
; Ignore if number of work groups for x dimension is 0.
11+
; CHECK-LABEL: {{^}}empty_num_work_groups_x0:
12+
define amdgpu_kernel void @empty_num_work_groups_x0() #0 {
13+
entry:
14+
ret void
15+
}
16+
attributes #0 = {"amdgpu-num-work-groups"="0,2,3"}
17+
18+
; Ignore if number of work groups for y dimension is 0.
19+
; CHECK-LABEL: {{^}}empty_num_work_groups_y0:
20+
define amdgpu_kernel void @empty_num_work_groups_y0() #1 {
21+
entry:
22+
ret void
23+
}
24+
attributes #1 = {"amdgpu-num-work-groups"="1,0,3"}
25+
26+
; Ignore if number of work groups for z dimension is 0.
27+
; CHECK-LABEL: {{^}}empty_num_work_groups_z0:
28+
define amdgpu_kernel void @empty_num_work_groups_z0() #2 {
29+
entry:
30+
ret void
31+
}
32+
attributes #2 = {"amdgpu-num-work-groups"="1,2,0"}
33+
34+
; CHECK-LABEL: {{^}}empty_num_work_groups_1_2_3:
35+
define amdgpu_kernel void @empty_num_work_groups_1_2_3() #3 {
36+
entry:
37+
ret void
38+
}
39+
attributes #3 = {"amdgpu-num-work-groups"="1,2,3"}
40+
41+
; CHECK-LABEL: {{^}}empty_num_work_groups_1024_1024_1024:
42+
define amdgpu_kernel void @empty_num_work_groups_1024_1024_1024() #4 {
43+
entry:
44+
ret void
45+
}
46+
attributes #4 = {"amdgpu-num-work-groups"="1024,1024,1024"}
47+
48+
49+
; CHECK: .amdgpu_metadata
50+
; CHECK: .name: empty_no_attribute
51+
; CHECK-NEXT: .private_segment_fixed_size: 0
52+
; CHECK: .name: empty_num_work_groups_x0
53+
; CHECK-NEXT: .private_segment_fixed_size: 0
54+
; CHECK: .name: empty_num_work_groups_y0
55+
; CHECK-NEXT: .private_segment_fixed_size: 0
56+
; CHECK: .name: empty_num_work_groups_z0
57+
; CHECK-NEXT: .private_segment_fixed_size: 0
58+
; CHECK: .name: empty_num_work_groups_1_2_3
59+
; CHECK-NEXT: .num_work_groups_x: 1
60+
; CHECK-NEXT: .num_work_groups_y: 2
61+
; CHECK-NEXT: .num_work_groups_z: 3
62+
; CHECK: .name: empty_num_work_groups_1024_1024_1024
63+
; CHECK-NEXT: .num_work_groups_x: 1024
64+
; CHECK-NEXT: .num_work_groups_y: 1024
65+
; CHECK-NEXT: .num_work_groups_z: 1024

0 commit comments

Comments
 (0)