Skip to content

Commit 3c09c1d

Browse files
committed
[AMDGPU][Attributor] Rework calculation of waves per eu
1 parent 35fda65 commit 3c09c1d

29 files changed

+347
-215
lines changed

builtin.diff

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
2+
index 39fef9e4601f..55befac12b46 100644
3+
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
4+
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
5+
@@ -635,5 +635,7 @@ TARGET_BUILTIN(__builtin_amdgcn_bitop3_b16, "ssssIUi", "nc", "bitop3-insts")
6+
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf16_f32, "V2yV2yfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
7+
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16bf16-cvt-sr-insts")
8+
9+
+TARGET_BUILTIN(__builtin_amdgcn_load_mcast_b32, "vi*3i*Iii", "", "gfx950-insts")
10+
+
11+
#undef BUILTIN
12+
#undef TARGET_BUILTIN
13+
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
14+
index ad012d98635f..601319b2c225 100644
15+
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
16+
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
17+
@@ -1083,6 +1083,20 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
18+
19+
return Builder.CreateBitCast(RMW, OrigTy);
20+
}
21+
+ case AMDGPU::BI__builtin_amdgcn_load_mcast_b32: {
22+
+ Intrinsic::ID IID;
23+
+ switch (BuiltinID) {
24+
+ case AMDGPU::BI__builtin_amdgcn_load_mcast_b32:
25+
+ IID = Intrinsic::amdgcn_load_mcast_b32;
26+
+ break;
27+
+ }
28+
+ SmallVector<Value *, 3> Args;
29+
+ auto *SrcType = ConvertType(E->getArg(1)->getType());
30+
+ for (int i = 0, e = E->getNumArgs(); i != e; ++i)
31+
+ Args.push_back(EmitScalarExpr(E->getArg(i)));
32+
+ llvm::Function *F = CGM.getIntrinsic(IID, {SrcType});
33+
+ return Builder.CreateCall(F, {Args});
34+
+ }
35+
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
36+
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
37+
llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
38+
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl
39+
new file mode 100644
40+
index 000000000000..f6b058cb3fb5
41+
--- /dev/null
42+
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx13-load-mcast.cl
43+
@@ -0,0 +1,14 @@
44+
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
45+
+// REQUIRES: amdgpu-registered-target
46+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX13
47+
+
48+
+void test_amdgcn_load_mcast_b32(local void* outptr, global void* inptr, int mask)
49+
+{
50+
+ __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
51+
+}
52+
+
53+
+//void test_amdgcn_load_mcast_local_b32(local void* outptr, local void* inptr, int mask)
54+
+//{
55+
+// __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
56+
+//}
57+
+
58+
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
59+
index a57eb4a6dba4..a765ef8c3a42 100644
60+
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
61+
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
62+
@@ -3499,6 +3499,22 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
63+
[IntrNoMem, IntrSpeculatable]
64+
>;
65+
66+
+class AMDGPULoadMCast:
67+
+ Intrinsic<
68+
+ [],
69+
+ [local_ptr_ty, // laneshared destination pointer
70+
+ llvm_anyptr_ty, // Global, LDS, or DDS pointer to load from
71+
+ llvm_i32_ty, // gfx12+ cachepolicy:
72+
+ // bits [0-2] = th
73+
+ // bits [3-4] = scope
74+
+ llvm_i32_ty], // workgroup broadcast mask (in M0)
75+
+ [IntrArgMemOnly, ReadOnly<ArgIndex<1>>, IntrConvergent, NoCapture<ArgIndex<0>>,
76+
+ NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
77+
+ "", [SDNPMemOperand]
78+
+ >;
79+
+
80+
+def int_amdgcn_load_mcast_b32 : AMDGPULoadMCast;
81+
+
82+
/// Emit an addrspacecast without null pointer checking.
83+
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
84+
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX13
4+
5+
// Calls __builtin_amdgcn_load_mcast_b32 with a local destination pointer, a
// global source pointer, the literal 10 as the third argument and a runtime
// mask as the fourth.
// NOTE(review): the RUN line selects --check-prefix=CHECK-GFX13, but no
// CHECK-GFX13 lines are visible in this file - confirm the autogenerated
// assertions were actually emitted.
void test_amdgcn_load_mcast_b32(local int* outptr, global int* inptr, int mask)
6+
{
7+
__builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
8+
}
9+
10+
//void test_amdgcn_load_mcast_local_b32(local void* outptr, local void* inptr, int mask)
11+
//{
12+
// __builtin_amdgcn_load_mcast_b32(outptr, inptr, 10, mask);
13+
//}
14+

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -903,4 +903,5 @@ void test_set_fpenv(unsigned long env) {
903903

904904
// CHECK-DAG: [[$GRID_RANGE]] = !{i32 1, i32 0}
905905
// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
906-
// CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
906+
// CHECK-SPIRV-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
907+
// CHECK-AMDGCN-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) "amdgpu-waves-per-eu"="4,10" }

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

Lines changed: 82 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,47 +1117,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11171117
Function *F = getAssociatedFunction();
11181118
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
11191119

1120-
auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
1121-
auto [Min, Max] = R;
1122-
ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1123-
IntegerRangeState RangeState(Range);
1124-
clampStateAndIndicateChange(this->getState(), RangeState);
1125-
indicateOptimisticFixpoint();
1126-
};
1127-
1128-
std::pair<unsigned, unsigned> MaxWavesPerEURange{
1129-
1U, InfoCache.getMaxWavesPerEU(*F)};
1130-
11311120
// If the attribute exists, we will honor it if it is not the default.
11321121
if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
1122+
std::pair<unsigned, unsigned> MaxWavesPerEURange{
1123+
1U, InfoCache.getMaxWavesPerEU(*F)};
11331124
if (*Attr != MaxWavesPerEURange) {
1134-
TakeRange(*Attr);
1125+
auto [Min, Max] = *Attr;
1126+
ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
1127+
IntegerRangeState RangeState(Range);
1128+
this->getState() = RangeState;
1129+
indicateOptimisticFixpoint();
11351130
return;
11361131
}
11371132
}
11381133

1139-
// Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1140-
// calculation of waves per EU involves flat work group size, we can't
1141-
// simply use an assumed flat work group size as a start point, because the
1142-
// update of flat work group size is in an inverse direction of waves per
1143-
// EU. However, we can still do something if it is an entry function. Since
1144-
// an entry function is a terminal node, and flat work group size either
1145-
// from attribute or default will be used anyway, we can take that value and
1146-
// calculate the waves per EU based on it. This result can't be updated by
1147-
// any means, but that could still allow us to propagate it.
1148-
if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
1149-
std::pair<unsigned, unsigned> FlatWorkGroupSize;
1150-
if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
1151-
FlatWorkGroupSize = *Attr;
1152-
else
1153-
FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
1154-
TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
1155-
FlatWorkGroupSize));
1156-
}
1134+
if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
1135+
indicatePessimisticFixpoint();
11571136
}
11581137

11591138
ChangeStatus updateImpl(Attributor &A) override {
1160-
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
11611139
ChangeStatus Change = ChangeStatus::UNCHANGED;
11621140

11631141
auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1166,24 +1144,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
11661144
LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
11671145
<< "->" << Func->getName() << '\n');
11681146

1169-
const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
1147+
const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
11701148
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
1171-
const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
1172-
*this, IRPosition::function(*Func), DepClassTy::REQUIRED);
1173-
if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
1174-
!AssumedGroupSize->isValidState())
1149+
if (!CallerAA || !CallerAA->isValidState())
11751150
return false;
11761151

1177-
unsigned Min, Max;
1178-
std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
1179-
*Caller,
1180-
{CallerInfo->getAssumed().getLower().getZExtValue(),
1181-
CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
1182-
{AssumedGroupSize->getAssumed().getLower().getZExtValue(),
1183-
AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
1184-
ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
1185-
IntegerRangeState CallerRangeState(CallerRange);
1186-
Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
1152+
auto Assumed = this->getAssumed();
1153+
unsigned Min = std::max(Assumed.getLower().getZExtValue(),
1154+
CallerAA->getAssumed().getLower().getZExtValue());
1155+
unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
1156+
CallerAA->getAssumed().getUpper().getZExtValue());
1157+
ConstantRange Range(APInt(32, Min), APInt(32, Max));
1158+
IntegerRangeState RangeState(Range);
1159+
this->getState() = RangeState;
1160+
Change |= this->getState() == Assumed ? ChangeStatus::UNCHANGED
1161+
: ChangeStatus::CHANGED;
11871162

11881163
return true;
11891164
};
@@ -1342,6 +1317,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
13421317
}
13431318
}
13441319

1320+
/// Recomputes the "amdgpu-waves-per-eu" attribute for every function in \p M.
///
/// For each function the starting range is the one from its existing
/// "amdgpu-waves-per-eu" attribute, or the subtarget's [min, max] waves per EU
/// when the attribute is absent. That range is then combined with the range
/// the subtarget reports for the function's flat work-group size, clamped to
/// the subtarget limits, and written back as "Min,Max" only when it differs
/// from the subtarget's full range.
///
/// \param M  Module whose functions are visited (the loop does not filter
///           any function out).
/// \param TM Target machine used to look up the per-function GCNSubtarget.
static void checkWavesPerEU(Module &M, TargetMachine &TM) {
1321+
for (Function &F : M) {
1322+
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1323+
1324+
auto FlatWgrpSizeAttr =
1325+
AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
1326+
auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute(
1327+
F, "amdgpu-waves-per-eu", /*OnlyFirstRequired=*/true);
1328+
1329+
unsigned MinWavesPerEU = ST.getMinWavesPerEU();
1330+
unsigned MaxWavesPerEU = ST.getMaxWavesPerEU();
1331+
1332+
// Flat work-group size range used when the function carries no
// "amdgpu-flat-work-group-size" attribute.
// NOTE(review): 1024 is hard-coded here rather than queried from the
// subtarget - confirm this matches the intended default.
unsigned MinFlatWgrpSize = 1U;
1333+
unsigned MaxFlatWgrpSize = 1024U;
1334+
if (FlatWgrpSizeAttr.has_value()) {
1335+
MinFlatWgrpSize = FlatWgrpSizeAttr->first;
1336+
// NOTE(review): ->second is dereferenced without a has_value() check;
// presumably the second value is mandatory when OnlyFirstRequired is not
// passed - confirm against getIntegerPairAttribute.
MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second);
1337+
}
1338+
1339+
// Start with the max range.
1340+
unsigned Min = MinWavesPerEU;
1341+
unsigned Max = MaxWavesPerEU;
1342+
1343+
// If the attribute exists, set them to the value from the attribute.
// The upper bound is optional in the attribute; when it is missing, Max keeps
// the subtarget maximum.
1344+
if (WavesPerEUAttr.has_value()) {
1345+
Min = WavesPerEUAttr->first;
1346+
if (WavesPerEUAttr->second.has_value())
1347+
Max = *(WavesPerEUAttr->second);
1348+
}
1349+
1350+
// Compute the range from flat workgroup size.
1351+
auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1352+
ST.getWavesPerEU(F, std::make_pair(MinFlatWgrpSize, MaxFlatWgrpSize));
1353+
1354+
// For the lower bound, we have to "tighten" it.
1355+
Min = std::max(Min, MinFromFlatWgrpSize);
1356+
// For the upper bound, we have to "extend" it.
1357+
Max = std::max(Max, MaxFromFlatWgrpSize);
1358+
1359+
// Clamp the range to the max range.
// NOTE(review): only Min is raised and only Max is lowered here; nothing
// visible guarantees Min <= Max afterwards - confirm.
1360+
Min = std::max(Min, MinWavesPerEU);
1361+
Max = std::min(Max, MaxWavesPerEU);
1362+
1363+
// Update the attribute if it is not the max.
1364+
if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1365+
SmallString<10> Buffer;
1366+
raw_svector_ostream OS(Buffer);
1367+
OS << Min << ',' << Max;
1368+
F.addFnAttr("amdgpu-waves-per-eu", OS.str());
1369+
}
1370+
}
1371+
}
1372+
13451373
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
13461374
AMDGPUAttributorOptions Options,
13471375
ThinOrFullLTOPhase LTOPhase) {
@@ -1417,8 +1445,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
14171445
}
14181446
}
14191447

1420-
ChangeStatus Change = A.run();
1421-
return Change == ChangeStatus::CHANGED;
1448+
bool Changed = A.run() == ChangeStatus::CHANGED;
1449+
1450+
if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1451+
LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1452+
LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1453+
checkWavesPerEU(M, TM);
1454+
1455+
return Changed;
14221456
}
14231457

14241458
class AMDGPUAttributorLegacy : public ModulePass {

llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -252,13 +252,13 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
252252
}
253253

254254

255-
attributes #0 = { "amdgpu-agpr-alloc"="0" }
255+
attributes #0 = { "amdgpu-no-agpr" }
256256
;.
257-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259-
; CHECK: attributes #[[ATTR2]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260-
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
261-
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
262-
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
263-
; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
257+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
258+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
259+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
260+
; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
261+
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
262+
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" }
263+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-agpr" }
264264
;.

llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,14 @@ define void @call_no_dispatch_id() {
117117
ret void
118118
}
119119
;.
120-
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
121-
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
122-
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
123-
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "uniform-work-group-size"="false" }
124-
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "uniform-work-group-size"="false" }
125-
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "uniform-work-group-size"="false" }
126-
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "uniform-work-group-size"="false" }
127-
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "uniform-work-group-size"="false" }
128-
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" }
129-
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
120+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-workitem-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
121+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
122+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
123+
; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-workgroup-id-x" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
124+
; CHECK: attributes #[[ATTR4]] = { "amdgpu-no-workgroup-id-y" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
125+
; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-workgroup-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
126+
; CHECK: attributes #[[ATTR6]] = { "amdgpu-no-dispatch-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
127+
; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-queue-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
128+
; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
129+
; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-dispatch-id" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
130130
;.

0 commit comments

Comments
 (0)