Skip to content

Commit 67c55b1

Browse files
authored
[AMDGPU] Make max dwords of memory cluster configurable (#119342)
We find it helpful to increase the value for graphics workloads. Make it configurable so we can experiment with different values.
1 parent 4c6e13f commit 67c55b1

11 files changed

+112
-59
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -554,31 +554,38 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
554554
unsigned NumBytes) const {
555555
// If the mem ops (to be clustered) do not have the same base ptr, then they
556556
// should not be clustered
557+
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
557558
if (!BaseOps1.empty() && !BaseOps2.empty()) {
558559
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
559560
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
560561
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
561562
return false;
563+
564+
const SIMachineFunctionInfo *MFI =
565+
FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
566+
MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
562567
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563568
// If only one base op is empty, they do not have the same base ptr
564569
return false;
565570
}
566571

567572
// In order to avoid register pressure, on an average, the number of DWORDS
568-
// loaded together by all clustered mem ops should not exceed 8. This is an
569-
// empirical value based on certain observations and performance related
570-
// experiments.
573+
// loaded together by all clustered mem ops should not exceed
574+
// MaxMemoryClusterDWords. This is an empirical value based on certain
575+
// observations and performance related experiments.
571576
// The good thing about this heuristic is - it avoids clustering of too many
572577
// sub-word loads, and also avoids clustering of wide loads. Below is the
573-
// brief summary of how the heuristic behaves for various `LoadSize`.
578+
// brief summary of how the heuristic behaves for various `LoadSize` when
579+
// MaxMemoryClusterDWords is 8.
580+
//
574581
// (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
575582
// (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
576583
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
577584
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
578585
// (5) LoadSize >= 17: do not cluster
579586
const unsigned LoadSize = NumBytes / ClusterSize;
580-
const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
581-
return NumDWORDs <= 8;
587+
const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
588+
return NumDWords <= MaxMemoryClusterDWords;
582589
}
583590

584591
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class RegScavenger;
3636
class TargetRegisterClass;
3737
class ScheduleHazardRecognizer;
3838

39+
/// Default upper bound on the number of dwords a group of clustered memory
/// operations may cover. It is the fallback used when a function does not
/// carry the "amdgpu-max-memory-cluster-dwords" attribute.
constexpr unsigned DefaultMemoryClusterDWordsLimit = 8;
40+
3941
/// Mark the MMO of a uniform load if there are no potentially clobbering stores
4042
/// on any path from the start of an entry function to this load.
4143
static const MachineMemOperand::Flags MONoClobber =

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
163163
if (!S.empty())
164164
S.consumeInteger(0, HighBitsOf32BitAddress);
165165

166+
MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
167+
"amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
168+
166169
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
167170
// VGPR available at all times. For now, reserve highest available VGPR. After
168171
// RA, shift it to the lowest available unused VGPR if the one exist.
@@ -694,8 +697,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
694697
const llvm::MachineFunction &MF)
695698
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
696699
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
697-
GDSSize(MFI.getGDSSize()),
698-
DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
700+
GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
701+
IsEntryFunction(MFI.isEntryFunction()),
699702
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
700703
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
701704
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
@@ -708,8 +711,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
708711
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
709712
ReturnsVoid(MFI.returnsVoid()),
710713
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
711-
PSInputAddr(MFI.getPSInputAddr()),
712-
PSInputEnable(MFI.getPSInputEnable()),
714+
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
715+
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
713716
Mode(MFI.getMode()) {
714717
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
715718
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
@@ -744,6 +747,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
744747
DynLDSAlign = YamlMFI.DynLDSAlign;
745748
PSInputAddr = YamlMFI.PSInputAddr;
746749
PSInputEnable = YamlMFI.PSInputEnable;
750+
MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
747751
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
748752
Occupancy = YamlMFI.Occupancy;
749753
IsEntryFunction = YamlMFI.IsEntryFunction;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
289289

290290
unsigned PSInputAddr = 0;
291291
unsigned PSInputEnable = 0;
292+
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
292293

293294
SIMode Mode;
294295
std::optional<FrameIndex> ScavengeFI;
@@ -333,6 +334,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
333334
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
334335
YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
335336
YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
337+
YamlIO.mapOptional("maxMemoryClusterDWords", MFI.MaxMemoryClusterDWords,
338+
DefaultMemoryClusterDWordsLimit);
336339
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
337340
YamlIO.mapOptional("highBitsOf32BitAddress",
338341
MFI.HighBitsOf32BitAddress, 0u);
@@ -487,6 +490,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
487490
// Current recorded maximum possible occupancy.
488491
unsigned Occupancy;
489492

493+
// Maximum number of dwords that can be clustered during the instruction
494+
// scheduling stage.
495+
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
496+
490497
mutable std::optional<bool> UsesAGPRs;
491498

492499
MCPhysReg getNextUserSGPR() const;
@@ -1109,6 +1116,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
11091116
limitOccupancy(MF);
11101117
}
11111118

1119+
// Returns this function's limit on the number of dwords that clustered memory
// operations may cover.
unsigned getMaxMemoryClusterDWords() const { return MaxMemoryClusterDWords; }
1120+
11121121
bool mayNeedAGPRs() const {
11131122
return MayNeedAGPRs;
11141123
}

llvm/test/CodeGen/AMDGPU/group-image-instructions.ll

Lines changed: 55 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
55
; GFX11-LABEL: group_image_sample:
66
; GFX11: ; %bb.0: ; %.entry
7-
; GFX11-NEXT: s_mov_b32 s24, exec_lo
7+
; GFX11-NEXT: s_mov_b32 s33, exec_lo
88
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
99
; GFX11-NEXT: s_mov_b32 m0, s4
1010
; GFX11-NEXT: s_getpc_b64 s[4:5]
@@ -21,73 +21,79 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
2121
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
2222
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
2323
; GFX11-NEXT: s_mov_b32 exec_lo, s16
24+
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
25+
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
2426
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
25-
; GFX11-NEXT: s_clause 0x3
27+
; GFX11-NEXT: s_clause 0xf
2628
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10
2729
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20
2830
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30
2931
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40
30-
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
31-
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
32-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
32+
; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50
33+
; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60
34+
; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70
35+
; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80
36+
; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90
37+
; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0
38+
; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0
39+
; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0
40+
; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0
41+
; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0
42+
; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0
43+
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
44+
; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7
3345
; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
34-
; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
3546
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
36-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
37-
; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
38-
; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
39-
; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
40-
; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
41-
; GFX11-NEXT: s_clause 0x3
47+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
48+
; GFX11-NEXT: v_add_f32_e32 v5, s17, v36
49+
; GFX11-NEXT: v_add_f32_e32 v4, s16, v0
50+
; GFX11-NEXT: v_add_f32_e32 v8, s18, v0
51+
; GFX11-NEXT: v_add_f32_e32 v9, s19, v36
52+
; GFX11-NEXT: v_add_f32_e32 v12, s20, v0
53+
; GFX11-NEXT: v_add_f32_e32 v13, s21, v36
54+
; GFX11-NEXT: v_add_f32_e32 v16, s22, v0
55+
; GFX11-NEXT: v_add_f32_e32 v17, s23, v36
56+
; GFX11-NEXT: v_add_f32_e32 v20, s24, v0
57+
; GFX11-NEXT: v_add_f32_e32 v21, s25, v36
58+
; GFX11-NEXT: v_add_f32_e32 v24, s26, v0
59+
; GFX11-NEXT: v_add_f32_e32 v25, s27, v36
60+
; GFX11-NEXT: v_add_f32_e32 v28, s28, v0
61+
; GFX11-NEXT: v_add_f32_e32 v29, s29, v36
62+
; GFX11-NEXT: v_add_f32_e32 v32, s30, v0
63+
; GFX11-NEXT: v_add_f32_e32 v33, s31, v36
64+
; GFX11-NEXT: s_clause 0x7
4265
; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
4366
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
4467
; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
4568
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
46-
; GFX11-NEXT: s_clause 0x3
47-
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50
48-
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
49-
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
50-
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
51-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
52-
; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
53-
; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
54-
; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
55-
; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
56-
; GFX11-NEXT: s_clause 0x3
5769
; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
5870
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
5971
; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
6072
; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
61-
; GFX11-NEXT: s_clause 0x3
62-
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90
63-
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0
64-
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0
65-
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0
66-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
67-
; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
68-
; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
69-
; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
70-
; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
71-
; GFX11-NEXT: s_clause 0x3
72-
; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
73+
; GFX11-NEXT: v_add_f32_e32 v37, s34, v0
74+
; GFX11-NEXT: v_add_f32_e32 v38, s35, v36
75+
; GFX11-NEXT: v_add_f32_e32 v40, s36, v0
76+
; GFX11-NEXT: v_add_f32_e32 v41, s37, v36
77+
; GFX11-NEXT: v_add_f32_e32 v44, s38, v0
78+
; GFX11-NEXT: v_add_f32_e32 v45, s39, v36
79+
; GFX11-NEXT: v_add_f32_e32 v48, s40, v0
80+
; GFX11-NEXT: v_add_f32_e32 v49, s41, v36
81+
; GFX11-NEXT: v_add_f32_e32 v52, s42, v0
82+
; GFX11-NEXT: v_add_f32_e32 v53, s43, v36
83+
; GFX11-NEXT: v_add_f32_e32 v56, s44, v0
84+
; GFX11-NEXT: v_add_f32_e32 v57, s45, v36
85+
; GFX11-NEXT: v_add_f32_e32 v60, s46, v0
86+
; GFX11-NEXT: v_add_f32_e32 v61, s47, v36
87+
; GFX11-NEXT: v_add_f32_e32 v0, s12, v0
88+
; GFX11-NEXT: v_add_f32_e32 v1, s13, v36
89+
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33
90+
; GFX11-NEXT: s_clause 0x7
91+
; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
7392
; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
7493
; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
7594
; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
76-
; GFX11-NEXT: s_clause 0x3
77-
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0
78-
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0
79-
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0
80-
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
81-
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
82-
; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
83-
; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
84-
; GFX11-NEXT: s_clause 0x1
8595
; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
8696
; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
87-
; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
88-
; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
89-
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24
90-
; GFX11-NEXT: s_clause 0x1
9197
; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
9298
; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
9399
; GFX11-NEXT: s_waitcnt vmcnt(14)
@@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
446452
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
447453
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
448454

449-
attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
455+
attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"}
450456
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
451457
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
452458
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }

llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
3030
; CHECK-NEXT: psInputAddr: 0
3131
; CHECK-NEXT: psInputEnable: 0
32+
; CHECK-NEXT: maxMemoryClusterDWords: 8
3233
; CHECK-NEXT: mode:
3334
; CHECK-NEXT: ieee: true
3435
; CHECK-NEXT: dx10-clamp: true
@@ -295,6 +296,7 @@
295296
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
296297
; CHECK-NEXT: psInputAddr: 0
297298
; CHECK-NEXT: psInputEnable: 0
299+
; CHECK-NEXT: maxMemoryClusterDWords: 8
298300
; CHECK-NEXT: mode:
299301
; CHECK-NEXT: ieee: true
300302
; CHECK-NEXT: dx10-clamp: true

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
; AFTER-PEI-NEXT: workItemIDX: { reg: '$vgpr0' }
3030
; AFTER-PEI-NEXT: psInputAddr: 0
3131
; AFTER-PEI-NEXT: psInputEnable: 0
32+
; AFTER-PEI-NEXT: maxMemoryClusterDWords: 8
3233
; AFTER-PEI-NEXT: mode:
3334
; AFTER-PEI-NEXT: ieee: true
3435
; AFTER-PEI-NEXT: dx10-clamp: true

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
3131
; CHECK-NEXT: psInputAddr: 0
3232
; CHECK-NEXT: psInputEnable: 0
33+
; CHECK-NEXT: maxMemoryClusterDWords: 8
3334
; CHECK-NEXT: mode:
3435
; CHECK-NEXT: ieee: true
3536
; CHECK-NEXT: dx10-clamp: true

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
3131
; CHECK-NEXT: psInputAddr: 0
3232
; CHECK-NEXT: psInputEnable: 0
33+
; CHECK-NEXT: maxMemoryClusterDWords: 8
3334
; CHECK-NEXT: mode:
3435
; CHECK-NEXT: ieee: true
3536
; CHECK-NEXT: dx10-clamp: true

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
4040
# FULL-NEXT: psInputAddr: 0
4141
# FULL-NEXT: psInputEnable: 0
42+
# FULL-NEXT: maxMemoryClusterDWords: 8
4243
# FULL-NEXT: mode:
4344
# FULL-NEXT: ieee: true
4445
# FULL-NEXT: dx10-clamp: true
@@ -143,6 +144,7 @@ body: |
143144
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
144145
# FULL-NEXT: psInputAddr: 0
145146
# FULL-NEXT: psInputEnable: 0
147+
# FULL-NEXT: maxMemoryClusterDWords: 8
146148
# FULL-NEXT: mode:
147149
# FULL-NEXT: ieee: true
148150
# FULL-NEXT: dx10-clamp: true
@@ -218,6 +220,7 @@ body: |
218220
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
219221
# FULL-NEXT: psInputAddr: 0
220222
# FULL-NEXT: psInputEnable: 0
223+
# FULL-NEXT: maxMemoryClusterDWords: 8
221224
# FULL-NEXT: mode:
222225
# FULL-NEXT: ieee: true
223226
# FULL-NEXT: dx10-clamp: true
@@ -294,6 +297,7 @@ body: |
294297
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
295298
# FULL-NEXT: psInputAddr: 0
296299
# FULL-NEXT: psInputEnable: 0
300+
# FULL-NEXT: maxMemoryClusterDWords: 8
297301
# FULL-NEXT: mode:
298302
# FULL-NEXT: ieee: true
299303
# FULL-NEXT: dx10-clamp: true
@@ -593,3 +597,15 @@ body: |
593597
%2:sgpr_64 = COPY %1
594598
%1:sgpr_64 = COPY %0
595599
...
600+
601+
---
602+
# ALL-LABEL: name: max_memory_cluster_dwords
603+
# ALL: maxMemoryClusterDWords: 16
604+
name: max_memory_cluster_dwords
605+
machineFunctionInfo:
606+
maxMemoryClusterDWords: 16
607+
body: |
608+
bb.0:
609+
SI_RETURN
610+
611+
...

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
4141
; CHECK-NEXT: psInputAddr: 0
4242
; CHECK-NEXT: psInputEnable: 0
43+
; CHECK-NEXT: maxMemoryClusterDWords: 8
4344
; CHECK-NEXT: mode:
4445
; CHECK-NEXT: ieee: true
4546
; CHECK-NEXT: dx10-clamp: true
@@ -86,6 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
8687
; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
8788
; CHECK-NEXT: psInputAddr: 1
8889
; CHECK-NEXT: psInputEnable: 1
90+
; CHECK-NEXT: maxMemoryClusterDWords: 8
8991
; CHECK-NEXT: mode:
9092
; CHECK-NEXT: ieee: false
9193
; CHECK-NEXT: dx10-clamp: true
@@ -156,6 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
156158
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
157159
; CHECK-NEXT: psInputAddr: 0
158160
; CHECK-NEXT: psInputEnable: 0
161+
; CHECK-NEXT: maxMemoryClusterDWords: 8
159162
; CHECK-NEXT: mode:
160163
; CHECK-NEXT: ieee: true
161164
; CHECK-NEXT: dx10-clamp: true
@@ -208,6 +211,7 @@ define void @function() {
208211
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
209212
; CHECK-NEXT: psInputAddr: 0
210213
; CHECK-NEXT: psInputEnable: 0
214+
; CHECK-NEXT: maxMemoryClusterDWords: 8
211215
; CHECK-NEXT: mode:
212216
; CHECK-NEXT: ieee: true
213217
; CHECK-NEXT: dx10-clamp: true

0 commit comments

Comments
 (0)