Skip to content

[AMDGPU] Make max dwords of memory cluster configurable #119342

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,31 +554,38 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;

const SIMachineFunctionInfo *MFI =
FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
// If only one base op is empty, they do not have the same base ptr
return false;
}

// In order to avoid register pressure, on an average, the number of DWORDS
// loaded together by all clustered mem ops should not exceed 8. This is an
// empirical value based on certain observations and performance related
// experiments.
// loaded together by all clustered mem ops should not exceed
// MaxMemoryClusterDWords. This is an empirical value based on certain
// observations and performance related experiments.
// The good thing about this heuristic is - it avoids clustering of too many
// sub-word loads, and also avoids clustering of wide loads. Below is the
// brief summary of how the heuristic behaves for various `LoadSize`.
// brief summary of how the heuristic behaves for various `LoadSize` when
// MaxMemoryClusterDWords is 8.
//
// (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
// (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
// (5) LoadSize >= 17: do not cluster
const unsigned LoadSize = NumBytes / ClusterSize;
const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
return NumDWORDs <= 8;
const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
return NumDWords <= MaxMemoryClusterDWords;
}

// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class RegScavenger;
class TargetRegisterClass;
class ScheduleHazardRecognizer;

/// Default limit on the total number of dwords that a cluster of memory
/// operations may load together. It is used as the fallback value when the
/// "amdgpu-max-memory-cluster-dwords" function attribute is absent.
///
/// Declared `inline` so that every translation unit including this header
/// refers to the same entity: the constant is bound to a const reference from
/// inline header code (the YAML mapping default), which odr-uses it.
inline constexpr unsigned DefaultMemoryClusterDWordsLimit = 8;

/// Mark the MMO of a uniform load if there are no potentially clobbering stores
/// on any path from the start of an entry function to this load.
static const MachineMemOperand::Flags MONoClobber =
Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (!S.empty())
S.consumeInteger(0, HighBitsOf32BitAddress);

MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
"amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);

// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times. For now, reserve highest available VGPR. After
// RA, shift it to the lowest available unused VGPR if one exists.
Expand Down Expand Up @@ -694,8 +697,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
const llvm::MachineFunction &MF)
: ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
GDSSize(MFI.getGDSSize()),
DynLDSAlign(MFI.getDynLDSAlign()), IsEntryFunction(MFI.isEntryFunction()),
GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
IsEntryFunction(MFI.isEntryFunction()),
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
Expand All @@ -708,8 +711,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
ReturnsVoid(MFI.returnsVoid()),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
PSInputAddr(MFI.getPSInputAddr()),
PSInputEnable(MFI.getPSInputEnable()),
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
Expand Down Expand Up @@ -744,6 +747,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
DynLDSAlign = YamlMFI.DynLDSAlign;
PSInputAddr = YamlMFI.PSInputAddr;
PSInputEnable = YamlMFI.PSInputEnable;
MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
IsEntryFunction = YamlMFI.IsEntryFunction;
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {

unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;

SIMode Mode;
std::optional<FrameIndex> ScavengeFI;
Expand Down Expand Up @@ -333,6 +334,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
YamlIO.mapOptional("maxMemoryClusterDWords", MFI.MaxMemoryClusterDWords,
DefaultMemoryClusterDWordsLimit);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
Expand Down Expand Up @@ -487,6 +490,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// Current recorded maximum possible occupancy.
unsigned Occupancy;

// Maximum number of dwords that can be clustered during the instruction
// scheduling stage.
unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;

mutable std::optional<bool> UsesAGPRs;

MCPhysReg getNextUserSGPR() const;
Expand Down Expand Up @@ -1109,6 +1116,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
limitOccupancy(MF);
}

/// \returns the maximum number of dwords that may be clustered together for
/// this function, as stored in MaxMemoryClusterDWords.
unsigned getMaxMemoryClusterDWords() const {
  return MaxMemoryClusterDWords;
}

bool mayNeedAGPRs() const {
return MayNeedAGPRs;
}
Expand Down
104 changes: 55 additions & 49 deletions llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
; GFX11: ; %bb.0: ; %.entry
; GFX11-NEXT: s_mov_b32 s24, exec_lo
; GFX11-NEXT: s_mov_b32 s33, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s4
; GFX11-NEXT: s_getpc_b64 s[4:5]
Expand All @@ -21,73 +21,79 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s16
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_clause 0xf
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x50
; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0x60
; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0x70
; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0x80
; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0x90
; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xa0
; GFX11-NEXT: s_buffer_load_b64 s[38:39], s[12:15], 0xb0
; GFX11-NEXT: s_buffer_load_b64 s[40:41], s[12:15], 0xc0
; GFX11-NEXT: s_buffer_load_b64 s[42:43], s[12:15], 0xd0
; GFX11-NEXT: s_buffer_load_b64 s[44:45], s[12:15], 0xe0
; GFX11-NEXT: s_buffer_load_b64 s[46:47], s[12:15], 0xf0
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
; GFX11-NEXT: v_interp_p2_f32 v36, v2, v1, v4 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v5, s17, v36
; GFX11-NEXT: v_add_f32_e32 v4, s16, v0
; GFX11-NEXT: v_add_f32_e32 v8, s18, v0
; GFX11-NEXT: v_add_f32_e32 v9, s19, v36
; GFX11-NEXT: v_add_f32_e32 v12, s20, v0
; GFX11-NEXT: v_add_f32_e32 v13, s21, v36
; GFX11-NEXT: v_add_f32_e32 v16, s22, v0
; GFX11-NEXT: v_add_f32_e32 v17, s23, v36
; GFX11-NEXT: v_add_f32_e32 v20, s24, v0
; GFX11-NEXT: v_add_f32_e32 v21, s25, v36
; GFX11-NEXT: v_add_f32_e32 v24, s26, v0
; GFX11-NEXT: v_add_f32_e32 v25, s27, v36
; GFX11-NEXT: v_add_f32_e32 v28, s28, v0
; GFX11-NEXT: v_add_f32_e32 v29, s29, v36
; GFX11-NEXT: v_add_f32_e32 v32, s30, v0
; GFX11-NEXT: v_add_f32_e32 v33, s31, v36
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: v_add_f32_e32 v37, s34, v0
; GFX11-NEXT: v_add_f32_e32 v38, s35, v36
; GFX11-NEXT: v_add_f32_e32 v40, s36, v0
; GFX11-NEXT: v_add_f32_e32 v41, s37, v36
; GFX11-NEXT: v_add_f32_e32 v44, s38, v0
; GFX11-NEXT: v_add_f32_e32 v45, s39, v36
; GFX11-NEXT: v_add_f32_e32 v48, s40, v0
; GFX11-NEXT: v_add_f32_e32 v49, s41, v36
; GFX11-NEXT: v_add_f32_e32 v52, s42, v0
; GFX11-NEXT: v_add_f32_e32 v53, s43, v36
; GFX11-NEXT: v_add_f32_e32 v56, s44, v0
; GFX11-NEXT: v_add_f32_e32 v57, s45, v36
; GFX11-NEXT: v_add_f32_e32 v60, s46, v0
; GFX11-NEXT: v_add_f32_e32 v61, s47, v36
; GFX11-NEXT: v_add_f32_e32 v0, s12, v0
; GFX11-NEXT: v_add_f32_e32 v1, s13, v36
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s33
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: image_sample v[36:39], v[37:38], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(14)
Expand Down Expand Up @@ -446,7 +452,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8

attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "amdgpu-max-memory-cluster-dwords"="32"}
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -295,6 +296,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
; AFTER-PEI-NEXT: workItemIDX: { reg: '$vgpr0' }
; AFTER-PEI-NEXT: psInputAddr: 0
; AFTER-PEI-NEXT: psInputEnable: 0
; AFTER-PEI-NEXT: maxMemoryClusterDWords: 8
; AFTER-PEI-NEXT: mode:
; AFTER-PEI-NEXT: ieee: true
; AFTER-PEI-NEXT: dx10-clamp: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down
16 changes: 16 additions & 0 deletions llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -143,6 +144,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -218,6 +220,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -294,6 +297,7 @@ body: |
# FULL-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
# FULL-NEXT: psInputAddr: 0
# FULL-NEXT: psInputEnable: 0
# FULL-NEXT: maxMemoryClusterDWords: 8
# FULL-NEXT: mode:
# FULL-NEXT: ieee: true
# FULL-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -593,3 +597,15 @@ body: |
%2:sgpr_64 = COPY %1
%1:sgpr_64 = COPY %0
...

---
# ALL-LABEL: name: max_memory_cluster_dwords
# ALL: maxMemoryClusterDWords: 16
name: max_memory_cluster_dwords
machineFunctionInfo:
maxMemoryClusterDWords: 16
body: |
bb.0:
SI_RETURN

...
4 changes: 4 additions & 0 deletions llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -86,6 +87,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
; CHECK-NEXT: psInputAddr: 1
; CHECK-NEXT: psInputEnable: 1
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: false
; CHECK-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -156,6 +158,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down Expand Up @@ -208,6 +211,7 @@ define void @function() {
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
; CHECK-NEXT: maxMemoryClusterDWords: 8
; CHECK-NEXT: mode:
; CHECK-NEXT: ieee: true
; CHECK-NEXT: dx10-clamp: true
Expand Down
Loading