Skip to content

Commit 9ea79f3

Browse files
committed
[AMDGPU] Fold multiple aligned v_mov_b32 to v_mov_b64 on gfx942
1 parent 572add0 commit 9ea79f3

File tree

8 files changed

+305
-120
lines changed

8 files changed

+305
-120
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 99 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ class SIFoldOperandsImpl {
160160
std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
161161
bool tryFoldOMod(MachineInstr &MI);
162162
bool tryFoldRegSequence(MachineInstr &MI);
163+
bool tryFoldImmRegSequence(MachineInstr &MI);
163164
bool tryFoldPhiAGPR(MachineInstr &MI);
164165
bool tryFoldLoad(MachineInstr &MI);
165166

@@ -2189,6 +2190,99 @@ bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
21892190
return true;
21902191
}
21912192

2193+
// gfx942+ can use V_MOV_B64 for materializing constant immediates.
2194+
// For example:
2195+
// %0:vgpr_32 = V_MOV_B32 0, implicit $exec
2196+
// %1:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
2197+
// ->
2198+
// %1:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
2199+
bool SIFoldOperandsImpl::tryFoldImmRegSequence(MachineInstr &MI) {
2200+
assert(MI.isRegSequence());
2201+
auto Reg = MI.getOperand(0).getReg();
2202+
const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
2203+
2204+
if (!ST->hasMovB64() || !TRI->isVGPR(*MRI, Reg) ||
2205+
!MRI->hasOneNonDBGUse(Reg) || !TRI->isProperlyAlignedRC(*DefRC))
2206+
return false;
2207+
2208+
SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
2209+
if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
2210+
return false;
2211+
2212+
// Only attempting to fold immediate materializations.
2213+
if (!Defs.empty() &&
2214+
!std::all_of(Defs.begin(), Defs.end(),
2215+
[](const std::pair<MachineOperand *, unsigned> &Op) {
2216+
return Op.first->isImm();
2217+
}))
2218+
return false;
2219+
2220+
SmallVector<uint64_t, 8> ImmVals;
2221+
uint64_t ImmVal = 0;
2222+
uint64_t ImmSize = 0;
2223+
for (unsigned i = 0; i < Defs.size(); ++i) {
2224+
auto &[Op, SubIdx] = Defs[i];
2225+
unsigned SubRegSize = TRI->getSubRegIdxSize(SubIdx);
2226+
unsigned Shift = (TRI->getChannelFromSubReg(SubIdx) % 2) * SubRegSize;
2227+
ImmSize += SubRegSize;
2228+
ImmVal |= Op->getImm() << Shift;
2229+
2230+
if (ImmSize > 64 || SubRegSize == 64)
2231+
return false;
2232+
2233+
if (ImmSize == 64) {
2234+
// Only 32 bit literals can be encoded.
2235+
if (!isUInt<32>(ImmVal))
2236+
return false;
2237+
ImmVals.push_back(ImmVal);
2238+
ImmVal = 0;
2239+
ImmSize = 0;
2240+
}
2241+
}
2242+
2243+
assert(ImmVals.size() > 0 &&
2244+
"REG_SEQUENCE should have at least 1 operand pair");
2245+
2246+
// Can only combine REG_SEQUENCE into one 64b immediate materialization mov.
2247+
if (DefRC == TRI->getVGPR64Class()) {
2248+
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2249+
TII->get(AMDGPU::V_MOV_B64_PSEUDO), Reg)
2250+
.addImm(ImmVals[0]);
2251+
MI.eraseFromParent();
2252+
return true;
2253+
}
2254+
2255+
if (ImmVals.size() == 1)
2256+
return false;
2257+
2258+
// Can't bail from here on out: modifying the MI.
2259+
2260+
// Remove source operands.
2261+
for (unsigned i = MI.getNumOperands() - 1; i > 0; --i)
2262+
MI.removeOperand(i);
2263+
2264+
for (unsigned i = 0; i < ImmVals.size(); ++i) {
2265+
const TargetRegisterClass *RC = TRI->getVGPR64Class();
2266+
auto MovReg = MRI->createVirtualRegister(RC);
2267+
// Duplicate vmov imm materializations (e.g., splatted operands) should get
2268+
// combined by MachineCSE pass.
2269+
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2270+
TII->get(AMDGPU::V_MOV_B64_PSEUDO), MovReg)
2271+
.addImm(ImmVals[i]);
2272+
2273+
// 2 subregs with no overlap (i.e., sub0_sub1, sub2_sub3, etc.).
2274+
unsigned SubReg64B =
2275+
SIRegisterInfo::getSubRegFromChannel(/*Channel=*/i * 2, /*SubRegs=*/2);
2276+
2277+
MI.addOperand(MachineOperand::CreateReg(MovReg, /*isDef=*/false));
2278+
MI.addOperand(MachineOperand::CreateImm(SubReg64B));
2279+
}
2280+
2281+
LLVM_DEBUG(dbgs() << "Folded into " << MI);
2282+
2283+
return true;
2284+
}
2285+
21922286
// Try to fold a reg_sequence with vgpr output and agpr inputs into an
21932287
// instruction which can take an agpr. So far that means a store.
21942288
bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
@@ -2618,9 +2712,11 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
26182712
continue;
26192713
}
26202714

2621-
if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2622-
Changed = true;
2623-
continue;
2715+
if (MI.isRegSequence()) {
2716+
if (tryFoldImmRegSequence(MI) || tryFoldRegSequence(MI)) {
2717+
Changed = true;
2718+
continue;
2719+
}
26242720
}
26252721

26262722
if (MI.isPHI() && tryFoldPhiAGPR(MI)) {

llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4139,8 +4139,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
41394139
; GFX942-LABEL: store_load_i64_aligned:
41404140
; GFX942: ; %bb.0: ; %bb
41414141
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4142-
; GFX942-NEXT: v_mov_b32_e32 v2, 15
4143-
; GFX942-NEXT: v_mov_b32_e32 v3, 0
4142+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15
41444143
; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
41454144
; GFX942-NEXT: s_waitcnt vmcnt(0)
41464145
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -4250,8 +4249,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
42504249
; GFX942-LABEL: store_load_i64_unaligned:
42514250
; GFX942: ; %bb.0: ; %bb
42524251
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4253-
; GFX942-NEXT: v_mov_b32_e32 v2, 15
4254-
; GFX942-NEXT: v_mov_b32_e32 v3, 0
4252+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15
42554253
; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
42564254
; GFX942-NEXT: s_waitcnt vmcnt(0)
42574255
; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
@@ -5010,10 +5008,8 @@ define amdgpu_ps void @large_offset() {
50105008
;
50115009
; GFX942-LABEL: large_offset:
50125010
; GFX942: ; %bb.0: ; %bb
5013-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
5014-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
5015-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
5016-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
5011+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
5012+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
50175013
; GFX942-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
50185014
; GFX942-NEXT: s_waitcnt vmcnt(0)
50195015
; GFX942-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@ declare i32 @llvm.amdgcn.workitem.id.x()
1313
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
1414
; GCN-DAG: s_load_dwordx16
1515
; GCN-DAG: s_load_dwordx16
16-
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
17-
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
16+
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
17+
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
18+
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
19+
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
1820
; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
1921
; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
2022
; GFX942: v_mfma_f32_32x32x4_2b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
@@ -32,8 +34,10 @@ bb:
3234

3335
; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
3436
; GCN-DAG: s_load_dwordx16
35-
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
36-
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
37+
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
38+
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
39+
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
40+
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
3741
; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
3842
; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
3943
; GFX942: v_mfma_f32_16x16x4_4b_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
@@ -51,8 +55,10 @@ bb:
5155

5256
; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
5357
; GCN-DAG: s_load_dwordx4
54-
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
55-
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
58+
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
59+
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
60+
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
61+
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
5662
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
5763
; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
5864
; GFX942: v_mfma_f32_4x4x4_16b_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
@@ -70,8 +76,10 @@ bb:
7076

7177
; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
7278
; GCN-DAG: s_load_dwordx16
73-
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
74-
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
79+
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
80+
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
81+
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
82+
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
7583
; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
7684
; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
7785
; GFX942: v_mfma_f32_32x32x8_bf16 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
@@ -89,8 +97,10 @@ bb:
8997

9098
; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
9199
; GCN-DAG: s_load_dwordx4
92-
; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
93-
; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
100+
; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2
101+
; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1
102+
; GFX942-DAG: v_mov_b64_e32 v[[[ONE:[0-9]+]]:{{[0-9]+}}], 1
103+
; GFX942-DAG: v_mov_b64_e32 v[[[TWO:[0-9]+]]:{{[0-9]+}}], 2
94104
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
95105
; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9]+}}], v[[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
96106
; GFX942: v_mfma_f32_16x16x16_bf16 [[RES:a\[[0-9]+:[0-9]+\]]], v[[[ONE]]:{{[0-9+]}}], v[[[TWO]]:{{[0-9+]}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3

llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll

Lines changed: 37 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ define <2 x i32> @uniform_masked_load_ptr1_mask_v2i32(ptr addrspace(1) inreg noc
77
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
99
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
10-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
11-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
10+
; GFX942-NEXT: v_mov_b32_e32 v2, 0
11+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
1212
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
1313
; GFX942-NEXT: s_cbranch_execz .LBB0_2
1414
; GFX942-NEXT: ; %bb.1: ; %cond.load
15-
; GFX942-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1]
15+
; GFX942-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
1616
; GFX942-NEXT: .LBB0_2:
1717
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
1818
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -30,14 +30,13 @@ define <4 x i32> @uniform_masked_load_ptr1_mask_v4i32(ptr addrspace(1) inreg noc
3030
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3131
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
3232
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
33-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
34-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
35-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
36-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
33+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
34+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
35+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
3736
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
3837
; GFX942-NEXT: s_cbranch_execz .LBB1_2
3938
; GFX942-NEXT: ; %bb.1: ; %cond.load
40-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
39+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
4140
; GFX942-NEXT: .LBB1_2:
4241
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
4342
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -55,14 +54,13 @@ define <4 x float> @uniform_masked_load_ptr1_mask_v4f32(ptr addrspace(1) inreg n
5554
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5655
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
5756
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
58-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
59-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
60-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
61-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
57+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
58+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
59+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
6260
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
6361
; GFX942-NEXT: s_cbranch_execz .LBB2_2
6462
; GFX942-NEXT: ; %bb.1: ; %cond.load
65-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
63+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
6664
; GFX942-NEXT: .LBB2_2:
6765
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
6866
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -80,20 +78,16 @@ define <8 x i32> @uniform_masked_load_ptr1_mask_v8i32(ptr addrspace(1) inreg noc
8078
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8179
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
8280
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
83-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
84-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
85-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
86-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
87-
; GFX942-NEXT: v_mov_b32_e32 v4, v0
88-
; GFX942-NEXT: v_mov_b32_e32 v5, v0
89-
; GFX942-NEXT: v_mov_b32_e32 v6, v0
90-
; GFX942-NEXT: v_mov_b32_e32 v7, v0
81+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
82+
; GFX942-NEXT: v_mov_b32_e32 v8, 0
83+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
84+
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
85+
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
9186
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
9287
; GFX942-NEXT: s_cbranch_execz .LBB3_2
9388
; GFX942-NEXT: ; %bb.1: ; %cond.load
94-
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
95-
; GFX942-NEXT: s_nop 0
96-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
89+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
90+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
9791
; GFX942-NEXT: .LBB3_2:
9892
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
9993
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -111,20 +105,16 @@ define <8 x float> @uniform_masked_load_ptr1_mask_v8f32(ptr addrspace(1) inreg n
111105
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
112106
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
113107
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
114-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
115-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
116-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
117-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
118-
; GFX942-NEXT: v_mov_b32_e32 v4, v0
119-
; GFX942-NEXT: v_mov_b32_e32 v5, v0
120-
; GFX942-NEXT: v_mov_b32_e32 v6, v0
121-
; GFX942-NEXT: v_mov_b32_e32 v7, v0
108+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
109+
; GFX942-NEXT: v_mov_b32_e32 v8, 0
110+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
111+
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
112+
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
122113
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
123114
; GFX942-NEXT: s_cbranch_execz .LBB4_2
124115
; GFX942-NEXT: ; %bb.1: ; %cond.load
125-
; GFX942-NEXT: global_load_dwordx4 v[4:7], v0, s[0:1] offset:16
126-
; GFX942-NEXT: s_nop 0
127-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
116+
; GFX942-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
117+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
128118
; GFX942-NEXT: .LBB4_2:
129119
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
130120
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -142,14 +132,13 @@ define <8 x i16> @uniform_masked_load_ptr1_mask_v8i16(ptr addrspace(1) inreg noc
142132
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143133
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
144134
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
145-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
146-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
147-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
148-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
135+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
136+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
137+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
149138
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
150139
; GFX942-NEXT: s_cbranch_execz .LBB5_2
151140
; GFX942-NEXT: ; %bb.1: ; %cond.load
152-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
141+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
153142
; GFX942-NEXT: .LBB5_2:
154143
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
155144
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -167,14 +156,13 @@ define <8 x half> @uniform_masked_load_ptr1_mask_v8f16(ptr addrspace(1) inreg no
167156
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168157
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
169158
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
170-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
171-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
172-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
173-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
159+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
160+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
161+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
174162
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
175163
; GFX942-NEXT: s_cbranch_execz .LBB6_2
176164
; GFX942-NEXT: ; %bb.1: ; %cond.load
177-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
165+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
178166
; GFX942-NEXT: .LBB6_2:
179167
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
180168
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -192,14 +180,13 @@ define <8 x bfloat> @uniform_masked_load_ptr1_mask_v8bf16(ptr addrspace(1) inreg
192180
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193181
; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
194182
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
195-
; GFX942-NEXT: v_mov_b32_e32 v0, 0
196-
; GFX942-NEXT: v_mov_b32_e32 v1, v0
197-
; GFX942-NEXT: v_mov_b32_e32 v2, v0
198-
; GFX942-NEXT: v_mov_b32_e32 v3, v0
183+
; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0
184+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
185+
; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
199186
; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc
200187
; GFX942-NEXT: s_cbranch_execz .LBB7_2
201188
; GFX942-NEXT: ; %bb.1: ; %cond.load
202-
; GFX942-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1]
189+
; GFX942-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1]
203190
; GFX942-NEXT: .LBB7_2:
204191
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
205192
; GFX942-NEXT: s_waitcnt vmcnt(0)

0 commit comments

Comments
 (0)