Skip to content

Commit ca4d90b

Browse files
committed
[AMDGPU][True16] add PreRA hint to improve elimination for 16bit
and 32bit register copy
1 parent f62e168 commit ca4d90b

26 files changed

+283
-306
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,22 @@
2222
/// although the same shall be possible with other register classes and
2323
/// instructions if necessary.
2424
///
25+
/// This pass also adds register allocation hints to COPY.
26+
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
27+
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
31+
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
32+
/// the VGPR_32, the COPY can be completely eliminated.
33+
///
2534
//===----------------------------------------------------------------------===//
2635

2736
#include "GCNPreRAOptimizations.h"
2837
#include "AMDGPU.h"
2938
#include "GCNSubtarget.h"
3039
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
40+
#include "SIRegisterInfo.h"
3141
#include "llvm/CodeGen/LiveIntervals.h"
3242
#include "llvm/CodeGen/MachineFunctionPass.h"
3343
#include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253263
Changed |= processReg(Reg);
254264
}
255265

266+
if (!ST.useRealTrue16Insts())
267+
return Changed;
268+
269+
// Add RA hints to improve True16 COPY elimination.
270+
for (const MachineBasicBlock &MBB : MF) {
271+
for (const MachineInstr &MI : MBB) {
272+
if (MI.getOpcode() != AMDGPU::COPY)
273+
continue;
274+
Register Dst = MI.getOperand(0).getReg();
275+
Register Src = MI.getOperand(1).getReg();
276+
if (Dst.isVirtual() &&
277+
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
278+
Src.isPhysical() &&
279+
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
280+
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
281+
if (Src.isVirtual() &&
282+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
283+
Dst.isPhysical() &&
284+
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
285+
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
286+
if (!Dst.isVirtual() || !Src.isVirtual())
287+
continue;
288+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
289+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
290+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
291+
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
292+
}
293+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
294+
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
295+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
296+
}
297+
}
298+
256299
return Changed;
257300
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3713,6 +3713,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
37133713
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
37143714
}
37153715

3716+
/// Compute allocation hints for \p VirtReg from the True16 pairing hints
/// stored in MachineRegisterInfo.
///
/// The hint type read from MRI selects the behaviour:
///  - AMDGPURI::Size32: \p VirtReg is paired with a 16-bit register. If that
///    register is physical, or already has a physical assignment in \p VRM,
///    hint the VGPR_32 whose lo16 sub-register it is.
///  - AMDGPURI::Size16: \p VirtReg is paired with a 32-bit register. If that
///    register is physical or already assigned, hint its lo16 sub-register;
///    otherwise hint every non-reserved, non-hi16 VGPR_16 in \p Order.
///  - anything else: defer to TargetRegisterInfo::getRegAllocationHints.
///
/// \param VirtReg virtual register being allocated.
/// \param Order   allocation order for \p VirtReg's register class.
/// \param Hints   output list; hints are appended to it.
/// \param MF      function being compiled (source of MachineRegisterInfo).
/// \param VRM     current virtual-to-physical map; may be null.
/// \param Matrix  forwarded to the base implementation in the default case.
/// \returns false for the Size32/Size16 cases; otherwise whatever the base
///          implementation returns.
///          NOTE(review): false presumably means the hints are preferences
///          rather than a hard constraint - confirm against the
///          TargetRegisterInfo documentation.
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
                                           ArrayRef<MCPhysReg> Order,
                                           SmallVectorImpl<MCPhysReg> &Hints,
                                           const MachineFunction &MF,
                                           const VirtRegMap *VRM,
                                           const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);

  switch (Hint.first) {
  case AMDGPURI::Size32: {
    const Register Paired = Hint.second;
    assert(Paired && "Size32 hint must carry a paired register");
    Register PairedPhys;
    if (Paired.isPhysical()) {
      PairedPhys =
          getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
    } else if (VRM && VRM->hasPhys(Paired)) {
      PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
                                       &AMDGPU::VGPR_32RegClass);
    }

    // Prefer the paired physreg. getMatchingSuperReg only yields a register
    // when Paired is its lo16 half, so no explicit lo/hi check is needed.
    // Only hint registers that are actually in the allocation order: a hint
    // outside Order cannot be honoured by the allocator.
    if (PairedPhys && is_contained(Order, PairedPhys))
      Hints.push_back(PairedPhys);
    return false;
  }
  case AMDGPURI::Size16: {
    const Register Paired = Hint.second;
    assert(Paired && "Size16 hint must carry a paired register");
    Register PairedPhys;
    if (Paired.isPhysical()) {
      PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
    } else if (VRM && VRM->hasPhys(Paired)) {
      PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
    }

    if (PairedPhys) {
      // First prefer the paired physreg, provided it is allocatable for
      // VirtReg (i.e. present in the allocation order).
      if (is_contained(Order, PairedPhys))
        Hints.push_back(PairedPhys);
    } else {
      // Add all the lo16 physregs.
      // When the Paired operand has not yet been assigned a physreg it is
      // better to try putting VirtReg in a lo16 register, because possibly
      // later Paired can be assigned to the overlapping register and the COPY
      // can be eliminated.
      for (MCPhysReg PhysReg : Order) {
        if (AMDGPU::isHi16Reg(PhysReg, *this))
          continue;
        if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
            !MRI.isReserved(PhysReg))
          Hints.push_back(PhysReg);
      }
    }
    return false;
  }
  default:
    // No True16 pairing hint: use the generic implementation, passing every
    // argument through unchanged.
    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                     VRM, Matrix);
  }
}
3782+
37163783
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
37173784
// Not a callee saved register.
37183785
return AMDGPU::SGPR30_SGPR31;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ class LiveRegUnits;
2929
class RegisterBank;
3030
struct SGPRSpillBuilder;
3131

32+
/// Register allocation hint types. Helps eliminate unneeded COPY with True16
33+
namespace AMDGPURI {
34+
35+
// Hint type values stored via MachineRegisterInfo::setRegAllocationHint and
// switched on in SIRegisterInfo::getRegAllocationHints:
//   Size16 - set on a VGPR_16 virtual register whose COPY partner is a
//            VGPR_32 register.
//   Size32 - set on a VGPR_32 virtual register whose COPY partner is a
//            VGPR_16 register.
// NOTE(review): values start at 1, presumably because hint type 0 is the
// generic (target-independent) hint - confirm.
enum { Size16 = 1, Size32 = 2 };
36+
37+
} // end namespace AMDGPURI
38+
3239
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
3340
private:
3441
const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
329336
unsigned getRegPressureSetLimit(const MachineFunction &MF,
330337
unsigned Idx) const override;
331338

339+
/// Override of the register allocation hint hook. Appends physical register
/// hints for \p VirtReg to \p Hints; the AMDGPURI::Size16 / AMDGPURI::Size32
/// hint types get target-specific handling and every other hint type is
/// delegated to TargetRegisterInfo::getRegAllocationHints.
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
340+
SmallVectorImpl<MCPhysReg> &Hints,
341+
const MachineFunction &MF, const VirtRegMap *VRM,
342+
const LiveRegMatrix *Matrix) const override;
343+
332344
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
333345

334346
MCRegister getReturnAddressReg(const MachineFunction &MF) const;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 41 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
3771237712
; GFX11TRUE16-LABEL: v_select_bf16:
3771337713
; GFX11TRUE16: ; %bb.0:
3771437714
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37715-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37716-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37717-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37718-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37719-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37720-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37715+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37716+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37717+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37718+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
3772137719
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3772237720
;
3772337721
; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3778537783
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
3778637784
; GFX11TRUE16: ; %bb.0:
3778737785
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37788-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37789-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
37790-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
37791-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37792-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37793-
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37794-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37795-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
37786+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37787+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37788+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37789+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
37790+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
3779637791
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3779737792
;
3779837793
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3786237857
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
3786337858
; GFX11TRUE16: ; %bb.0:
3786437859
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37865-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37866-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37867-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37868-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37869-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37870-
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37871-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37872-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37860+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37861+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37862+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37863+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
37864+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
3787337865
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3787437866
;
3787537867
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42659,17 +42651,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4265942651
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
4266042652
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
4266142653
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
42662-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4266342654
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
4266442655
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
42656+
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4266542657
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
4266642658
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
4266742659
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
4266842660
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
4266942661
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
4267042662
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
4267142663
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
42672-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4267342664
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
4267442665
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
4267542666
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42693,6 +42684,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4269342684
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
4269442685
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
4269542686
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
42687+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4269642688
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
4269742689
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
4269842690
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42722,45 +42714,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4272242714
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
4272342715
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
4272442716
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
42725-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
42717+
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
4272642718
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
42727-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
42719+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
4272842720
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
42729-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
42721+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
4273042722
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
4273142723
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
42732-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
42733-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42724+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34
4273442725
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
42735-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
42726+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v35
4273642727
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
4273742728
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
42738-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
42729+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v36
4273942730
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
42740-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
42731+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v37
4274142732
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
4274242733
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
42743-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
42734+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v38
4274442735
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
42745-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
42736+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v39
4274642737
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
4274742738
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
42748-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
42739+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v48
4274942740
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
42750-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
42741+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v49
4275142742
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
4275242743
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
42753-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
42744+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v50
4275442745
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
42755-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
42746+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
4275642747
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
4275742748
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
42758-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
42749+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v52
4275942750
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
42760-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
42751+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v53
4276142752
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
4276242753
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
42763-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
42754+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v54
4276442755
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
4276542756
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
4276642757
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42798,20 +42789,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4279842789
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
4279942790
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
4280042791
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
42801-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
42792+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
4280242793
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
4280342794
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
4280442795
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
4280542796
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
4280642797
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
4280742798
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
42808-
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
42809-
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
42810-
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
42811-
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
42812-
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
42813-
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
42814-
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
42799+
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v19.l, v18.l, s29
42800+
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v21.l, v20.l, s26
42801+
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v23.l, v22.l, s24
42802+
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v25.l, v24.l, s22
42803+
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v27.l, v26.l, s20
42804+
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v29.l, v28.l, s18
42805+
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v30.l, s16
4281542806
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
4281642807
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
4281742808
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42820,7 +42811,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4282042811
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
4282142812
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
4282242813
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
42823-
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
42814+
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
4282442815
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
4282542816
;
4282642817
; GFX11FAKE16-LABEL: v_vselect_v32bf16:

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -908,10 +908,9 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
908908
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
909909
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
910910
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
911-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
912911
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
913912
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
914-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
913+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
915914
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
916915
;
917916
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -981,12 +980,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
981980
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
982981
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
983982
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
984-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
985-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
983+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
986984
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
987985
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
988986
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
989-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
987+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
990988
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
991989
;
992990
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:

llvm/test/CodeGen/AMDGPU/fadd.f16.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
7676
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
7777
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
7878
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
79-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
80-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
81-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
79+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
8280
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
8381
; GFX11-SDAG-NEXT: s_endpgm
8482
;
@@ -98,9 +96,7 @@ define amdgpu_kernel void @fadd_f16(
9896
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
9997
; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
10098
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
101-
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
102-
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
103-
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
99+
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
104100
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
105101
; GFX11-GISEL-NEXT: s_endpgm
106102
;

0 commit comments

Comments
 (0)