Skip to content

Commit 0f31a55

Browse files
committed
[AMDGPU][True16] add PreRA hint to improve elimination for 16bit
and 32bit register copy
1 parent 2bbe30b commit 0f31a55

25 files changed

+279
-294
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,22 @@
2222
/// although the same shall be possible with other register classes and
2323
/// instructions if necessary.
2424
///
25+
/// This pass also adds register allocation hints to COPY.
26+
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
27+
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
31+
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
32+
/// the VGPR_32, the COPY can be completely eliminated.
33+
///
2534
//===----------------------------------------------------------------------===//
2635

2736
#include "GCNPreRAOptimizations.h"
2837
#include "AMDGPU.h"
2938
#include "GCNSubtarget.h"
3039
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
40+
#include "SIRegisterInfo.h"
3141
#include "llvm/CodeGen/LiveIntervals.h"
3242
#include "llvm/CodeGen/MachineFunctionPass.h"
3343
#include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253263
Changed |= processReg(Reg);
254264
}
255265

266+
if (!ST.useRealTrue16Insts())
267+
return Changed;
268+
269+
// Add RA hints to improve True16 COPY elimination.
270+
for (const MachineBasicBlock &MBB : MF) {
271+
for (const MachineInstr &MI : MBB) {
272+
if (MI.getOpcode() != AMDGPU::COPY)
273+
continue;
274+
Register Dst = MI.getOperand(0).getReg();
275+
Register Src = MI.getOperand(1).getReg();
276+
if (Dst.isVirtual() &&
277+
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
278+
Src.isPhysical() &&
279+
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
280+
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
281+
if (Src.isVirtual() &&
282+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
283+
Dst.isPhysical() &&
284+
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
285+
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
286+
if (!Dst.isVirtual() || !Src.isVirtual())
287+
continue;
288+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
289+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
290+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
291+
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
292+
}
293+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
294+
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
295+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
296+
}
297+
}
298+
256299
return Changed;
257300
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3685,6 +3685,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
36853685
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
36863686
}
36873687

3688+
/// Produce target-specific allocation hints for \p VirtReg.
///
/// Handles the AMDGPURI::Size16 / AMDGPURI::Size32 hint types that pair a
/// 16-bit VGPR with a 32-bit VGPR across a COPY, so that the allocator prefers
/// overlapping registers (the 16-bit value living in the lo16 half of the
/// 32-bit register). Any other hint type is delegated to the generic
/// TargetRegisterInfo implementation.
///
/// \param VirtReg the virtual register being allocated.
/// \param Order   the allocation order to pick fallback hints from.
/// \param Hints   output list; preferred physical registers are appended.
/// \param MF      the function being compiled.
/// \param VRM     current virtual-to-physical assignment, may be null.
/// \param Matrix  live register matrix, forwarded to the generic
///                implementation.
/// \returns false for the Size16/Size32 hint types (the hints are
///          preferences only); otherwise whatever the generic implementation
///          returns.
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
                                           ArrayRef<MCPhysReg> Order,
                                           SmallVectorImpl<MCPhysReg> &Hints,
                                           const MachineFunction &MF,
                                           const VirtRegMap *VRM,
                                           const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);

  switch (Hint.first) {
  case AMDGPURI::Size32: {
    // VirtReg is the 32-bit side of the pair; Paired is the 16-bit side.
    Register Paired = Hint.second;
    assert(Paired);
    Register PairedPhys;
    if (Paired.isPhysical()) {
      PairedPhys =
          getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
    } else if (VRM && VRM->hasPhys(Paired)) {
      PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
                                       &AMDGPU::VGPR_32RegClass);
    }

    // Prefer the paired physreg. getMatchingSuperReg only returns a register
    // when Paired is the lo16 half of it, so no explicit isLo check is needed.
    if (PairedPhys)
      Hints.push_back(PairedPhys);
    return false;
  }
  case AMDGPURI::Size16: {
    // VirtReg is the 16-bit side of the pair; Paired is the 32-bit side.
    Register Paired = Hint.second;
    assert(Paired);
    Register PairedPhys;
    if (Paired.isPhysical()) {
      PairedPhys = getSubReg(Paired, AMDGPU::lo16);
    } else if (VRM && VRM->hasPhys(Paired)) {
      PairedPhys = getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
    }

    if (PairedPhys) {
      // First prefer the paired physreg.
      Hints.push_back(PairedPhys);
    } else {
      // Add all the lo16 physregs.
      // When the Paired operand has not yet been assigned a physreg it is
      // better to try putting VirtReg in a lo16 register, because possibly
      // later Paired can be assigned to the overlapping register and the COPY
      // can be eliminated.
      for (MCPhysReg PhysReg : Order) {
        if (AMDGPU::isHi16Reg(PhysReg, *this))
          continue;
        if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
            !MRI.isReserved(PhysReg))
          Hints.push_back(PhysReg);
      }
    }
    return false;
  }
  default:
    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                     VRM, Matrix);
  }
}
3754+
36883755
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
36893756
// Not a callee saved register.
36903757
return AMDGPU::SGPR30_SGPR31;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ class LiveRegUnits;
2929
class RegisterBank;
3030
struct SGPRSpillBuilder;
3131

32+
/// Register allocation hint types. Helps eliminate unneeded COPY with True16.
/// These values are stored as the hint type via
/// MachineRegisterInfo::setRegAllocationHint and interpreted by
/// SIRegisterInfo::getRegAllocationHints.
33+
namespace AMDGPURI {
34+
35+
// Size16: attached to a VGPR_16 virtual register whose paired register is a
//         VGPR_32; the lo16 sub-register of the paired register is preferred.
// Size32: attached to a VGPR_32 virtual register whose paired register is a
//         VGPR_16; the VGPR_32 whose lo16 half is the paired register is
//         preferred.
// Values start at 1; 0 is passed by the pre-RA pass when it hints a plain
// physical register rather than one of these pairings.
enum { Size16 = 1, Size32 = 2 };
36+
37+
} // end namespace AMDGPURI
38+
3239
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
3340
private:
3441
const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
329336
unsigned getRegPressureSetLimit(const MachineFunction &MF,
330337
unsigned Idx) const override;
331338

339+
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
340+
SmallVectorImpl<MCPhysReg> &Hints,
341+
const MachineFunction &MF, const VirtRegMap *VRM,
342+
const LiveRegMatrix *Matrix) const override;
343+
332344
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
333345

334346
MCRegister getReturnAddressReg(const MachineFunction &MF) const;

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 41 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
3771237712
; GFX11TRUE16-LABEL: v_select_bf16:
3771337713
; GFX11TRUE16: ; %bb.0:
3771437714
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37715-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37716-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37717-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37718-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37719-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37720-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37715+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37716+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37717+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37718+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
3772137719
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3772237720
;
3772337721
; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3778537783
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
3778637784
; GFX11TRUE16: ; %bb.0:
3778737785
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37788-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37789-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
37790-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
37791-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37792-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37793-
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37794-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37795-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
37786+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37787+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37788+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37789+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
37790+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
3779637791
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3779737792
;
3779837793
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
3786237857
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
3786337858
; GFX11TRUE16: ; %bb.0:
3786437859
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37865-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37866-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37867-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37868-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37869-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37870-
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37871-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37872-
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37860+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37861+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37862+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37863+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
37864+
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
3787337865
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
3787437866
;
3787537867
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42810,17 +42802,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4281042802
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
4281142803
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
4281242804
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
42813-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4281442805
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
4281542806
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
42807+
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
4281642808
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
4281742809
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
4281842810
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
4281942811
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
4282042812
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
4282142813
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
4282242814
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
42823-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4282442815
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
4282542816
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
4282642817
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42844,6 +42835,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4284442835
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
4284542836
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
4284642837
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
42838+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
4284742839
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
4284842840
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
4284942841
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42873,45 +42865,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4287342865
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
4287442866
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
4287542867
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
42876-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
42868+
; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
4287742869
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
42878-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
42870+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
4287942871
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
42880-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
42872+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
4288142873
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
4288242874
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
42883-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
42884-
; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42875+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34
4288542876
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
42886-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
42877+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v35
4288742878
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
4288842879
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
42889-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
42880+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v36
4289042881
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
42891-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
42882+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v37
4289242883
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
4289342884
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
42894-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
42885+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v38
4289542886
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
42896-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
42887+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v39
4289742888
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
4289842889
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
42899-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
42890+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v48
4290042891
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
42901-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
42892+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v49
4290242893
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
4290342894
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
42904-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
42895+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v50
4290542896
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
42906-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
42897+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
4290742898
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
4290842899
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
42909-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
42900+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v52
4291042901
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
42911-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
42902+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v53
4291242903
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
4291342904
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
42914-
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
42905+
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v54
4291542906
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
4291642907
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
4291742908
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42949,20 +42940,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4294942940
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
4295042941
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
4295142942
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
42952-
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
42943+
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
4295342944
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
4295442945
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
4295542946
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
4295642947
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
4295742948
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
4295842949
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
42959-
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
42960-
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
42961-
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
42962-
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
42963-
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
42964-
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
42965-
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
42950+
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v19.l, v18.l, s29
42951+
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v21.l, v20.l, s26
42952+
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v23.l, v22.l, s24
42953+
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v25.l, v24.l, s22
42954+
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v27.l, v26.l, s20
42955+
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v29.l, v28.l, s18
42956+
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v30.l, s16
4296642957
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
4296742958
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
4296842959
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42971,7 +42962,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
4297142962
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
4297242963
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
4297342964
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
42974-
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
42965+
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
4297542966
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
4297642967
;
4297742968
; GFX11FAKE16-LABEL: v_vselect_v32bf16:

llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -908,10 +908,9 @@ define <2 x i16> @chain_hi_to_lo_global_other_dep(ptr addrspace(1) %ptr) {
908908
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
909909
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc
910910
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
911-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
912911
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
913912
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
914-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
913+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
915914
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
916915
;
917916
; GFX11-FAKE16-LABEL: chain_hi_to_lo_global_other_dep:
@@ -981,12 +980,11 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(ptr addrspace(0) %ptr) {
981980
; GFX11-TRUE16-NEXT: flat_load_d16_b16 v2, v[0:1] offset:2 glc dlc
982981
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
983982
; GFX11-TRUE16-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc
984-
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
985-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
983+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
986984
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
987985
; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
988986
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
989-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
987+
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
990988
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
991989
;
992990
; GFX11-FAKE16-LABEL: chain_hi_to_lo_flat_other_dep:

llvm/test/CodeGen/AMDGPU/fadd.f16.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
7676
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
7777
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
7878
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
79-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
80-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
81-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
79+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
8280
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
8381
; GFX11-SDAG-NEXT: s_endpgm
8482
;
@@ -98,9 +96,7 @@ define amdgpu_kernel void @fadd_f16(
9896
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
9997
; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
10098
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
101-
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
102-
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
103-
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
99+
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
104100
; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
105101
; GFX11-GISEL-NEXT: s_endpgm
106102
;

0 commit comments

Comments
 (0)