Skip to content

Commit 889b4cb

Browse files
committed
[AMDGPU][True16] add PreRA hint to improve elimination for 16bit
and 32bit register copy
1 parent 9106ee2 commit 889b4cb

File tree

4 files changed

+123
-3
lines changed

4 files changed

+123
-3
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,22 @@
2222
/// although the same shall be possible with other register classes and
2323
/// instructions if necessary.
2424
///
28+
/// This pass also adds register allocation hints to COPY.
29+
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
30+
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
31+
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
32+
/// the VGPR_32, the COPY can be completely eliminated.
33+
///
2534
//===----------------------------------------------------------------------===//
2635

2736
#include "GCNPreRAOptimizations.h"
2837
#include "AMDGPU.h"
2938
#include "GCNSubtarget.h"
3039
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
40+
#include "SIRegisterInfo.h"
3141
#include "llvm/CodeGen/LiveIntervals.h"
3242
#include "llvm/CodeGen/MachineFunctionPass.h"
3343
#include "llvm/InitializePasses.h"
@@ -253,5 +263,38 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
253263
Changed |= processReg(Reg);
254264
}
255265

266+
if (!ST.useRealTrue16Insts())
267+
return Changed;
268+
269+
// Add RA hints to improve True16 COPY elimination.
270+
for (const MachineBasicBlock &MBB : MF) {
271+
for (const MachineInstr &MI : MBB) {
272+
if (MI.getOpcode() != AMDGPU::COPY)
273+
continue;
274+
Register Dst = MI.getOperand(0).getReg();
275+
Register Src = MI.getOperand(1).getReg();
276+
if (Dst.isVirtual() &&
277+
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
278+
Src.isPhysical() &&
279+
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
280+
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
281+
if (Src.isVirtual() &&
282+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
283+
Dst.isPhysical() &&
284+
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
285+
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
286+
if (!Dst.isVirtual() || !Src.isVirtual())
287+
continue;
288+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
289+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
290+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
291+
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
292+
}
293+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
294+
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
295+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
296+
}
297+
}
298+
256299
return Changed;
257300
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3652,6 +3652,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
36523652
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
36533653
}
36543654

3655+
/// Compute target-specific register allocation hints for \p VirtReg.
///
/// Handles the True16 hint types recorded on the virtual register:
///  - AMDGPURI::Size32: \p VirtReg is hinted towards the VGPR_32 whose lo16
///    half is the register its paired 16-bit register lives in.
///  - AMDGPURI::Size16: \p VirtReg is hinted towards the lo16 half of its
///    paired 32-bit register or, while the pair is still unassigned, towards
///    every lo16 VGPR_16 in the allocation order.
/// Any other hint type is delegated to the generic implementation.
///
/// \param VirtReg the virtual register being allocated.
/// \param Order   the allocation order for \p VirtReg; only registers that
///                appear here are ever emitted as hints.
/// \param Hints   output list the preferred physical registers are appended to.
/// \param MF      the function being compiled.
/// \param VRM     current virtual-to-physical assignment, may be null.
/// \param Matrix  live register matrix, forwarded to the generic implementation.
/// \returns false for the True16 hint types (the hints are preferences only);
///          otherwise whatever the generic implementation returns.
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
                                           ArrayRef<MCPhysReg> Order,
                                           SmallVectorImpl<MCPhysReg> &Hints,
                                           const MachineFunction &MF,
                                           const VirtRegMap *VRM,
                                           const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);

  // Physical register currently backing Reg: Reg itself when it is physical,
  // its VRM assignment when it has one, otherwise an invalid Register.
  auto GetAssignedPhys = [VRM](Register Reg) -> Register {
    if (Reg.isPhysical())
      return Reg;
    if (VRM && VRM->hasPhys(Reg))
      return VRM->getPhys(Reg);
    return Register();
  };

  // A hint is only usable if it is part of the allocation order of VirtReg.
  auto IsInOrder = [Order](MCPhysReg Reg) { return is_contained(Order, Reg); };

  switch (Hint.first) {
  case AMDGPURI::Size32: {
    const Register Paired = Hint.second;
    assert(Paired && "Size32 hint must carry a paired register");

    // getMatchingSuperReg only yields a register when the paired register is
    // the lo16 half of a VGPR_32, so no separate isLo check is required.
    Register PairedPhys;
    if (Register Phys = GetAssignedPhys(Paired))
      PairedPhys =
          getMatchingSuperReg(Phys, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);

    // Prefer the 32-bit register overlapping the paired 16-bit register.
    if (PairedPhys && IsInOrder(PairedPhys))
      Hints.push_back(PairedPhys);
    return false;
  }
  case AMDGPURI::Size16: {
    const Register Paired = Hint.second;
    assert(Paired && "Size16 hint must carry a paired register");

    Register PairedPhys;
    if (Register Phys = GetAssignedPhys(Paired))
      PairedPhys = getSubReg(Phys, AMDGPU::lo16);

    // First prefer the lo16 half of the paired physreg.
    if (PairedPhys && IsInOrder(PairedPhys)) {
      Hints.push_back(PairedPhys);
      return false;
    }

    // Otherwise add all the lo16 physregs. When the paired operand has not
    // yet been assigned a physreg it is better to try putting VirtReg in a
    // lo16 register, because Paired may later be assigned the overlapping
    // 32-bit register and the COPY can then be eliminated.
    for (MCPhysReg PhysReg : Order) {
      if (AMDGPU::isHi16Reg(PhysReg, *this))
        continue;
      if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
          !MRI.isReserved(PhysReg))
        Hints.push_back(PhysReg);
    }
    return false;
  }
  default:
    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                     VRM, Matrix);
  }
}
3721+
36553722
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
36563723
// Not a callee saved register.
36573724
return AMDGPU::SGPR30_SGPR31;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ class LiveRegUnits;
2929
class RegisterBank;
3030
struct SGPRSpillBuilder;
3131

32+
/// Register allocation hint types. Helps eliminate unneeded COPY with True16.
/// These values are stored as the hint type of a virtual register through
/// MachineRegisterInfo::setRegAllocationHint and are decoded again in
/// SIRegisterInfo::getRegAllocationHints.
33+
namespace AMDGPURI {
34+
35+
// Size16: hint type handled by the Size16 case of getRegAllocationHints, which
//         prefers the lo16 sub-register of the paired register.
// Size32: hint type handled by the Size32 case of getRegAllocationHints, which
//         prefers the VGPR_32 super-register whose lo16 half is the paired
//         register.
// NOTE(review): both values are non-zero, presumably because hint type 0 is
// the generic (target-independent) hint - confirm against MachineRegisterInfo.
enum { Size16 = 1, Size32 = 2 };
36+
37+
} // end namespace AMDGPURI
38+
3239
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
3340
private:
3441
const GCNSubtarget &ST;
@@ -329,6 +336,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
329336
unsigned getRegPressureSetLimit(const MachineFunction &MF,
330337
unsigned Idx) const override;
331338

339+
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
340+
SmallVectorImpl<MCPhysReg> &Hints,
341+
const MachineFunction &MF, const VirtRegMap *VRM,
342+
const LiveRegMatrix *Matrix) const override;
343+
332344
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
333345

334346
MCRegister getReturnAddressReg(const MachineFunction &MF) const;

llvm/test/CodeGen/AMDGPU/fadd.f16.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
7676
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
7777
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
7878
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
79-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
80-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
81-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
79+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
8280
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
8381
; GFX11-SDAG-NEXT: s_endpgm
8482
;

0 commit comments

Comments
 (0)