Skip to content

Commit a39af9b

Browse files
committed
[AMDGPU][True16] add PreRA hint to improve elimination for 16bit
and 32bit register copy
1 parent aceb87d commit a39af9b

File tree

7 files changed

+181
-118
lines changed

7 files changed

+181
-118
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,21 @@
2222
/// although the same shall be possible with other register classes and
2323
/// instructions if necessary.
2424
///
25+
/// This pass also adds register allocation hints to COPY.
26+
/// The hints will be post-processed by SIRegisterInfo::getRegAllocationHints.
27+
/// When using True16, we often see COPY moving a 16-bit value between a VGPR_32
31+
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
32+
/// the VGPR_32, the COPY can be completely eliminated.
33+
///
2534
//===----------------------------------------------------------------------===//
2635

2736
#include "AMDGPU.h"
2837
#include "GCNSubtarget.h"
2938
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
39+
#include "SIRegisterInfo.h"
3040
#include "llvm/CodeGen/LiveIntervals.h"
3141
#include "llvm/CodeGen/MachineFunctionPass.h"
3242
#include "llvm/InitializePasses.h"
@@ -236,5 +246,38 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
236246
Changed |= processReg(Reg);
237247
}
238248

249+
if (!ST.useRealTrue16Insts())
250+
return Changed;
251+
252+
// Add RA hints to improve True16 COPY elimination.
253+
for (const MachineBasicBlock &MBB : MF) {
254+
for (const MachineInstr &MI : MBB) {
255+
if (MI.getOpcode() != AMDGPU::COPY)
256+
continue;
257+
Register Dst = MI.getOperand(0).getReg();
258+
Register Src = MI.getOperand(1).getReg();
259+
if (Dst.isVirtual() &&
260+
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
261+
Src.isPhysical() &&
262+
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
263+
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
264+
if (Src.isVirtual() &&
265+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
266+
Dst.isPhysical() &&
267+
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
268+
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
269+
if (!Dst.isVirtual() || !Src.isVirtual())
270+
continue;
271+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
272+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
273+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
274+
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
275+
}
276+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
277+
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
278+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
279+
}
280+
}
281+
239282
return Changed;
240283
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3327,6 +3327,73 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
33273327
return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
33283328
}
33293329

3330+
/// Compute register-allocation hints for \p VirtReg.
///
/// Handles the two True16 pairing hint types recorded on the virtual register
/// (AMDGPURI::Size32 and AMDGPURI::Size16). Any other hint type is forwarded
/// unchanged to the generic TargetRegisterInfo implementation.
///
/// \param VirtReg virtual register being allocated.
/// \param Order   allocation order for VirtReg's register class.
/// \param Hints   [out] physical registers to try first; hints are appended.
/// \param MF      function being allocated.
/// \param VRM     current virtual-to-physical assignment; may be null.
/// \param Matrix  only forwarded to the generic implementation.
/// \returns false for the True16 hint types; otherwise the result of the
///          generic implementation.
bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
                                           ArrayRef<MCPhysReg> Order,
                                           SmallVectorImpl<MCPhysReg> &Hints,
                                           const MachineFunction &MF,
                                           const VirtRegMap *VRM,
                                           const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);

  // Map the paired register to the physical register it currently occupies.
  // Yields an invalid Register when the pair is virtual and unassigned (or
  // when no VirtRegMap was supplied).
  const auto ResolvePhys = [VRM](Register Reg) -> Register {
    if (Reg.isPhysical())
      return Reg;
    if (VRM && VRM->hasPhys(Reg))
      return VRM->getPhys(Reg);
    return Register();
  };

  // Hints handed back to the allocator must be members of Order. The register
  // derived from the pair belongs to a different virtual register's
  // assignment, so it is not guaranteed to be in VirtReg's allocation order.
  const auto IsInOrder = [Order](Register PhysReg) {
    if (!PhysReg.isValid())
      return false;
    for (MCPhysReg Candidate : Order)
      if (Candidate == PhysReg)
        return true;
    return false;
  };

  switch (Hint.first) {
  case AMDGPURI::Size32: {
    // VirtReg is 32 bits wide and paired with a 16-bit register: prefer the
    // VGPR_32 whose lo16 half is the register the pair lives in.
    const Register Paired = Hint.second;
    assert(Paired && "Size32 hint must name a paired register");

    Register PairedPhys = ResolvePhys(Paired);
    if (PairedPhys)
      // getMatchingSuperReg only succeeds when the pair sits in a lo16 half,
      // so no separate isLo() check is needed.
      PairedPhys = getMatchingSuperReg(PairedPhys, AMDGPU::lo16,
                                       &AMDGPU::VGPR_32RegClass);

    if (IsInOrder(PairedPhys))
      Hints.push_back(PairedPhys);
    return false;
  }
  case AMDGPURI::Size16: {
    // VirtReg is 16 bits wide and paired with a 32-bit register: prefer the
    // lo16 half of the register the pair lives in.
    const Register Paired = Hint.second;
    assert(Paired && "Size16 hint must name a paired register");

    Register PairedPhys = ResolvePhys(Paired);
    if (PairedPhys)
      PairedPhys = getSubReg(PairedPhys, AMDGPU::lo16);

    if (PairedPhys) {
      // First prefer the paired physreg, provided it is allocatable here.
      if (IsInOrder(PairedPhys))
        Hints.push_back(PairedPhys);
      return false;
    }

    // The pair has no physreg yet. Steer VirtReg towards any lo16 register:
    // the pair may later be assigned the overlapping 32-bit register, at
    // which point the COPY can be eliminated.
    for (MCPhysReg PhysReg : Order) {
      if (AMDGPU::isHi(PhysReg, *this))
        continue;
      if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
          !MRI.isReserved(PhysReg))
        Hints.push_back(PhysReg);
    }
    return false;
  }
  default:
    return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                     VRM, Matrix);
  }
}
3396+
33303397
MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
33313398
// Not a callee saved register.
33323399
return AMDGPU::SGPR30_SGPR31;

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ class LiveRegUnits;
2929
class RegisterBank;
3030
struct SGPRSpillBuilder;
3131

32+
/// Register allocation hint types. Helps eliminate unneeded COPY with True16.
///
/// These values are stored as the hint type of a virtual register by
/// GCNPreRAOptimizations and interpreted by
/// SIRegisterInfo::getRegAllocationHints.
33+
namespace AMDGPURI {
34+
35+
// Size16: set on a VGPR_16 virtual register paired with a VGPR_32 register;
//         the lo16 sub-register of the pair's physical register is preferred.
// Size32: set on a VGPR_32 virtual register paired with a VGPR_16 register;
//         the VGPR_32 whose lo16 half holds the pair is preferred.
enum { Size16 = 1, Size32 = 2 };
36+
37+
} // end namespace AMDGPURI
38+
3239
class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
3340
private:
3441
const GCNSubtarget &ST;
@@ -326,6 +333,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
326333
unsigned getRegPressureSetLimit(const MachineFunction &MF,
327334
unsigned Idx) const override;
328335

336+
bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
337+
SmallVectorImpl<MCPhysReg> &Hints,
338+
const MachineFunction &MF, const VirtRegMap *VRM,
339+
const LiveRegMatrix *Matrix) const override;
340+
329341
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
330342

331343
MCRegister getReturnAddressReg(const MachineFunction &MF) const;

llvm/test/CodeGen/AMDGPU/fadd.f16.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,7 @@ define amdgpu_kernel void @fadd_f16(
7676
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
7777
; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
7878
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
79-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
80-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
81-
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
79+
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
8280
; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
8381
; GFX11-SDAG-NEXT: s_nop 0
8482
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,9 @@ define amdgpu_kernel void @ceil_v2f16(
164164
; GFX11-NEXT: s_waitcnt vmcnt(0)
165165
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
166166
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
167-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
168-
; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l
169-
; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
170167
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
171-
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
172-
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
168+
; GFX11-NEXT: v_ceil_f16_e32 v1.l, v1.l
169+
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
173170
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
174171
; GFX11-NEXT: s_nop 0
175172
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -165,12 +165,9 @@ define amdgpu_kernel void @floor_v2f16(
165165
; GFX11-NEXT: s_waitcnt vmcnt(0)
166166
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
167167
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
168-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
169-
; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l
170-
; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
171168
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
172-
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
173-
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
169+
; GFX11-NEXT: v_floor_f16_e32 v1.l, v1.l
170+
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
174171
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
175172
; GFX11-NEXT: s_nop 0
176173
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

0 commit comments

Comments
 (0)