Skip to content

Commit ba3d6e0

Browse files
authored
[AMDGPU] Rematerialize scalar loads (llvm#68778)
Extend the list of instructions that can be rematerialized in SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar loads. Try shrinking instructions so that only the part needed in the current context is rematerialized. Add the SIInstrInfo::reMaterialize target hook, and handle shrinking of S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.
1 parent 274ce88 commit ba3d6e0

File tree

5 files changed

+146
-127
lines changed

5 files changed

+146
-127
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
106106
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
107107
}
108108

109+
static bool canRemat(const MachineInstr &MI) {
110+
111+
if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
112+
SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
113+
SIInstrInfo::isSALU(MI))
114+
return true;
115+
116+
if (SIInstrInfo::isSMRD(MI)) {
117+
return !MI.memoperands_empty() &&
118+
llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
119+
return MMO->isLoad() && MMO->isInvariant();
120+
});
121+
}
122+
123+
return false;
124+
}
125+
109126
bool SIInstrInfo::isReallyTriviallyReMaterializable(
110127
const MachineInstr &MI) const {
111-
if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
128+
129+
if (canRemat(MI)) {
112130
// Normally VALU use of exec would block the rematerialization, but that
113131
// is OK in this case to have an implicit exec read as all VALU do.
114132
// We really want all of the generic logic for this except for this.
@@ -2434,6 +2452,92 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
24342452
return true;
24352453
}
24362454

2455+
// Rematerializes Orig in front of I in MBB so that it defines DestReg.
//
// For S_LOAD_DWORDX16_IMM and S_LOAD_DWORDX8_IMM the hook first tries to
// shrink the load: if the instruction at I has exactly one use of Orig's
// result register and that use reads a 256-bit or 128-bit subregister, a
// S_LOAD_DWORDX8_IMM or S_LOAD_DWORDX4_IMM covering only that part is emitted
// instead. In that case DestReg is given the register class of the narrower
// load's result and the use operand in *I is rewritten to read all of DestReg.
//
// In every other situation the request is forwarded unchanged to
// TargetInstrInfo::reMaterialize.
//
// Parameters:
//   MBB     - block that receives the new instruction.
//   I       - insertion point; also the instruction inspected for the use.
//   DestReg - register defined by the rematerialized instruction.
//   SubIdx  - subregister index requested by the caller; shrinking is only
//             attempted when it is 0.
//   Orig    - instruction being rematerialized.
//   RI      - register info used for subregister offset/size queries.
void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I, Register DestReg,
                                unsigned SubIdx, const MachineInstr &Orig,
                                const TargetRegisterInfo &RI) const {

  // Try shrinking the instruction to remat only the part needed for current
  // context.
  // TODO: Handle more cases.
  switch (Orig.getOpcode()) {
  case AMDGPU::S_LOAD_DWORDX16_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM: {
    // Shrinking needs a real, unbundled instruction at the insertion point
    // and no subregister requested by the caller. The checks are evaluated in
    // this order so that I is never dereferenced when it equals MBB.end().
    if (SubIdx != 0 || I == MBB.end() || I->isBundled())
      break;

    // Look for a single use of the register that is also a subreg. A second
    // use of the same register makes the rewrite ambiguous, so give up.
    const Register RegToFind = Orig.getOperand(0).getReg();
    MachineOperand *UseMO = nullptr;
    for (MachineOperand &CandMO : I->operands()) {
      if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
        continue;
      if (UseMO) {
        UseMO = nullptr;
        break;
      }
      UseMO = &CandMO;
    }
    if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
      break;

    // Position and width (both in bits) of the part that is actually read.
    const unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
    const unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());

    MachineFunction *MF = MBB.getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");

    // Only 256-bit and 128-bit parts have a matching narrower load here; any
    // other width falls back to the generic implementation. Nothing has been
    // modified up to this point.
    if (SubregSize != 256 && SubregSize != 128)
      break;
    const unsigned NewOpcode = SubregSize == 256
                                   ? AMDGPU::S_LOAD_DWORDX8_IMM
                                   : AMDGPU::S_LOAD_DWORDX4_IMM;

    const MCInstrDesc &TID = get(NewOpcode);
    const TargetRegisterClass *NewRC =
        RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
    MRI.setRegClass(DestReg, NewRC);

    // The narrow load produces exactly the bits the use needs, so the use now
    // reads the whole of DestReg rather than a subregister.
    UseMO->setReg(DestReg);
    UseMO->setSubReg(AMDGPU::NoSubRegister);

    // Use a smaller load with the desired size, possibly with updated offset.
    const unsigned ByteOffset = Offset / 8;
    const unsigned ByteSize = SubregSize / 8;

    MachineInstr *MI = MF->CloneMachineInstr(&Orig);
    MI->setDesc(TID);
    MI->getOperand(0).setReg(DestReg);
    MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
    if (ByteOffset) {
      MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
      assert(OffsetMO && "Scalar load with immediate must have an offset.");
      const int64_t FinalOffset = OffsetMO->getImm() + ByteOffset;
      OffsetMO->setImm(FinalOffset);
    }

    // Rebuild the memory operands for the narrowed access. The pointer info
    // is advanced by the same byte offset that was added to the instruction's
    // immediate so that each memory operand keeps describing the bytes the
    // new load really reads, not the start of the original wide access.
    SmallVector<MachineMemOperand *> NewMMOs;
    for (const MachineMemOperand *MemOp : Orig.memoperands())
      NewMMOs.push_back(MF->getMachineMemOperand(
          MemOp, MemOp->getPointerInfo().getWithOffset(ByteOffset), ByteSize));
    MI->setMemRefs(*MF, NewMMOs);

    MBB.insert(I, MI);
    return;
  }

  default:
    break;
  }

  TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
}
2540+
24372541
std::pair<MachineInstr*, MachineInstr*>
24382542
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
24392543
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
275275

276276
bool expandPostRAPseudo(MachineInstr &MI) const override;
277277

278+
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
279+
Register DestReg, unsigned SubIdx,
280+
const MachineInstr &Orig,
281+
const TargetRegisterInfo &TRI) const override;
282+
278283
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
279284
// instructions. Returns a pair of generated instructions.
280285
// Can split either post-RA with physical registers or pre-RA with

llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ entry:
4747
}
4848

4949
; CHECK: .name: num_spilled_sgprs
50-
; GFX700: .sgpr_spill_count: 38
51-
; GFX803: .sgpr_spill_count: 22
50+
; GFX700: .sgpr_spill_count: 12
51+
; GFX803: .sgpr_spill_count: 12
5252
; GFX900: .sgpr_spill_count: 48
5353
; GFX1010: .sgpr_spill_count: 48
5454
; CHECK: .symbol: num_spilled_sgprs.kd

0 commit comments

Comments
 (0)