
Commit aa79412

AMDGPU: Fold copy of scalar add of frame index (#115058)
This is a pre-optimization to avoid a regression in a future commit. Currently we almost always emit the frame index with a v_mov_b32 and use vector adds for the pointer operations. We need to consider the users of the frame index (or rather, the transitive users of derived pointer operations) to know whether the value will be used in a vector or scalar context. This saves an sgpr->vgpr copy.

This optimization could be more general for any opcode that's trivially convertible from a scalar to a vector form (although ultimately this is a workaround for the lack of a proper regbankselect).
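For illustration, on a subtarget with carry-less VALU adds the fold turns a scalar add of a frame index whose only use is a copy into a VGPR into a single VALU add. The following is a minimal MIR sketch, not taken from the patch's test; the virtual register numbers, register classes, and stack object are invented, and the exact operand spelling may differ:

    Before (%0 holds some scalar offset):
      %1:sreg_32 = S_ADD_I32 %0, %stack.0, implicit-def dead $scc
      %2:vgpr_32 = COPY %1

    After (VOP3 form; the trailing 0 is the clamp operand):
      %2:vgpr_32 = V_ADD_U32_e64 %0, %stack.0, 0, implicit $exec

On older subtargets without add-no-carry, the fold instead emits V_ADD_CO_U32_e32 with a dead implicit-def of $vcc, and only when $vcc is known to be dead at that point.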
1 parent 29e467f commit aa79412

2 files changed (+468, −2 lines)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 74 additions & 2 deletions
@@ -78,6 +78,12 @@ class SIFoldOperandsImpl {
   bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                          const MachineOperand &OpToFold) const;
 
+  /// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
+  ///
+  ///   => %vgpr = V_ADD_U32 x, frameindex
+  bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
+                                             MachineInstr &MI) const;
+
   bool updateOperand(FoldCandidate &Fold) const;
 
   bool canUseImmWithOpSel(FoldCandidate &Fold) const;
@@ -224,6 +230,67 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
   return OpNo == VIdx && SIdx == -1;
 }
 
+/// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
+///
+///   => %vgpr = V_ADD_U32 x, frameindex
+bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
+    Register DstReg, Register SrcReg, MachineInstr &MI) const {
+  if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
+      MRI->hasOneNonDBGUse(SrcReg)) {
+    MachineInstr *Def = MRI->getVRegDef(SrcReg);
+    if (Def && Def->getOpcode() == AMDGPU::S_ADD_I32 &&
+        Def->getOperand(3).isDead()) {
+      MachineOperand *Src0 = &Def->getOperand(1);
+      MachineOperand *Src1 = &Def->getOperand(2);
+
+      // TODO: This is profitable with more operand types, and for more
+      // opcodes. But ultimately this is working around poor / nonexistent
+      // regbankselect.
+      if (!Src0->isFI() && !Src1->isFI())
+        return false;
+
+      if (Src0->isFI())
+        std::swap(Src0, Src1);
+
+      MachineBasicBlock *MBB = Def->getParent();
+      const DebugLoc &DL = Def->getDebugLoc();
+      if (ST->hasAddNoCarry()) {
+        bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
+        MachineInstrBuilder Add =
+            BuildMI(*MBB, *Def, DL,
+                    TII->get(UseVOP3 ? AMDGPU::V_ADD_U32_e64
+                                     : AMDGPU::V_ADD_U32_e32),
+                    DstReg)
+                .add(*Src0)
+                .add(*Src1)
+                .setMIFlags(Def->getFlags());
+        if (UseVOP3)
+          Add.addImm(0);
+
+        Def->eraseFromParent();
+        MI.eraseFromParent();
+        return true;
+      }
+
+      MachineBasicBlock::LivenessQueryResult Liveness =
+          MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
+      if (Liveness == MachineBasicBlock::LQR_Dead) {
+        // TODO: If src1 satisfies operand constraints, use vop3 version.
+        BuildMI(*MBB, *Def, DL, TII->get(AMDGPU::V_ADD_CO_U32_e32), DstReg)
+            .add(*Src0)
+            .add(*Src1)
+            .setOperandDead(3) // implicit-def $vcc
+            .setMIFlags(Def->getFlags());
+        Def->eraseFromParent();
+        MI.eraseFromParent();
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 FunctionPass *llvm::createSIFoldOperandsLegacyPass() {
   return new SIFoldOperandsLegacy();
 }
@@ -1470,9 +1537,10 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
 
 bool SIFoldOperandsImpl::tryFoldFoldableCopy(
     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
+  Register DstReg = MI.getOperand(0).getReg();
   // Specially track simple redefs of m0 to the same value in a block, so we
   // can erase the later ones.
-  if (MI.getOperand(0).getReg() == AMDGPU::M0) {
+  if (DstReg == AMDGPU::M0) {
     MachineOperand &NewM0Val = MI.getOperand(1);
     if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
       MI.eraseFromParent();
@@ -1504,13 +1572,17 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
     return false;
 
+  if (OpToFold.isReg() &&
+      foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
+    return true;
+
   // Prevent folding operands backwards in the function. For example,
   // the COPY opcode must not be replaced by 1 in this example:
   //
   //    %3 = COPY %vgpr0; VGPR_32:%3
   //    ...
   //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
-  if (!MI.getOperand(0).getReg().isVirtual())
+  if (!DstReg.isVirtual())
     return false;
 
   bool Changed = foldInstOperand(MI, OpToFold);
