Commit 493a45c

AMDGPU: Fold more scalar operations on frame index to VALU
Further extend the workaround for the lack of proper regbankselect for frame indexes.
1 parent 5569880 commit 493a45c
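
As a minimal before/after sketch of the extended fold (drawn from the patterns exercised by the new MIR tests below), a scalar bitwise op on a frame index whose only use is a copy to a VGPR, e.g.

    %0:sreg_32 = S_OR_B32 %stack.0, 64, implicit-def dead $scc
    %1:vgpr_32 = COPY %0

is now rewritten directly to the VALU form

    %1:vgpr_32 = V_OR_B32_e64 64, %stack.0, implicit $exec

with S_AND_B32 and S_MUL_I32 handled the same way, in addition to the existing S_ADD_I32 case.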

2 files changed: +229 additions, -59 deletions

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 74 additions & 47 deletions
@@ -78,9 +78,25 @@ class SIFoldOperandsImpl {
   bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                          const MachineOperand &OpToFold) const;
 
-  /// Fold %vgpr = COPY (S_ADD_I32 x, frameindex)
-  ///
-  /// => %vgpr = V_ADD_U32 x, frameindex
+  // TODO: Just use TII::getVALUOp
+  unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
+    switch (Opc) {
+    case AMDGPU::S_ADD_I32: {
+      if (ST->hasAddNoCarry())
+        return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
+      return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+    }
+    case AMDGPU::S_OR_B32:
+      return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
+    case AMDGPU::S_AND_B32:
+      return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
+    case AMDGPU::S_MUL_I32:
+      return AMDGPU::V_MUL_LO_U32_e64;
+    default:
+      return AMDGPU::INSTRUCTION_LIST_END;
+    }
+  }
+
   bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
                                              MachineInstr &MI) const;
 
@@ -202,6 +218,8 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
   const unsigned Opc = UseMI.getOpcode();
   switch (Opc) {
   case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_AND_B32:
   case AMDGPU::V_ADD_U32_e32:
   case AMDGPU::V_ADD_CO_U32_e32:
     // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
@@ -238,53 +256,62 @@ bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
   if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
       MRI->hasOneNonDBGUse(SrcReg)) {
     MachineInstr *Def = MRI->getVRegDef(SrcReg);
-    if (Def && Def->getOpcode() == AMDGPU::S_ADD_I32 &&
-        Def->getOperand(3).isDead()) {
-      MachineOperand *Src0 = &Def->getOperand(1);
-      MachineOperand *Src1 = &Def->getOperand(2);
-
-      // TODO: This is profitable with more operand types, and for more
-      // opcodes. But ultimately this is working around poor / nonexistent
-      // regbankselect.
-      if (!Src0->isFI() && !Src1->isFI())
-        return false;
+    if (!Def || Def->getNumOperands() != 4)
+      return false;
 
-      if (Src0->isFI())
-        std::swap(Src0, Src1);
-
-      MachineBasicBlock *MBB = Def->getParent();
-      const DebugLoc &DL = Def->getDebugLoc();
-      if (ST->hasAddNoCarry()) {
-        bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
-        MachineInstrBuilder Add =
-            BuildMI(*MBB, *Def, DL,
-                    TII->get(UseVOP3 ? AMDGPU::V_ADD_U32_e64
-                                     : AMDGPU::V_ADD_U32_e32),
-                    DstReg)
-                .add(*Src0)
-                .add(*Src1)
-                .setMIFlags(Def->getFlags());
-        if (UseVOP3)
-          Add.addImm(0);
-
-        Def->eraseFromParent();
-        MI.eraseFromParent();
-        return true;
-      }
+    MachineOperand *Src0 = &Def->getOperand(1);
+    MachineOperand *Src1 = &Def->getOperand(2);
 
-      MachineBasicBlock::LivenessQueryResult Liveness =
-          MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
-      if (Liveness == MachineBasicBlock::LQR_Dead) {
-        // TODO: If src1 satisfies operand constraints, use vop3 version.
-        BuildMI(*MBB, *Def, DL, TII->get(AMDGPU::V_ADD_CO_U32_e32), DstReg)
-            .add(*Src0)
-            .add(*Src1)
-            .setOperandDead(3) // implicit-def $vcc
-            .setMIFlags(Def->getFlags());
-        Def->eraseFromParent();
-        MI.eraseFromParent();
-        return true;
+    // TODO: This is profitable with more operand types, and for more
+    // opcodes. But ultimately this is working around poor / nonexistent
+    // regbankselect.
+    if (!Src0->isFI() && !Src1->isFI())
+      return false;
+
+    if (Src0->isFI())
+      std::swap(Src0, Src1);
+
+    const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
+    unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
+    if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
+        !Def->getOperand(3).isDead()) // Check if scc is dead
+      return false;
+
+    MachineBasicBlock *MBB = Def->getParent();
+    const DebugLoc &DL = Def->getDebugLoc();
+    if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
+      MachineInstrBuilder Add =
+          BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg);
+
+      if (Add->getDesc().getNumDefs() == 2) {
+        Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
+        Add.addDef(CarryOutReg, RegState::Dead);
+        MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
       }
+
+      Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
+      if (AMDGPU::hasNamedOperand(NewOp, AMDGPU::OpName::clamp))
+        Add.addImm(0);
+
+      Def->eraseFromParent();
+      MI.eraseFromParent();
+      return true;
+    }
+
+    assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
+
+    MachineBasicBlock::LivenessQueryResult Liveness =
+        MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, *Def, 16);
+    if (Liveness == MachineBasicBlock::LQR_Dead) {
+      // TODO: If src1 satisfies operand constraints, use vop3 version.
+      BuildMI(*MBB, *Def, DL, TII->get(NewOp), DstReg)
+          .add(*Src0)
+          .add(*Src1)
+          .setOperandDead(3) // implicit-def $vcc
+          .setMIFlags(Def->getFlags());
+      Def->eraseFromParent();
+      MI.eraseFromParent();
      return true;
     }
   }
 
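A note on the behavior change visible in the GFX8 test checks below: on subtargets without add-no-carry, when the VOP3 form is usable the fold now emits V_ADD_CO_U32_e64 with the carry-out written to a dead virtual register that is hinted to VCC, instead of the VCC-clobbering V_ADD_CO_U32_e32, roughly

    %1:vgpr_32, dead %2:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec

which is why the GFX8 checks change from the e32 to the e64 form.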

llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir

Lines changed: 155 additions & 12 deletions
@@ -75,8 +75,8 @@ stack:
 body: |
   bb.0:
     ; GFX8-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
-    ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__fi_imm_copy_to_virt_vgpr
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -98,8 +98,8 @@ stack:
 body: |
   bb.0:
     ; GFX8-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
-    ; GFX8: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = nuw V_ADD_CO_U32_e32 64, %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = nuw V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__imm_fi_copy_to_virt_vgpr
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = nuw V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
@@ -202,8 +202,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{ $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__mov_fi_reg_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -239,8 +239,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{ $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__reg_copy_mov_fi_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -337,8 +337,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{ $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__fi_reg_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -371,8 +371,8 @@ body: |
     ; GFX8: liveins: $sgpr8
     ; GFX8-NEXT: {{ $}}
     ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
-    ; GFX8-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[COPY]], %stack.0, implicit-def dead $vcc, implicit $exec
-    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e32_]]
+    ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], %stack.0, 0, implicit $exec
+    ; GFX8-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_]]
     ;
     ; GFX9-LABEL: name: fold_s_add_i32__reg_fi_copy_to_virt_vgpr
     ; GFX9: liveins: $sgpr8
@@ -392,3 +392,146 @@ body: |
     %2:vgpr_32 = COPY %1
     SI_RETURN implicit %2
 ...
+
+---
+name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__mov_fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_OR_B32 %0, 128, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__const_copy_mov_fi_to_virt_vgpr
+    ; CHECK: [[V_OR_B32_e32_:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_OR_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_OR_B32 128, %0, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit %1
+    %0:sreg_32 = disjoint S_OR_B32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_or_b32__imm_fi_copy_to_virt_vgpr
+    ; CHECK: %1:vgpr_32 = disjoint V_OR_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit %1
+    %0:sreg_32 = disjoint S_OR_B32 64, %stack.0, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e64_]]
+    %0:sreg_32 = S_AND_B32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    %0:sreg_32 = S_AND_B32 %stack.0, 128, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_mul_i32__fi_imm_copy_to_virt_vgpr
+    ; CHECK: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 64, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+    %0:sreg_32 = S_MUL_I32 %stack.0, 64, implicit-def dead $scc
+    %1:vgpr_32 = COPY %0
+    SI_RETURN implicit %1
+...
+
+---
+name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    liveins: $sgpr4
+    ; CHECK-LABEL: name: fold_s_mul_i32__fi_reg_copy_to_virt_vgpr
+    ; CHECK: liveins: $sgpr4
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr4
+    ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[COPY]], %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_MUL_LO_U32_e64_]]
+    %0:sreg_32 = COPY $sgpr4
+    %1:sreg_32 = S_MUL_I32 %stack.0, %0, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
+
+---
+name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 16384, alignment: 4, local-offset: 0 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: fold_s_and_b32__mov_fi_const_copy_to_virt_vgpr
+    ; CHECK: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 128, %stack.0, implicit $exec
+    ; CHECK-NEXT: SI_RETURN implicit [[V_AND_B32_e32_]]
+    %0:sreg_32 = S_MOV_B32 %stack.0
+    %1:sreg_32 = S_AND_B32 %0, 128, implicit-def dead $scc
+    %2:vgpr_32 = COPY %1
+    SI_RETURN implicit %2
+...
