Commit 0a62980

AMDGPU: Support VALU add instructions in localstackalloc (#101692)
Pre-enable this optimization before allowing folds of frame indexes into add instructions. Disables this fold when using scratch instructions for now. I see some code size improvements with it, but the optimization needs to be smarter about the uses depending on the register classes.
1 parent a821fee commit 0a62980
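
In rough terms (adapted from the GFX10 checks in the MIR test added below), the pass can now recognize a VALU add of a frame index and resolve other adds of the same stack object against a materialized base register. A sketch, with placeholder register names rather than verbatim pass output:

  # Input: two adds of the same stack object at different literal offsets.
  %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
  %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec

  # After localstackalloc on a target without VOP3 literals (e.g. gfx1030):
  # a base register is materialized once at %stack.0 + 256, and the second
  # add is rewritten as base + 256 instead of re-expanding the frame index.
  %base:vgpr_32 = V_ADD_U32_e64 killed %off256, %fi, 0, implicit $exec
  %1:vgpr_32 = V_ADD_U32_e64 %base, 256, 0, implicit $exec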

File tree

6 files changed: +1642 −4 lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 164 additions & 0 deletions
@@ -797,6 +797,23 @@ int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
 
 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                  int Idx) const {
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e32: {
+    int OtherIdx = Idx == 1 ? 2 : 1;
+    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+    return OtherOp.isImm() ? OtherOp.getImm() : 0;
+  }
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    int OtherIdx = Idx == 2 ? 3 : 2;
+    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
+    return OtherOp.isImm() ? OtherOp.getImm() : 0;
+  }
+  default:
+    break;
+  }
+
   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return 0;
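
The index arithmetic above follows operand order: V_ADD_U32_e32, V_ADD_U32_e64, and V_ADD_CO_U32_e32 have a single explicit def (the e32 carry variant writes $vcc implicitly), so src0/src1 sit at operands 1 and 2, while V_ADD_CO_U32_e64 also explicitly defines a carry-out, shifting the sources to 2 and 3. A minimal MIR sketch of the two shapes (the carry-out register class shown assumes a wave64 target):

  # One explicit def: src0 = operand 1, src1 = operand 2.
  %sum:vgpr_32 = V_ADD_U32_e64 %a, %b, 0, implicit $exec
  # Two explicit defs (vdst, carry-out): src0 = operand 2, src1 = operand 3.
  %sum:vgpr_32, %carry:sreg_64_xexec = V_ADD_CO_U32_e64 %a, %b, 0, implicit $exec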

@@ -809,7 +826,60 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
   return getScratchInstrOffset(MI);
 }
 
+static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
+                              const MachineInstr &MI) {
+  assert(MI.getDesc().isAdd());
+  const MachineOperand &Src0 = MI.getOperand(1);
+  const MachineOperand &Src1 = MI.getOperand(2);
+
+  if (Src0.isFI()) {
+    return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
+                                                       Src1.getReg()));
+  }
+
+  if (Src1.isFI()) {
+    return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
+                                                       Src0.getReg()));
+  }
+
+  return false;
+}
+
 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_ADD_U32_e32: {
+    // TODO: We could handle this but it requires work to avoid violating
+    // operand restrictions.
+    if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
+        !isFIPlusImmOrVGPR(*this, *MI))
+      return false;
+    [[fallthrough]];
+  }
+  case AMDGPU::V_ADD_U32_e64:
+    // FIXME: This optimization is barely profitable with enableFlatScratch
+    // as-is.
+    //
+    // Much of the benefit with the MUBUF handling is we avoid duplicating the
+    // shift of the frame register, which isn't needed with scratch.
+    //
+    // materializeFrameBaseRegister doesn't know the register classes of the
+    // uses, and unconditionally uses an s_add_i32, which will end up using a
+    // copy for the vector uses.
+    return !ST.enableFlatScratch();
+  case AMDGPU::V_ADD_CO_U32_e32:
+    if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
+        !isFIPlusImmOrVGPR(*this, *MI))
+      return false;
+    // We can't deal with the case where the carry out has a use (though this
+    // should never happen).
+    return MI->getOperand(3).isDead();
+  case AMDGPU::V_ADD_CO_U32_e64:
+    // TODO: Should we check use_empty instead?
+    return MI->getOperand(1).isDead();
+  default:
+    break;
+  }
+
   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return false;
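
The constant-bus guard filters out operand shapes that would become unencodable once the frame index turns into an SGPR base: a frame index plus an immediate or a VGPR stays foldable, while a frame index plus an SGPR would demand two constant-bus reads from a VOP2 encoding. A sketch of the distinction as I read isFIPlusImmOrVGPR (%v and %s are placeholder VGPR/SGPR values):

  # Foldable: frame index plus a VGPR (or an immediate).
  %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %v:vgpr_32, implicit $exec
  # Rejected when getConstantBusLimit() < 2: after the frame index is
  # replaced with an SGPR base, this would need two constant-bus operands.
  %1:vgpr_32 = V_ADD_U32_e32 %s:sreg_32, %stack.0, implicit $exec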

@@ -860,6 +930,8 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
       .addFrameIndex(FrameIdx);
 
   if (ST.enableFlatScratch() ) {
+    // FIXME: Mark scc as dead
+    // FIXME: Make sure scc isn't live in.
     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
         .addReg(OffsetReg, RegState::Kill)
         .addReg(FIReg);
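
For flat-scratch targets the base register is formed with a scalar add, which implicitly defines $scc; the FIXMEs note the def still needs to be marked dead (and $scc checked for liveness at the insertion point). Roughly, the intended MIR shape (placeholder names):

  %base:sreg_32 = S_ADD_I32 killed %offset, %fi, implicit-def dead $scc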
@@ -877,6 +949,86 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                        int64_t Offset) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e32: {
+    MachineOperand *FIOp = &MI.getOperand(2);
+    MachineOperand *ImmOp = &MI.getOperand(1);
+    if (!FIOp->isFI())
+      std::swap(FIOp, ImmOp);
+
+    if (!ImmOp->isImm()) {
+      assert(Offset == 0);
+      FIOp->ChangeToRegister(BaseReg, false);
+      TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
+      return;
+    }
+
+    int64_t TotalOffset = ImmOp->getImm() + Offset;
+    if (TotalOffset == 0) {
+      MI.setDesc(TII->get(AMDGPU::COPY));
+      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
+        MI.removeOperand(I);
+
+      MI.getOperand(1).ChangeToRegister(BaseReg, false);
+      return;
+    }
+
+    ImmOp->setImm(TotalOffset);
+
+    MachineBasicBlock *MBB = MI.getParent();
+    MachineFunction *MF = MBB->getParent();
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    // FIXME: materializeFrameBaseRegister does not know the register class of
+    // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
+    // a copy so we have a legal operand and hope the register coalescer can
+    // clean it up.
+    if (isSGPRReg(MRI, BaseReg)) {
+      Register BaseRegVGPR =
+          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
+          .addReg(BaseReg);
+      MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
+    } else {
+      MI.getOperand(2).ChangeToRegister(BaseReg, false);
+    }
+    return;
+  }
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e64: {
+    int Src0Idx = MI.getNumExplicitDefs();
+    MachineOperand *FIOp = &MI.getOperand(Src0Idx);
+    MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
+    if (!FIOp->isFI())
+      std::swap(FIOp, ImmOp);
+
+    if (!ImmOp->isImm()) {
+      FIOp->ChangeToRegister(BaseReg, false);
+      TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
+      return;
+    }
+
+    int64_t TotalOffset = ImmOp->getImm() + Offset;
+    if (TotalOffset == 0) {
+      MI.setDesc(TII->get(AMDGPU::COPY));
+
+      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
+        MI.removeOperand(I);
+
+      MI.getOperand(1).ChangeToRegister(BaseReg, false);
+    } else {
+      FIOp->ChangeToRegister(BaseReg, false);
+      ImmOp->setImm(TotalOffset);
+    }
+
+    return;
+  }
+  default:
+    break;
+  }
+
   bool IsFlat = TII->isFLATScratch(MI);
 
 #ifndef NDEBUG
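
One case worth calling out: when the instruction's immediate and the incoming Offset cancel, the add degenerates into a plain COPY of the base register. The commuted MIR test below hits exactly this: with the base materialized at %stack.0 + 256, the use "V_ADD_U32_e64 256, %stack.0" resolves with Offset = -256, so TotalOffset is 0 and the GFX10 output is a COPY:

  # Before: %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
  # After resolveFrameIndex with BaseReg = %base and Offset = -256:
  %0:vgpr_32 = COPY %base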
@@ -925,6 +1077,18 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                         Register BaseReg,
                                         int64_t Offset) const {
+
+  switch (MI->getOpcode()) {
+  case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_CO_U32_e32:
+    return true;
+  case AMDGPU::V_ADD_U32_e64:
+  case AMDGPU::V_ADD_CO_U32_e64:
+    return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
+  default:
+    break;
+  }
+
 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
   return false;
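
The e32/e64 split here reflects encoding limits: the e64 forms can only carry a literal offset on targets with VOP3 literals; otherwise the offset must be an inline constant. This is why the two RUN configurations in the test below diverge — 256 is not inlinable, so gfx1030 cannot keep it in the e64 add while gfx1200 can:

  # gfx1200 (hasVOP3Literal): the literal offset is legal as-is.
  %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
  # gfx1030: 256 is not an inline constant, so isFrameOffsetLegal returns
  # false and the offset ends up folded into the materialized base instead.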

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 2 additions & 2 deletions
@@ -640,12 +640,12 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
 
 
 let Defs = [SCC] in { // Carry out goes to SCC
-let isCommutable = 1 in {
+let isCommutable = 1, isAdd = 1 in {
 def S_ADD_U32 : SOP2_32 <"s_add_u32">;
 def S_ADD_I32 : SOP2_32 <"s_add_i32",
   [(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))]
 >;
-} // End isCommutable = 1
+} // End isCommutable = 1, isAdd = 1
 
 def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
 def S_SUB_I32 : SOP2_32 <"s_sub_i32",

llvm/lib/Target/AMDGPU/VOP2Instructions.td

Lines changed: 10 additions & 2 deletions
@@ -763,7 +763,11 @@ def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
 
 // No patterns so that the scalar instructions are always selected.
 // The scalar versions will be replaced with vector when needed later.
-defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>;
+
+let isAdd = 1 in {
+defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>;
+}
+
 defm V_SUB_CO_U32 : VOP2bInst <"v_sub_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>;
 defm V_SUBREV_CO_U32 : VOP2bInst <"v_subrev_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>;
 defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>;
@@ -772,7 +776,11 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f
 
 
 let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in {
-defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>;
+
+let isAdd = 1 in {
+defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>;
+}
+
 defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
 defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
 }
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+
+---
+name: local_stack_alloc__v_add_u32_e64__literal_offsets
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets
+    ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256
+    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: SI_RETURN
+    ;
+    ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets
+    ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: SI_RETURN
+    %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0
+    %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1
+    SI_RETURN
+
+...
+
+---
+name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+    ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
+    ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256
+    ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]]
+    ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec
+    ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
+    ; GFX10-NEXT: SI_RETURN
+    ;
+    ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
+    ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
+    ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
+    ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
+    ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
+    ; GFX12-NEXT: SI_RETURN
+    %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0
+    %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1
+    %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
+    INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %2
+    SI_RETURN
+
+...
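
The two RUN configurations are the interesting axis of this test: gfx1030 lacks VOP3 literals, so the literal offsets force a materialized base plus small re-adds (or a COPY when the offsets cancel), while gfx1200 can keep each literal directly in the e64 add and leaves the frame indexes in place.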
