Skip to content

AMDGPU: Support VALU add instructions in localstackalloc #101692

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,23 @@ int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const {
switch (MI->getOpcode()) {
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_U32_e64:
case AMDGPU::V_ADD_CO_U32_e32: {
int OtherIdx = Idx == 1 ? 2 : 1;
const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
return OtherOp.isImm() ? OtherOp.getImm() : 0;
}
case AMDGPU::V_ADD_CO_U32_e64: {
int OtherIdx = Idx == 2 ? 3 : 2;
const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
return OtherOp.isImm() ? OtherOp.getImm() : 0;
}
default:
break;
}

if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return 0;

Expand All @@ -809,7 +826,60 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
return getScratchInstrOffset(MI);
}

static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
const MachineInstr &MI) {
assert(MI.getDesc().isAdd());
const MachineOperand &Src0 = MI.getOperand(1);
const MachineOperand &Src1 = MI.getOperand(2);

if (Src0.isFI()) {
return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
Src1.getReg()));
}

if (Src1.isFI()) {
return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
Src0.getReg()));
}

return false;
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
// TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
switch (MI->getOpcode()) {
case AMDGPU::V_ADD_U32_e32: {
// TODO: We could handle this but it requires work to avoid violating
// operand restrictions.
if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
!isFIPlusImmOrVGPR(*this, *MI))
return false;
[[fallthrough]];
}
case AMDGPU::V_ADD_U32_e64:
// FIXME: This optimization is barely profitable with enableFlatScratch as-is.
//
// Much of the benefit with the MUBUF handling is we avoid duplicating the
// shift of the frame register, which isn't needed with scratch.
//
// materializeFrameBaseRegister doesn't know the register classes of the
// uses, and unconditionally uses an s_add_i32, which will end up using a
// copy for the vector uses.
return !ST.enableFlatScratch();
case AMDGPU::V_ADD_CO_U32_e32:
if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
!isFIPlusImmOrVGPR(*this, *MI))
return false;
// We can't deal with the case where the carry out has a use (though this
// should never happen)
return MI->getOperand(3).isDead();
case AMDGPU::V_ADD_CO_U32_e64:
// TODO: Should we check use_empty instead?
return MI->getOperand(1).isDead();
default:
break;
}

if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return false;

Expand Down Expand Up @@ -860,6 +930,8 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx);

if (ST.enableFlatScratch() ) {
// FIXME: Mark scc as dead
// FIXME: Make sure scc isn't live in.
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
.addReg(OffsetReg, RegState::Kill)
.addReg(FIReg);
Expand All @@ -877,6 +949,86 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
const SIInstrInfo *TII = ST.getInstrInfo();

switch (MI.getOpcode()) {
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_CO_U32_e32: {
MachineOperand *FIOp = &MI.getOperand(2);
MachineOperand *ImmOp = &MI.getOperand(1);
if (!FIOp->isFI())
std::swap(FIOp, ImmOp);

if (!ImmOp->isImm()) {
assert(Offset == 0);
FIOp->ChangeToRegister(BaseReg, false);
TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
return;
}

int64_t TotalOffset = ImmOp->getImm() + Offset;
if (TotalOffset == 0) {
MI.setDesc(TII->get(AMDGPU::COPY));
for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
MI.removeOperand(I);

MI.getOperand(1).ChangeToRegister(BaseReg, false);
return;
}

ImmOp->setImm(TotalOffset);

MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();

// FIXME: materializeFrameBaseRegister does not know the register class of
// the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
// a copy so we have a legal operand and hope the register coalescer can
// clean it up.
if (isSGPRReg(MRI, BaseReg)) {
Register BaseRegVGPR =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
.addReg(BaseReg);
MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
} else {
MI.getOperand(2).ChangeToRegister(BaseReg, false);
}
return;
}
case AMDGPU::V_ADD_U32_e64:
case AMDGPU::V_ADD_CO_U32_e64: {
int Src0Idx = MI.getNumExplicitDefs();
MachineOperand *FIOp = &MI.getOperand(Src0Idx);
MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
if (!FIOp->isFI())
std::swap(FIOp, ImmOp);

if (!ImmOp->isImm()) {
FIOp->ChangeToRegister(BaseReg, false);
TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
return;
}

int64_t TotalOffset = ImmOp->getImm() + Offset;
if (TotalOffset == 0) {
MI.setDesc(TII->get(AMDGPU::COPY));

for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
MI.removeOperand(I);

MI.getOperand(1).ChangeToRegister(BaseReg, false);
} else {
FIOp->ChangeToRegister(BaseReg, false);
ImmOp->setImm(TotalOffset);
}

return;
}
default:
break;
}

bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
Expand Down Expand Up @@ -925,6 +1077,18 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
Register BaseReg,
int64_t Offset) const {

switch (MI->getOpcode()) {
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_CO_U32_e32:
return true;
case AMDGPU::V_ADD_U32_e64:
case AMDGPU::V_ADD_CO_U32_e64:
return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
default:
break;
}

if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
return false;

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -640,12 +640,12 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <


let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
let isCommutable = 1, isAdd = 1 in {
def S_ADD_U32 : SOP2_32 <"s_add_u32">;
def S_ADD_I32 : SOP2_32 <"s_add_i32",
[(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))]
>;
} // End isCommutable = 1
} // End isCommutable = 1, isAdd = 1

def S_SUB_U32 : SOP2_32 <"s_sub_u32">;
def S_SUB_I32 : SOP2_32 <"s_sub_i32",
Expand Down
12 changes: 10 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,11 @@ def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;

// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>;

let isAdd = 1 in {
defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>;
}

defm V_SUB_CO_U32 : VOP2bInst <"v_sub_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>;
defm V_SUBREV_CO_U32 : VOP2bInst <"v_subrev_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>;
defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>;
Expand All @@ -772,7 +776,11 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f


let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in {
defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>;

let isAdd = 1 in {
defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>;
}

defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s

---
name: local_stack_alloc__v_add_u32_e64__literal_offsets
tracksRegLiveness: true
stack:
- { id: 0, size: 4096, alignment: 4 }
machineFunctionInfo:
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
frameOffsetReg: '$sgpr33'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets
; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]]
; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
; GFX10-NEXT: SI_RETURN
;
; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets
; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
; GFX12-NEXT: SI_RETURN
%0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0
%1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1
SI_RETURN

...

---
name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
tracksRegLiveness: true
stack:
- { id: 0, size: 4096, alignment: 4 }
machineFunctionInfo:
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
frameOffsetReg: '$sgpr33'
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]]
; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
; GFX10-NEXT: SI_RETURN
;
; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]]
; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
; GFX12-NEXT: SI_RETURN
%0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0
%1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1
%2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %2
SI_RETURN

...

Loading
Loading