Skip to content

[AMDGPU] Support divergent sized dynamic alloca #121148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1190,9 +1190,13 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(

const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);

// TODO: Need to emit a wave reduction to get the maximum size.
if (SizeBank != &AMDGPU::SGPRRegBank)
return false;
if (SizeBank != &AMDGPU::SGPRRegBank) {
auto WaveReduction =
B.buildIntrinsic(Intrinsic::amdgcn_wave_reduce_umax, {LLT::scalar(32)})
.addUse(AllocSize)
.addImm(0);
AllocSize = WaveReduction.getReg(0);
}

LLT PtrTy = MRI.getType(Dst);
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
Expand Down
67 changes: 35 additions & 32 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4017,29 +4017,26 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}

// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for stack growth direction(default: downwards, AMDGPU: upwards) and
// applying the wave size scale to the increment amount.
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
SelectionDAG &DAG) const {
// except for:
// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
const MachineFunction &MF = DAG.getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

SDLoc dl(Op);
EVT VT = Op.getValueType();
SDValue Tmp1 = Op;
SDValue Tmp2 = Op.getValue(1);
SDValue Tmp3 = Op.getOperand(2);
SDValue Chain = Tmp1.getOperand(0);

SDValue Chain = Op.getOperand(0);
Register SPReg = Info->getStackPtrOffsetReg();

// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

SDValue Size = Tmp2.getOperand(1);
SDValue Size = Op.getOperand(1);
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();

const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
Expand All @@ -4057,30 +4054,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
DAG.getSignedConstant(-ScaledAlignment, dl, VT));
}

SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
SDValue NewSP;
if (isa<ConstantSDNode>(Size)) {
// For constant sized alloca, scale alloca size by wave-size
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
} else {
// For dynamic sized alloca, perform wave-wide reduction to get max of
// alloca size(divergent) and then scale it by wave-size
SDValue WaveReduction =
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
Size, DAG.getConstant(0, dl, MVT::i32));
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
NewSP =
DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
SDValue ReadFirstLaneID =
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
NewSP);
}

Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
}

SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
// We only handle constant sizes here to allow non-entry block, static sized
// allocas. A truly dynamic value is more difficult to support because we
// don't know if the size value is uniform or not. If the size isn't uniform,
// we would need to do a wave reduction to get the maximum size to know how
// much to increment the uniform stack pointer.
SDValue Size = Op.getOperand(1);
if (isa<ConstantSDNode>(Size))
return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);

return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
}

SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;

SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
Expand Down
72 changes: 0 additions & 72 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll

This file was deleted.

129 changes: 129 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir
Original file line number Diff line number Diff line change
Expand Up @@ -491,3 +491,132 @@ body: |
%1:_(p5) = G_DYN_STACKALLOC %0, 32
S_ENDPGM 0, implicit %1
...

---
name: test_dyn_stackalloc_vgpr_align4
legalized: true
frameInfo:
maxAlignment: 4
stack:
- { id: 0, type: variable-sized, alignment: 4 }
body: |
bb.0:
liveins: $vgpr0

; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align4
; WAVE64: liveins: $vgpr0
; WAVE64-NEXT: {{ $}}
; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
;
; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align4
; WAVE32: liveins: $vgpr0
; WAVE32-NEXT: {{ $}}
; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $vgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 4
S_ENDPGM 0, implicit %1
...

---
name: test_dyn_stackalloc_vgpr_align16
legalized: true
frameInfo:
maxAlignment: 16
stack:
- { id: 0, type: variable-sized, alignment: 16 }
body: |
bb.0:
liveins: $vgpr0

; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align16
; WAVE64: liveins: $vgpr0
; WAVE64-NEXT: {{ $}}
; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
; WAVE64-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
;
; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align16
; WAVE32: liveins: $vgpr0
; WAVE32-NEXT: {{ $}}
; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
; WAVE32-NEXT: [[COPY2:%[0-9]+]]:sgpr(p5) = COPY [[COPY1]](p5)
; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY2]], [[SHL]](s32)
; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY2]](p5)
%0:_(s32) = COPY $vgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 16
S_ENDPGM 0, implicit %1
...

---
name: test_dyn_stackalloc_vgpr_align64
legalized: true
frameInfo:
maxAlignment: 64
stack:
- { id: 0, type: variable-sized, alignment: 64 }
body: |
bb.0:
liveins: $vgpr0

; WAVE64-LABEL: name: test_dyn_stackalloc_vgpr_align64
; WAVE64: liveins: $vgpr0
; WAVE64-NEXT: {{ $}}
; WAVE64-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE64-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
; WAVE64-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6
; WAVE64-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
; WAVE64-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4095
; WAVE64-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
; WAVE64-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096
; WAVE64-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
; WAVE64-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
; WAVE64-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE64-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
;
; WAVE32-LABEL: name: test_dyn_stackalloc_vgpr_align64
; WAVE32: liveins: $vgpr0
; WAVE32-NEXT: {{ $}}
; WAVE32-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
; WAVE32-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.wave.reduce.umax), [[COPY]](s32), 0
; WAVE32-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5
; WAVE32-NEXT: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[INTRINSIC_CONVERGENT]], [[C]](s32)
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg
; WAVE32-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2047
; WAVE32-NEXT: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[C1]](s32)
; WAVE32-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048
; WAVE32-NEXT: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32)
; WAVE32-NEXT: [[PTR_ADD1:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[PTRMASK]], [[SHL]](s32)
; WAVE32-NEXT: $sp_reg = COPY [[PTR_ADD1]](p5)
; WAVE32-NEXT: S_ENDPGM 0, implicit [[PTRMASK]](p5)
%0:_(s32) = COPY $vgpr0
%1:_(p5) = G_DYN_STACKALLOC %0, 64
S_ENDPGM 0, implicit %1
...
Loading
Loading