Skip to content

Commit 2852618

Browse files
committed
[AMDGPU] Update base addr of dyn alloca considering GrowingUp stack (llvm#119822)
Currently, the compiler calculates the base address of a dynamically sized stack object (alloca) as follows: 1. `NewSP = Align(CurrSP + Size)` _where_ `Size = # of elements * wave size * alloca type` 2. `BaseAddr = NewSP` 3. The alignment is computed as: `AlignedAddr = Addr & ~(Alignment - 1)` 4. Return the `BaseAddr`. This makes sense when the stack grows downwards. The AMDGPU stack grows upwards, so the base address needs to be aligned first and the SP bumped by the required size afterwards: 1. `BaseAddr = Align(CurrSP)` 2. `NewSP = BaseAddr + Size` 3. The alignment is computed as: `AlignedAddr = (Addr + (Alignment - 1)) & ~(Alignment - 1)` 4. Return the `BaseAddr`.
1 parent e38c124 commit 2852618

File tree

7 files changed

+292
-190
lines changed

7 files changed

+292
-190
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,15 +1200,18 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
12001200
auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
12011201
auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
12021202

1203-
auto SPCopy = B.buildCopy(PtrTy, SPReg);
1203+
auto OldSP = B.buildCopy(PtrTy, SPReg);
12041204
if (Alignment > TFI.getStackAlign()) {
1205-
auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1206-
B.buildMaskLowPtrBits(Dst, PtrAdd,
1205+
auto StackAlignMask = (Alignment.value() << ST.getWavefrontSizeLog2()) - 1;
1206+
auto Tmp1 = B.buildPtrAdd(PtrTy, OldSP,
1207+
B.buildConstant(LLT::scalar(32), StackAlignMask));
1208+
B.buildMaskLowPtrBits(Dst, Tmp1,
12071209
Log2(Alignment) + ST.getWavefrontSizeLog2());
12081210
} else {
1209-
B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1211+
B.buildCopy(Dst, OldSP);
12101212
}
1211-
1213+
auto PtrAdd = B.buildPtrAdd(PtrTy, Dst, ScaledSize);
1214+
B.buildCopy(SPReg, PtrAdd);
12121215
MI.eraseFromParent();
12131216
return true;
12141217
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3990,10 +3990,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
39903990
InVals, /*IsThisReturn=*/false, SDValue());
39913991
}
39923992

3993-
// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3994-
// except for applying the wave size scale to the increment amount.
3995-
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3996-
SDValue Op, SelectionDAG &DAG) const {
3993+
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
3994+
// except for stack growth direction(default: downwards, AMDGPU: upwards) and
3995+
// applying the wave size scale to the increment amount.
3996+
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
3997+
SelectionDAG &DAG) const {
39973998
const MachineFunction &MF = DAG.getMachineFunction();
39983999
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
39994000

@@ -4010,31 +4011,36 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
40104011
// pointer when other instructions are using the stack.
40114012
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
40124013

4013-
SDValue Size = Tmp2.getOperand(1);
4014-
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4015-
Chain = SP.getValue(1);
4016-
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
4014+
SDValue Size = Tmp2.getOperand(1);
4015+
SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4016+
Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
4017+
40174018
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
40184019
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
40194020
"Stack grows upwards for AMDGPU");
40204021

4022+
Chain = BaseAddr.getValue(1);
4023+
Align StackAlign = TFL->getStackAlign();
4024+
if (Alignment > StackAlign) {
4025+
uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4026+
<< Subtarget->getWavefrontSizeLog2();
4027+
uint64_t StackAlignMask = ScaledAlignment - 1;
4028+
SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4029+
DAG.getConstant(StackAlignMask, dl, VT));
4030+
BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4031+
DAG.getConstant(-ScaledAlignment, dl, VT));
4032+
}
4033+
40214034
SDValue ScaledSize = DAG.getNode(
40224035
ISD::SHL, dl, VT, Size,
40234036
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
40244037

4025-
Align StackAlign = TFL->getStackAlign();
4026-
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SP, ScaledSize); // Value
4027-
if (Alignment && *Alignment > StackAlign) {
4028-
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
4029-
DAG.getConstant(-(uint64_t)Alignment->value()
4030-
<< Subtarget->getWavefrontSizeLog2(),
4031-
dl, VT));
4032-
}
4038+
SDValue NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
40334039

4034-
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4040+
Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
40354041
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
40364042

4037-
return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4043+
return DAG.getMergeValues({BaseAddr, Tmp2}, dl);
40384044
}
40394045

40404046
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,

0 commit comments

Comments
 (0)