Skip to content

Commit 2d06d3a

Browse files
committed
Fix the base address of dynamically sized stack objects when the stack grows upwards
1 parent e7303fe commit 2d06d3a

File tree

2 files changed

+45
-42
lines changed

2 files changed

+45
-42
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4016,8 +4016,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
40164016
InVals, /*IsThisReturn=*/false, SDValue());
40174017
}
40184018

4019-
// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020-
// except for applying the wave size scale to the increment amount.
4019+
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4020+
// except for stack growth direction (default: downwards, AMDGPU: upwards) and
4021+
// applying the wave size scale to the increment amount.
40214022
SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40224023
SelectionDAG &DAG) const {
40234024
const MachineFunction &MF = DAG.getMachineFunction();
@@ -4037,19 +4038,29 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40374038
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
40384039

40394040
SDValue Size = Tmp2.getOperand(1);
4040-
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4041-
Chain = SP.getValue(1);
4041+
SDValue SPOld = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4042+
Chain = SPOld.getValue(1);
40424043
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
40434044
const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
40444045
assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
40454046
"Stack grows upwards for AMDGPU");
4047+
Align StackAlign = TFL->getStackAlign();
4048+
if (Alignment && *Alignment > StackAlign) {
4049+
SDValue ScaledAlignment = DAG.getSignedConstant(
4050+
(uint64_t)Alignment->value() << Subtarget->getWavefrontSizeLog2(), dl,
4051+
VT);
4052+
SDValue StackAlignMask = DAG.getNode(ISD::SUB, dl, VT, ScaledAlignment,
4053+
DAG.getConstant(1, dl, VT));
4054+
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, StackAlignMask);
4055+
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, ScaledAlignment);
4056+
}
40464057

40474058
SDValue ScaledSize = DAG.getNode(
40484059
ISD::SHL, dl, VT, Size,
40494060
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
40504061

40514062
Align StackAlign = TFL->getStackAlign();
4052-
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SP, ScaledSize); // Value
4063+
Tmp1 = DAG.getNode(ISD::ADD, dl, VT, SPOld, ScaledSize); // Value
40534064
if (Alignment && *Alignment > StackAlign) {
40544065
Tmp1 = DAG.getNode(
40554066
ISD::AND, dl, VT, Tmp1,
@@ -4061,7 +4072,7 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(SDValue Op,
40614072
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
40624073
Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
40634074

4064-
return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4075+
return DAG.getMergeValues({SPOld, Tmp2}, dl);
40654076
}
40664077

40674078
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,

llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
3030
; MUBUF-NEXT: s_cmp_lg_u32 s9, 0
3131
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3
3232
; MUBUF-NEXT: ; %bb.2: ; %bb.1
33-
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
34-
; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
35-
; MUBUF-NEXT: s_mov_b32 s32, s6
33+
; MUBUF-NEXT: s_mov_b32 s6, s32
3634
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
37-
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
38-
; MUBUF-NEXT: v_mov_b32_e32 v3, 1
35+
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
36+
; MUBUF-NEXT: s_lshl_b32 s7, s10, 2
37+
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
38+
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s6
39+
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
3940
; MUBUF-NEXT: s_add_i32 s6, s6, s7
40-
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
41-
; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
4241
; MUBUF-NEXT: v_mov_b32_e32 v2, s6
4342
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
4443
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
@@ -66,11 +65,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
6665
; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0
6766
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3
6867
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
69-
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
68+
; FLATSCR-NEXT: s_mov_b32 s2, s32
7069
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
7170
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
7271
; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2
73-
; FLATSCR-NEXT: s_mov_b32 s32, s2
72+
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
7473
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
7574
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
7675
; FLATSCR-NEXT: scratch_load_dword v2, off, s2
@@ -131,16 +130,14 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
131130
; MUBUF-NEXT: s_cmp_lg_u32 s4, 0
132131
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_2
133132
; MUBUF-NEXT: ; %bb.1: ; %bb.0
134-
; MUBUF-NEXT: s_add_i32 s4, s32, 0x1000
135-
; MUBUF-NEXT: s_and_b32 s4, s4, 0xfffff000
136-
; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
137-
; MUBUF-NEXT: s_mov_b32 s32, s4
133+
; MUBUF-NEXT: s_mov_b32 s4, s32
138134
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
139-
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
140-
; MUBUF-NEXT: v_mov_b32_e32 v3, 1
135+
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
136+
; MUBUF-NEXT: s_lshl_b32 s5, s5, 2
137+
; MUBUF-NEXT: s_add_i32 s32, s4, 0x1000
138+
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4
139+
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s4 offset:4
141140
; MUBUF-NEXT: s_add_i32 s4, s4, s5
142-
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
143-
; MUBUF-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
144141
; MUBUF-NEXT: v_mov_b32_e32 v2, s4
145142
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
146143
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
@@ -165,12 +162,11 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
165162
; FLATSCR-NEXT: s_cmp_lg_u32 s0, 0
166163
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2
167164
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
168-
; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1000
169165
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
170-
; FLATSCR-NEXT: s_and_b32 s0, s0, 0xfffff000
166+
; FLATSCR-NEXT: s_mov_b32 s0, s32
171167
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
172168
; FLATSCR-NEXT: s_lshl_b32 s1, s1, 2
173-
; FLATSCR-NEXT: s_mov_b32 s32, s0
169+
; FLATSCR-NEXT: s_add_i32 s32, s0, 0x1000
174170
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
175171
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
176172
; FLATSCR-NEXT: scratch_load_dword v2, off, s0
@@ -230,16 +226,15 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
230226
; MUBUF-NEXT: s_and_b64 exec, exec, vcc
231227
; MUBUF-NEXT: s_cbranch_execz .LBB2_3
232228
; MUBUF-NEXT: ; %bb.2: ; %bb.1
233-
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
229+
; MUBUF-NEXT: s_mov_b32 s6, s32
234230
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
235-
; MUBUF-NEXT: v_mov_b32_e32 v3, s6
236-
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
231+
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
237232
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
238-
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
233+
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
239234
; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6
240235
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
241236
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
242-
; MUBUF-NEXT: s_mov_b32 s32, s6
237+
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
243238
; MUBUF-NEXT: s_waitcnt vmcnt(0)
244239
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
245240
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -266,14 +261,14 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3
266261
; FLATSCR-NEXT: s_and_b64 exec, exec, vcc
267262
; FLATSCR-NEXT: s_cbranch_execz .LBB2_3
268263
; FLATSCR-NEXT: ; %bb.2: ; %bb.1
269-
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
264+
; FLATSCR-NEXT: s_mov_b32 s2, s32
270265
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
271266
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
272267
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
273268
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2
274269
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
275270
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
276-
; FLATSCR-NEXT: s_mov_b32 s32, s2
271+
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
277272
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
278273
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
279274
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off
@@ -324,17 +319,15 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
324319
; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc
325320
; MUBUF-NEXT: s_cbranch_execz .LBB3_2
326321
; MUBUF-NEXT: ; %bb.1: ; %bb.0
327-
; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000
328-
; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000
322+
; MUBUF-NEXT: s_mov_b32 s6, s32
329323
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
330-
; MUBUF-NEXT: v_mov_b32_e32 v4, s6
331-
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
324+
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6
332325
; MUBUF-NEXT: v_mov_b32_e32 v2, 1
333-
; MUBUF-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
326+
; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s6 offset:4
334327
; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6
335328
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
336329
; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31
337-
; MUBUF-NEXT: s_mov_b32 s32, s6
330+
; MUBUF-NEXT: s_add_i32 s32, s6, 0x1000
338331
; MUBUF-NEXT: s_waitcnt vmcnt(0)
339332
; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3
340333
; MUBUF-NEXT: global_store_dword v[0:1], v2, off
@@ -358,15 +351,14 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i
358351
; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
359352
; FLATSCR-NEXT: s_cbranch_execz .LBB3_2
360353
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
361-
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
362-
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
354+
; FLATSCR-NEXT: s_mov_b32 s2, s32
363355
; FLATSCR-NEXT: v_mov_b32_e32 v4, 0
364356
; FLATSCR-NEXT: v_mov_b32_e32 v5, 1
365357
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[4:5], s2
366358
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
367359
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
368360
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31
369-
; FLATSCR-NEXT: s_mov_b32 s32, s2
361+
; FLATSCR-NEXT: s_add_i32 s32, s2, 0x1000
370362
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
371363
; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3
372364
; FLATSCR-NEXT: global_store_dword v[0:1], v2, off

0 commit comments

Comments
 (0)