Skip to content

Commit 010f5ed

Browse files
committed
[AMDGPU][SDAG] Initial support for ISD::PTRADD
Enable generation of PTRADD SelectionDAG nodes for pointer arithmetic for SI, for now behind an internal CLI option. Also add basic patterns to match these nodes. Optimizations will come in follow-up PRs. Basic tests for SDAG codegen with PTRADD are in test/CodeGen/AMDGPU/ptradd-sdag.ll.

Since GlobalISel also uses the PTRADD SDAG patterns via SelectionDAGCompat, this change affects GlobalISel tests:

- Uniform 32-bit address arithmetic is now lowered to s_add_i32 instead of s_add_u32, which is consistent with what SDAG does (and gives SIShrinkInstructions the chance to generate s_addk_i32).
- 64-bit address arithmetic uses the [sv]_add_u64 pseudos, which is consistent with SDAG and means that GISel now generates 64-bit adds for gfx12. The only drawback is that we could save 1-2 instructions if we didn't use 64-bit adds with >32-bit immediates (two movs with 32-bit immediates, s_delay_alu, and a 64-bit add vs. two 32-bit adds with immediates), but that's a separate problem.
- The register class for the dead carry-out/sign-bit operand of V_ADD_CO_U32_e64 on architectures without carry-less additions is now sreg_64 instead of sreg_64_xexec. I'm not sure if that loses us something worth preserving; I haven't found an obvious way to avoid this.

Overall, the changes in the GlobalISel tests seem to be improvements.
1 parent 4236423 commit 010f5ed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+2988
-4536
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
6161
cl::desc("Use indirect register addressing for divergent indexes"),
6262
cl::init(false));
6363

64+
// TODO This option should be removed once we switch to always using PTRADD in
65+
// the SelectionDAG.
66+
static cl::opt<bool> UseSelectionDAGPTRADD(
67+
"amdgpu-use-sdag-ptradd", cl::Hidden,
68+
cl::desc("Generate ISD::PTRADD nodes in the SelectionDAG ISel"),
69+
cl::init(false));
70+
6471
static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
6572
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6673
return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
@@ -10457,6 +10464,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1045710464
}
1045810465
}
1045910466

10467+
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10468+
EVT PtrVT) const {
10469+
return UseSelectionDAGPTRADD;
10470+
}
10471+
1046010472
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
1046110473
// offset (the offset that is included in bounds checking and swizzling, to be
1046210474
// split between the instruction's voffset and immoffset fields) and soffset

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
260260

261261
bool shouldExpandVectorDynExt(SDNode *N) const;
262262

263+
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
264+
263265
private:
264266
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
265267
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,6 +1376,37 @@ def : GCNPat <
13761376
(i32 (V_MOV_B32_e32 (i32 0))), sub1)
13771377
>;
13781378

1379+
//===----------------------------------------------------------------------===//
1380+
// PTRADD Patterns
1381+
//===----------------------------------------------------------------------===//
1382+
1383+
def : GCNPat<
1384+
(DivergentBinFrag<ptradd> i64:$src0, i64:$src1),
1385+
(V_ADD_U64_PSEUDO $src0, $src1)>;
1386+
1387+
def : GCNPat<
1388+
(DivergentBinFrag<ptradd> i32:$src0, i32:$src1),
1389+
(V_ADD_U32_e64 $src0, $src1, 0)> {
1390+
let SubtargetPredicate = HasAddNoCarryInsts;
1391+
}
1392+
1393+
def : GCNPat<
1394+
(DivergentBinFrag<ptradd> i32:$src0, i32:$src1),
1395+
(V_ADD_CO_U32_e64 $src0, $src1)> {
1396+
let SubtargetPredicate = NotHasAddNoCarryInsts;
1397+
}
1398+
1399+
def : GCNPat<
1400+
(UniformBinFrag<ptradd> i64:$src0, i64:$src1),
1401+
(S_ADD_U64_PSEUDO $src0, $src1)>;
1402+
1403+
// Whether we select S_ADD_I32 or S_ADD_U32 does not make much of a
1404+
// difference. Most notably, S_ADD_I32 instructions can be transformed
1405+
// to S_ADDK_I32, so we select that.
1406+
def : GCNPat<
1407+
(UniformBinFrag<ptradd> i32:$src0, i32:$src1),
1408+
(S_ADD_I32 $src0, $src1)>;
1409+
13791410
/********** ============================================ **********/
13801411
/********** Extraction, Insertion, Building and Casting **********/
13811412
/********** ============================================ **********/

llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,15 @@ define amdgpu_kernel void @kernel_caller_stack() {
3636
; FLATSCR-NEXT: s_mov_b32 s32, 0
3737
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
3838
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
39-
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
39+
; FLATSCR-NEXT: s_add_i32 s0, s32, 4
4040
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
4141
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
42-
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
42+
; FLATSCR-NEXT: s_add_i32 s0, s32, 8
4343
; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
4444
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
45-
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
45+
; FLATSCR-NEXT: s_add_i32 s0, s32, 12
4646
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
47-
; FLATSCR-NEXT: s_add_u32 s2, s32, 16
47+
; FLATSCR-NEXT: s_add_i32 s2, s32, 16
4848
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
4949
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
5050
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
@@ -189,13 +189,13 @@ define amdgpu_kernel void @kernel_caller_byval() {
189189
; FLATSCR-NEXT: s_getpc_b64 s[0:1]
190190
; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
191191
; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
192-
; FLATSCR-NEXT: s_add_u32 s2, s32, 8
193-
; FLATSCR-NEXT: s_add_u32 s3, s32, 16
194-
; FLATSCR-NEXT: s_add_u32 s4, s32, 24
195-
; FLATSCR-NEXT: s_add_u32 s5, s32, 32
196-
; FLATSCR-NEXT: s_add_u32 s6, s32, 40
197-
; FLATSCR-NEXT: s_add_u32 s7, s32, 48
198-
; FLATSCR-NEXT: s_add_u32 s8, s32, 56
192+
; FLATSCR-NEXT: s_add_i32 s2, s32, 8
193+
; FLATSCR-NEXT: s_add_i32 s3, s32, 16
194+
; FLATSCR-NEXT: s_add_i32 s4, s32, 24
195+
; FLATSCR-NEXT: s_add_i32 s5, s32, 32
196+
; FLATSCR-NEXT: s_add_i32 s6, s32, 40
197+
; FLATSCR-NEXT: s_add_i32 s7, s32, 48
198+
; FLATSCR-NEXT: s_add_i32 s8, s32, 56
199199
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
200200
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32
201201
; FLATSCR-NEXT: s_waitcnt vmcnt(7)
@@ -266,16 +266,16 @@ define void @func_caller_stack() {
266266
; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
267267
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
268268
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
269-
; FLATSCR-NEXT: s_add_u32 s0, s32, 4
269+
; FLATSCR-NEXT: s_add_i32 s0, s32, 4
270270
; FLATSCR-NEXT: v_mov_b32_e32 v0, 9
271271
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
272-
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
272+
; FLATSCR-NEXT: s_add_i32 s0, s32, 8
273273
; FLATSCR-NEXT: v_mov_b32_e32 v0, 10
274274
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
275-
; FLATSCR-NEXT: s_add_u32 s0, s32, 12
275+
; FLATSCR-NEXT: s_add_i32 s0, s32, 12
276276
; FLATSCR-NEXT: v_mov_b32_e32 v0, 11
277277
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
278-
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
278+
; FLATSCR-NEXT: s_add_i32 s0, s32, 16
279279
; FLATSCR-NEXT: v_mov_b32_e32 v0, 12
280280
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
281281
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
@@ -393,8 +393,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
393393
; FLATSCR-NEXT: s_add_i32 s32, s32, 16
394394
; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0
395395
; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2
396-
; FLATSCR-NEXT: s_add_u32 s0, s32, 8
397-
; FLATSCR-NEXT: s_add_u32 s2, s32, 56
396+
; FLATSCR-NEXT: s_add_i32 s0, s32, 8
397+
; FLATSCR-NEXT: s_add_i32 s2, s32, 56
398398
; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
399399
; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
400400
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
@@ -404,28 +404,28 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
404404
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
405405
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
406406
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
407-
; FLATSCR-NEXT: s_add_u32 s0, s32, 16
407+
; FLATSCR-NEXT: s_add_i32 s0, s32, 16
408408
; FLATSCR-NEXT: v_add_u32_e32 v3, 24, v0
409409
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
410410
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
411411
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
412-
; FLATSCR-NEXT: s_add_u32 s0, s32, 24
412+
; FLATSCR-NEXT: s_add_i32 s0, s32, 24
413413
; FLATSCR-NEXT: v_add_u32_e32 v3, 32, v0
414414
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
415415
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
416416
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
417-
; FLATSCR-NEXT: s_add_u32 s0, s32, 32
417+
; FLATSCR-NEXT: s_add_i32 s0, s32, 32
418418
; FLATSCR-NEXT: v_add_u32_e32 v3, 40, v0
419419
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
420420
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
421421
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
422-
; FLATSCR-NEXT: s_add_u32 s0, s32, 40
422+
; FLATSCR-NEXT: s_add_i32 s0, s32, 40
423423
; FLATSCR-NEXT: v_add_u32_e32 v3, 48, v0
424424
; FLATSCR-NEXT: v_add_u32_e32 v0, 56, v0
425425
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
426426
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
427427
; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v3, off
428-
; FLATSCR-NEXT: s_add_u32 s0, s32, 48
428+
; FLATSCR-NEXT: s_add_i32 s0, s32, 48
429429
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
430430
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s0
431431
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], v0, off

llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
2020
; GFX9-NEXT: v_mov_b32_e32 v1, s4
2121
; GFX9-NEXT: s_lshl_b32 s5, s5, 6
2222
; GFX9-NEXT: s_mov_b32 s33, 0
23-
; GFX9-NEXT: s_add_u32 s32, s4, s5
23+
; GFX9-NEXT: s_add_i32 s32, s4, s5
2424
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
2525
; GFX9-NEXT: s_endpgm
2626
;
@@ -39,7 +39,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
3939
; GFX10-NEXT: s_lshl2_add_u32 s5, s5, 15
4040
; GFX10-NEXT: s_and_b32 s5, s5, -16
4141
; GFX10-NEXT: s_lshl_b32 s5, s5, 5
42-
; GFX10-NEXT: s_add_u32 s32, s4, s5
42+
; GFX10-NEXT: s_add_i32 s32, s4, s5
4343
; GFX10-NEXT: s_endpgm
4444
;
4545
; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4:
@@ -56,7 +56,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) {
5656
; GFX11-NEXT: s_and_b32 s1, s1, -16
5757
; GFX11-NEXT: s_lshl_b32 s1, s1, 5
5858
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
59-
; GFX11-NEXT: s_add_u32 s32, s0, s1
59+
; GFX11-NEXT: s_add_i32 s32, s0, s1
6060
; GFX11-NEXT: s_endpgm
6161
%alloca = alloca i32, i32 %n, align 4, addrspace(5)
6262
store i32 0, ptr addrspace(5) %alloca
@@ -84,7 +84,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
8484
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
8585
; GFX9-NEXT: s_and_b32 s4, s4, -16
8686
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
87-
; GFX9-NEXT: s_add_u32 s32, s6, s4
87+
; GFX9-NEXT: s_add_i32 s32, s6, s4
8888
; GFX9-NEXT: s_mov_b32 s32, s33
8989
; GFX9-NEXT: s_mov_b32 s33, s7
9090
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -110,7 +110,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
110110
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
111111
; GFX10-NEXT: s_and_b32 s4, s4, -16
112112
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
113-
; GFX10-NEXT: s_add_u32 s32, s6, s4
113+
; GFX10-NEXT: s_add_i32 s32, s6, s4
114114
; GFX10-NEXT: s_mov_b32 s32, s33
115115
; GFX10-NEXT: s_mov_b32 s33, s7
116116
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -136,7 +136,7 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
136136
; GFX11-NEXT: s_and_b32 s0, s0, -16
137137
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
138138
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
139-
; GFX11-NEXT: s_add_u32 s32, s2, s0
139+
; GFX11-NEXT: s_add_i32 s32, s2, s0
140140
; GFX11-NEXT: s_mov_b32 s32, s33
141141
; GFX11-NEXT: s_mov_b32 s33, s3
142142
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -161,7 +161,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
161161
; GFX9-NEXT: v_mov_b32_e32 v1, s4
162162
; GFX9-NEXT: s_lshl_b32 s5, s5, 6
163163
; GFX9-NEXT: s_mov_b32 s33, 0
164-
; GFX9-NEXT: s_add_u32 s32, s4, s5
164+
; GFX9-NEXT: s_add_i32 s32, s4, s5
165165
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
166166
; GFX9-NEXT: s_endpgm
167167
;
@@ -180,7 +180,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
180180
; GFX10-NEXT: s_lshl2_add_u32 s5, s5, 15
181181
; GFX10-NEXT: s_and_b32 s5, s5, -16
182182
; GFX10-NEXT: s_lshl_b32 s5, s5, 5
183-
; GFX10-NEXT: s_add_u32 s32, s4, s5
183+
; GFX10-NEXT: s_add_i32 s32, s4, s5
184184
; GFX10-NEXT: s_endpgm
185185
;
186186
; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16:
@@ -197,7 +197,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) {
197197
; GFX11-NEXT: s_and_b32 s1, s1, -16
198198
; GFX11-NEXT: s_lshl_b32 s1, s1, 5
199199
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
200-
; GFX11-NEXT: s_add_u32 s32, s0, s1
200+
; GFX11-NEXT: s_add_i32 s32, s0, s1
201201
; GFX11-NEXT: s_endpgm
202202
%alloca = alloca i32, i32 %n, align 16, addrspace(5)
203203
store i32 0, ptr addrspace(5) %alloca
@@ -225,7 +225,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
225225
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
226226
; GFX9-NEXT: s_and_b32 s4, s4, -16
227227
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
228-
; GFX9-NEXT: s_add_u32 s32, s6, s4
228+
; GFX9-NEXT: s_add_i32 s32, s6, s4
229229
; GFX9-NEXT: s_mov_b32 s32, s33
230230
; GFX9-NEXT: s_mov_b32 s33, s7
231231
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -251,7 +251,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
251251
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
252252
; GFX10-NEXT: s_and_b32 s4, s4, -16
253253
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
254-
; GFX10-NEXT: s_add_u32 s32, s6, s4
254+
; GFX10-NEXT: s_add_i32 s32, s6, s4
255255
; GFX10-NEXT: s_mov_b32 s32, s33
256256
; GFX10-NEXT: s_mov_b32 s33, s7
257257
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -277,7 +277,7 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
277277
; GFX11-NEXT: s_and_b32 s0, s0, -16
278278
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
279279
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280-
; GFX11-NEXT: s_add_u32 s32, s2, s0
280+
; GFX11-NEXT: s_add_i32 s32, s2, s0
281281
; GFX11-NEXT: s_mov_b32 s32, s33
282282
; GFX11-NEXT: s_mov_b32 s33, s3
283283
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -294,7 +294,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
294294
; GFX9-NEXT: s_movk_i32 s32, 0x800
295295
; GFX9-NEXT: s_add_u32 s0, s0, s17
296296
; GFX9-NEXT: s_addc_u32 s1, s1, 0
297-
; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff
297+
; GFX9-NEXT: s_add_i32 s5, s32, 0x7ff
298298
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
299299
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
300300
; GFX9-NEXT: s_and_b32 s5, s5, 0xfffff800
@@ -303,7 +303,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
303303
; GFX9-NEXT: v_mov_b32_e32 v1, s5
304304
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
305305
; GFX9-NEXT: s_mov_b32 s33, 0
306-
; GFX9-NEXT: s_add_u32 s32, s5, s4
306+
; GFX9-NEXT: s_add_i32 s32, s5, s4
307307
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
308308
; GFX9-NEXT: s_endpgm
309309
;
@@ -313,7 +313,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
313313
; GFX10-NEXT: s_movk_i32 s32, 0x400
314314
; GFX10-NEXT: s_add_u32 s0, s0, s17
315315
; GFX10-NEXT: s_addc_u32 s1, s1, 0
316-
; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff
316+
; GFX10-NEXT: s_add_i32 s5, s32, 0x3ff
317317
; GFX10-NEXT: v_mov_b32_e32 v0, 0
318318
; GFX10-NEXT: s_and_b32 s5, s5, 0xfffffc00
319319
; GFX10-NEXT: s_mov_b32 s33, 0
@@ -323,15 +323,15 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
323323
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
324324
; GFX10-NEXT: s_and_b32 s4, s4, -16
325325
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
326-
; GFX10-NEXT: s_add_u32 s32, s5, s4
326+
; GFX10-NEXT: s_add_i32 s32, s5, s4
327327
; GFX10-NEXT: s_endpgm
328328
;
329329
; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32:
330330
; GFX11: ; %bb.0:
331331
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
332332
; GFX11-NEXT: s_mov_b32 s32, 32
333333
; GFX11-NEXT: v_mov_b32_e32 v0, 0
334-
; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff
334+
; GFX11-NEXT: s_add_i32 s1, s32, 0x3ff
335335
; GFX11-NEXT: s_mov_b32 s33, 0
336336
; GFX11-NEXT: s_and_b32 s1, s1, 0xfffffc00
337337
; GFX11-NEXT: scratch_store_b32 off, v0, s1
@@ -341,7 +341,7 @@ define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) {
341341
; GFX11-NEXT: s_and_b32 s0, s0, -16
342342
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
343343
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
344-
; GFX11-NEXT: s_add_u32 s32, s1, s0
344+
; GFX11-NEXT: s_add_i32 s32, s1, s0
345345
; GFX11-NEXT: s_endpgm
346346
%alloca = alloca i32, i32 %n, align 32, addrspace(5)
347347
store i32 0, ptr addrspace(5) %alloca
@@ -366,15 +366,15 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
366366
; GFX9-NEXT: s_mov_b32 s33, s6
367367
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
368368
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
369-
; GFX9-NEXT: s_add_u32 s5, s32, 0x7ff
369+
; GFX9-NEXT: s_add_i32 s5, s32, 0x7ff
370370
; GFX9-NEXT: s_and_b32 s5, s5, 0xfffff800
371371
; GFX9-NEXT: v_mov_b32_e32 v1, s5
372372
; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
373373
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374374
; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15
375375
; GFX9-NEXT: s_and_b32 s4, s4, -16
376376
; GFX9-NEXT: s_lshl_b32 s4, s4, 6
377-
; GFX9-NEXT: s_add_u32 s32, s5, s4
377+
; GFX9-NEXT: s_add_i32 s32, s5, s4
378378
; GFX9-NEXT: s_mov_b32 s32, s34
379379
; GFX9-NEXT: s_mov_b32 s34, s7
380380
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -397,15 +397,15 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
397397
; GFX10-NEXT: s_mov_b32 s33, s6
398398
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
399399
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
400-
; GFX10-NEXT: s_add_u32 s5, s32, 0x3ff
400+
; GFX10-NEXT: s_add_i32 s5, s32, 0x3ff
401401
; GFX10-NEXT: s_and_b32 s5, s5, 0xfffffc00
402402
; GFX10-NEXT: v_mov_b32_e32 v1, s5
403403
; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
404404
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
405405
; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15
406406
; GFX10-NEXT: s_and_b32 s4, s4, -16
407407
; GFX10-NEXT: s_lshl_b32 s4, s4, 5
408-
; GFX10-NEXT: s_add_u32 s32, s5, s4
408+
; GFX10-NEXT: s_add_i32 s32, s5, s4
409409
; GFX10-NEXT: s_mov_b32 s32, s34
410410
; GFX10-NEXT: s_mov_b32 s34, s7
411411
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -427,7 +427,7 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
427427
; GFX11-NEXT: s_mov_b32 s33, s2
428428
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
429429
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
430-
; GFX11-NEXT: s_add_u32 s1, s32, 0x3ff
430+
; GFX11-NEXT: s_add_i32 s1, s32, 0x3ff
431431
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
432432
; GFX11-NEXT: s_and_b32 s1, s1, 0xfffffc00
433433
; GFX11-NEXT: scratch_store_b32 off, v0, s1
@@ -436,7 +436,7 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
436436
; GFX11-NEXT: s_and_b32 s0, s0, -16
437437
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
438438
; GFX11-NEXT: s_lshl_b32 s0, s0, 5
439-
; GFX11-NEXT: s_add_u32 s32, s1, s0
439+
; GFX11-NEXT: s_add_i32 s32, s1, s0
440440
; GFX11-NEXT: s_mov_b32 s32, s34
441441
; GFX11-NEXT: s_mov_b32 s34, s3
442442
; GFX11-NEXT: s_setpc_b64 s[30:31]

0 commit comments

Comments
 (0)