Skip to content

Commit 3ac243b

Browse files
authored
Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226 (#81394)
Resolve #78226
1 parent 83e5a12 commit 3ac243b

12 files changed

+2069
-2560
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -715,10 +715,13 @@ bool AMDGPUCallLowering::lowerFormalArguments(
715715
if (!IsEntryFunc && !IsGraphics) {
716716
// For the fixed ABI, pass workitem IDs in the last argument register.
717717
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
718+
}
718719

720+
if (!IsEntryFunc) {
719721
if (!Subtarget.enableFlatScratch())
720722
CCInfo.AllocateReg(Info->getScratchRSrcReg());
721-
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
723+
if (!IsGraphics)
724+
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
722725
}
723726

724727
IncomingValueAssigner Assigner(AssignFn);

llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ def CC_SI_Gfx : CallingConv<[
2323
// 33 is reserved for the frame pointer
2424
// 34 is reserved for the base pointer
2525
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
26+
SGPR0, SGPR1, SGPR2, SGPR3,
2627
SGPR4, SGPR5, SGPR6, SGPR7,
2728
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
2829
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2857,12 +2857,13 @@ SDValue SITargetLowering::LowerFormalArguments(
28572857
} else if (!IsGraphics) {
28582858
// For the fixed ABI, pass workitem IDs in the last argument register.
28592859
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2860+
}
28602861

2861-
// FIXME: Sink this into allocateSpecialInputSGPRs
2862+
if (!IsEntryFunc) {
28622863
if (!Subtarget->enableFlatScratch())
28632864
CCInfo.AllocateReg(Info->getScratchRSrcReg());
2864-
2865-
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2865+
if (!IsGraphics)
2866+
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
28662867
}
28672868

28682869
if (!IsKernel) {

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
5050
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
5151
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
5252
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
53-
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
53+
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
5454
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
5555
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
56-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
56+
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
5757
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
5858
; CHECK-NEXT: SI_RETURN
5959
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
9999
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
100100
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
101101
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
102-
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
103-
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
102+
; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
103+
; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
104104
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
105105
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
106-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
106+
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
107107
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
108108
; CHECK-NEXT: SI_RETURN
109109
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef

llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
942942
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
943943
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
944944
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
945-
; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
945+
; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
946946
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
947947
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
948-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
948+
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
949949
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
950950
; CHECK-NEXT: SI_RETURN
951951
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
39843984
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
39853985
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
39863986
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
3987-
; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
3988-
; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
3987+
; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
3988+
; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
39893989
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
39903990
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
3991-
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
3991+
; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
39923992
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
39933993
; CHECK-NEXT: SI_RETURN
39943994
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
33373337
; GFX11-LABEL: test_inreg_arg_store:
33383338
; GFX11: ; %bb.0:
33393339
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3340-
; GFX11-NEXT: v_mov_b32_e32 v2, s4
3340+
; GFX11-NEXT: v_mov_b32_e32 v2, s0
33413341
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
33423342
; GFX11-NEXT: s_setpc_b64 s[30:31]
33433343
store bfloat %in, ptr addrspace(1) %out

llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
472472
; GCN-LABEL: test34:
473473
; GCN: ; %bb.0:
474474
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475-
; GCN-NEXT: s_min_i32 s0, s4, s5
475+
; GCN-NEXT: s_min_i32 s0, s0, s1
476476
; GCN-NEXT: v_mov_b32_e32 v0, 0
477477
; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9
478478
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
492492
; GCN-LABEL: test35:
493493
; GCN: ; %bb.0:
494494
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495-
; GCN-NEXT: s_max_i32 s0, s4, s5
495+
; GCN-NEXT: s_max_i32 s0, s0, s1
496496
; GCN-NEXT: v_mov_b32_e32 v0, 0
497497
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8
498498
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
512512
; GCN-LABEL: test36:
513513
; GCN: ; %bb.0:
514514
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
515-
; GCN-NEXT: s_min_u32 s0, s4, s5
515+
; GCN-NEXT: s_min_u32 s0, s0, s1
516516
; GCN-NEXT: v_mov_b32_e32 v0, 0
517-
; GCN-NEXT: s_cmp_lt_u32 s0, s6
517+
; GCN-NEXT: s_cmp_lt_u32 s0, s2
518518
; GCN-NEXT: v_mov_b32_e32 v1, 0
519519
; GCN-NEXT: s_cselect_b32 s0, -1, 0
520520
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
532532
; GCN-LABEL: test37:
533533
; GCN: ; %bb.0:
534534
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
535-
; GCN-NEXT: s_max_i32 s0, s4, s5
535+
; GCN-NEXT: s_max_i32 s0, s0, s1
536536
; GCN-NEXT: v_mov_b32_e32 v0, 0
537-
; GCN-NEXT: s_cmp_ge_i32 s0, s6
537+
; GCN-NEXT: s_cmp_ge_i32 s0, s2
538538
; GCN-NEXT: v_mov_b32_e32 v1, 0
539539
; GCN-NEXT: s_cselect_b32 s0, -1, 0
540540
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
552552
; GCN-LABEL: test38:
553553
; GCN: ; %bb.0:
554554
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555-
; GCN-NEXT: s_max_u32 s0, s4, s5
555+
; GCN-NEXT: s_max_u32 s0, s0, s1
556556
; GCN-NEXT: v_mov_b32_e32 v0, 0
557557
; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9
558558
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
572572
; GCN-LABEL: test39:
573573
; GCN: ; %bb.0:
574574
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
575-
; GCN-NEXT: s_min_i32 s0, s4, s5
575+
; GCN-NEXT: s_min_i32 s0, s0, s1
576576
; GCN-NEXT: v_mov_b32_e32 v0, 0
577577
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7
578578
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
592592
; GCN-LABEL: test40:
593593
; GCN: ; %bb.0:
594594
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595-
; GCN-NEXT: s_max_i32 s0, s4, s5
595+
; GCN-NEXT: s_max_i32 s0, s0, s1
596596
; GCN-NEXT: v_mov_b32_e32 v0, 0
597-
; GCN-NEXT: s_cmp_le_i32 s0, s6
597+
; GCN-NEXT: s_cmp_le_i32 s0, s2
598598
; GCN-NEXT: v_mov_b32_e32 v1, 0
599599
; GCN-NEXT: s_cselect_b32 s0, -1, 0
600600
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
612612
; GCN-LABEL: test41:
613613
; GCN: ; %bb.0:
614614
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615-
; GCN-NEXT: s_min_u32 s0, s4, s5
615+
; GCN-NEXT: s_min_u32 s0, s0, s1
616616
; GCN-NEXT: v_mov_b32_e32 v0, 0
617-
; GCN-NEXT: s_cmp_ge_u32 s0, s6
617+
; GCN-NEXT: s_cmp_ge_u32 s0, s2
618618
; GCN-NEXT: v_mov_b32_e32 v1, 0
619619
; GCN-NEXT: s_cselect_b32 s0, -1, 0
620620
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0

llvm/test/CodeGen/AMDGPU/function-args-inreg.ll

Lines changed: 133 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2176,6 +2176,93 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr)
21762176
declare void @extern()
21772177

21782178
define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
2179+
; GFX9-LABEL: void_func_a13i32_inreg:
2180+
; GFX9: ; %bb.0:
2181+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2182+
; GFX9-NEXT: s_mov_b32 s27, s33
2183+
; GFX9-NEXT: s_mov_b32 s33, s32
2184+
; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1
2185+
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
2186+
; GFX9-NEXT: s_mov_b64 exec, s[28:29]
2187+
; GFX9-NEXT: v_mov_b32_e32 v2, s26
2188+
; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48
2189+
; GFX9-NEXT: v_mov_b32_e32 v5, s25
2190+
; GFX9-NEXT: v_mov_b32_e32 v4, s24
2191+
; GFX9-NEXT: v_mov_b32_e32 v3, s23
2192+
; GFX9-NEXT: v_mov_b32_e32 v2, s22
2193+
; GFX9-NEXT: s_addk_i32 s32, 0x400
2194+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
2195+
; GFX9-NEXT: v_writelane_b32 v40, s27, 2
2196+
; GFX9-NEXT: v_mov_b32_e32 v5, s21
2197+
; GFX9-NEXT: v_mov_b32_e32 v4, s20
2198+
; GFX9-NEXT: v_mov_b32_e32 v3, s19
2199+
; GFX9-NEXT: v_mov_b32_e32 v2, s18
2200+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
2201+
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
2202+
; GFX9-NEXT: v_mov_b32_e32 v5, s17
2203+
; GFX9-NEXT: v_mov_b32_e32 v4, s16
2204+
; GFX9-NEXT: s_getpc_b64 s[16:17]
2205+
; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4
2206+
; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12
2207+
; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
2208+
; GFX9-NEXT: v_mov_b32_e32 v3, s7
2209+
; GFX9-NEXT: v_mov_b32_e32 v2, s6
2210+
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
2211+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
2212+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2213+
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
2214+
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
2215+
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
2216+
; GFX9-NEXT: v_readlane_b32 s4, v40, 2
2217+
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
2218+
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
2219+
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
2220+
; GFX9-NEXT: s_addk_i32 s32, 0xfc00
2221+
; GFX9-NEXT: s_mov_b32 s33, s4
2222+
; GFX9-NEXT: s_waitcnt vmcnt(0)
2223+
; GFX9-NEXT: s_setpc_b64 s[30:31]
2224+
;
2225+
; GFX11-LABEL: void_func_a13i32_inreg:
2226+
; GFX11: ; %bb.0:
2227+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2228+
; GFX11-NEXT: s_mov_b32 s23, s33
2229+
; GFX11-NEXT: s_mov_b32 s33, s32
2230+
; GFX11-NEXT: s_or_saveexec_b32 s24, -1
2231+
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
2232+
; GFX11-NEXT: s_mov_b32 exec_lo, s24
2233+
; GFX11-NEXT: s_add_i32 s32, s32, 16
2234+
; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19
2235+
; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17
2236+
; GFX11-NEXT: s_getpc_b64 s[18:19]
2237+
; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4
2238+
; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12
2239+
; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7
2240+
; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0
2241+
; GFX11-NEXT: v_writelane_b32 v40, s23, 2
2242+
; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21
2243+
; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3
2244+
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
2245+
; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
2246+
; GFX11-NEXT: v_mov_b32_e32 v10, s0
2247+
; GFX11-NEXT: s_clause 0x3
2248+
; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48
2249+
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32
2250+
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
2251+
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
2252+
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
2253+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2254+
; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
2255+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2256+
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
2257+
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
2258+
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
2259+
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
2260+
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
2261+
; GFX11-NEXT: s_mov_b32 exec_lo, s1
2262+
; GFX11-NEXT: s_add_i32 s32, s32, -16
2263+
; GFX11-NEXT: s_mov_b32 s33, s0
2264+
; GFX11-NEXT: s_waitcnt vmcnt(0)
2265+
; GFX11-NEXT: s_setpc_b64 s[30:31]
21792266
store [13 x i32] %arg0, ptr addrspace(1) %ptr
21802267
call void @extern()
21812268
ret void
@@ -2203,6 +2290,52 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
22032290

22042291
; FIXME: Should still fail
22052292
define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
2293+
; GFX9-LABEL: void_func_a16i32_inreg__noimplicit:
2294+
; GFX9: ; %bb.0:
2295+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2296+
; GFX9-NEXT: v_mov_b32_e32 v5, s19
2297+
; GFX9-NEXT: v_mov_b32_e32 v4, s18
2298+
; GFX9-NEXT: v_mov_b32_e32 v3, s17
2299+
; GFX9-NEXT: v_mov_b32_e32 v2, s16
2300+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48
2301+
; GFX9-NEXT: s_nop 0
2302+
; GFX9-NEXT: v_mov_b32_e32 v5, s15
2303+
; GFX9-NEXT: v_mov_b32_e32 v4, s14
2304+
; GFX9-NEXT: v_mov_b32_e32 v3, s13
2305+
; GFX9-NEXT: v_mov_b32_e32 v2, s12
2306+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
2307+
; GFX9-NEXT: s_nop 0
2308+
; GFX9-NEXT: v_mov_b32_e32 v5, s11
2309+
; GFX9-NEXT: v_mov_b32_e32 v4, s10
2310+
; GFX9-NEXT: v_mov_b32_e32 v3, s9
2311+
; GFX9-NEXT: v_mov_b32_e32 v2, s8
2312+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
2313+
; GFX9-NEXT: s_nop 0
2314+
; GFX9-NEXT: v_mov_b32_e32 v5, s7
2315+
; GFX9-NEXT: v_mov_b32_e32 v4, s6
2316+
; GFX9-NEXT: v_mov_b32_e32 v3, s5
2317+
; GFX9-NEXT: v_mov_b32_e32 v2, s4
2318+
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
2319+
; GFX9-NEXT: s_waitcnt vmcnt(0)
2320+
; GFX9-NEXT: s_setpc_b64 s[30:31]
2321+
;
2322+
; GFX11-LABEL: void_func_a16i32_inreg__noimplicit:
2323+
; GFX11: ; %bb.0:
2324+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2325+
; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14
2326+
; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
2327+
; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10
2328+
; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8
2329+
; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
2330+
; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
2331+
; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2
2332+
; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0
2333+
; GFX11-NEXT: s_clause 0x3
2334+
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48
2335+
; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32
2336+
; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16
2337+
; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off
2338+
; GFX11-NEXT: s_setpc_b64 s[30:31]
22062339
store [16 x i32] %arg0, ptr addrspace(1) %ptr
22072340
ret void
22082341
}

0 commit comments

Comments
 (0)