Skip to content

Commit 86815a1

Browse files
authored
AMDGPU/GlobalISel: Permit mapping G_FRAME_INDEX to sgprs (#101325)
eliminateFrameIndex should now properly handle materializing frame indices in SGPRs, so treat this like the other constant operand types. On average this will produce worse code; we need to detect VGPR uses, and improve SGPR->VGPR frame index folds.
1 parent 9d068f7 commit 86815a1

File tree

8 files changed

+266
-201
lines changed

8 files changed

+266
-201
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4060,20 +4060,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
40604060
case AMDGPU::G_FCONSTANT:
40614061
case AMDGPU::G_CONSTANT:
40624062
case AMDGPU::G_GLOBAL_VALUE:
4063+
case AMDGPU::G_FRAME_INDEX:
40634064
case AMDGPU::G_BLOCK_ADDR:
40644065
case AMDGPU::G_READSTEADYCOUNTER:
40654066
case AMDGPU::G_READCYCLECOUNTER: {
40664067
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
40674068
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
40684069
break;
40694070
}
4070-
case AMDGPU::G_FRAME_INDEX: {
4071-
// TODO: This should be the same as other constants, but eliminateFrameIndex
4072-
// currently assumes VALU uses.
4073-
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4074-
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4075-
break;
4076-
}
40774071
case AMDGPU::G_DYN_STACKALLOC: {
40784072
// Result is always uniform, and a wave reduction is needed for the source.
40794073
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,11 @@ define amdgpu_kernel void @stack_write_fi() {
1010
; CHECK-NEXT: s_add_u32 s0, s0, s15
1111
; CHECK-NEXT: s_addc_u32 s1, s1, 0
1212
; CHECK-NEXT: s_mov_b32 s5, 0
13+
; CHECK-NEXT: s_mov_b32 s6, 0
1314
; CHECK-NEXT: s_mov_b32 s4, 0
14-
; CHECK-NEXT: v_mov_b32_e32 v0, s5
15-
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
15+
; CHECK-NEXT: v_mov_b32_e32 v0, s6
16+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
17+
; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
1618
; CHECK-NEXT: s_waitcnt vmcnt(0)
1719
; CHECK-NEXT: v_mov_b32_e32 v0, s4
1820
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,11 +10,13 @@ define amdgpu_ps void @amdgpu_ps() {
1010
; MESA-LABEL: amdgpu_ps:
1111
; MESA: ; %bb.0:
1212
; MESA-NEXT: s_add_u32 flat_scratch_lo, s2, s4
13-
; MESA-NEXT: s_mov_b64 s[0:1], src_private_base
1413
; MESA-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
15-
; MESA-NEXT: v_mov_b32_e32 v0, 0
16-
; MESA-NEXT: v_mov_b32_e32 v1, s1
14+
; MESA-NEXT: s_mov_b32 s0, 0
15+
; MESA-NEXT: s_mov_b64 s[2:3], src_private_base
16+
; MESA-NEXT: s_mov_b32 s1, s3
17+
; MESA-NEXT: v_mov_b32_e32 v0, s0
1718
; MESA-NEXT: v_mov_b32_e32 v2, 0
19+
; MESA-NEXT: v_mov_b32_e32 v1, s1
1820
; MESA-NEXT: flat_store_dword v[0:1], v2
1921
; MESA-NEXT: s_waitcnt vmcnt(0)
2022
; MESA-NEXT: s_endpgm
@@ -24,13 +26,15 @@ define amdgpu_ps void @amdgpu_ps() {
2426
; PAL-NEXT: s_getpc_b64 s[2:3]
2527
; PAL-NEXT: s_mov_b32 s2, s0
2628
; PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
27-
; PAL-NEXT: v_mov_b32_e32 v0, 0
2829
; PAL-NEXT: v_mov_b32_e32 v2, 0
2930
; PAL-NEXT: s_waitcnt lgkmcnt(0)
3031
; PAL-NEXT: s_and_b32 s3, s3, 0xffff
3132
; PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
32-
; PAL-NEXT: s_mov_b64 s[0:1], src_private_base
3333
; PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
34+
; PAL-NEXT: s_mov_b32 s0, 0
35+
; PAL-NEXT: s_mov_b64 s[2:3], src_private_base
36+
; PAL-NEXT: s_mov_b32 s1, s3
37+
; PAL-NEXT: v_mov_b32_e32 v0, s0
3438
; PAL-NEXT: v_mov_b32_e32 v1, s1
3539
; PAL-NEXT: flat_store_dword v[0:1], v2
3640
; PAL-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 68 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -55,41 +55,40 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
5555
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
5656
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
5757
; GFX940-NEXT: s_waitcnt vmcnt(0)
58-
; GFX940-NEXT: v_mov_b32_e32 v0, s0
59-
; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
58+
; GFX940-NEXT: s_add_i32 s0, s0, 0
59+
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
6060
; GFX940-NEXT: s_waitcnt vmcnt(0)
6161
; GFX940-NEXT: s_endpgm
6262
;
6363
; GFX11-LABEL: store_load_sindex_kernel:
6464
; GFX11: ; %bb.0: ; %bb
6565
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
66+
; GFX11-NEXT: v_mov_b32_e32 v0, 15
6667
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
6768
; GFX11-NEXT: s_and_b32 s1, s0, 15
6869
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
6970
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
70-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
71-
; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
7271
; GFX11-NEXT: s_add_i32 s0, s0, 0
72+
; GFX11-NEXT: s_add_i32 s1, s1, 0
7373
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
7474
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
75-
; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
75+
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
7676
; GFX11-NEXT: s_waitcnt vmcnt(0)
7777
; GFX11-NEXT: s_endpgm
7878
;
7979
; GFX12-LABEL: store_load_sindex_kernel:
8080
; GFX12: ; %bb.0: ; %bb
8181
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
82-
; GFX12-NEXT: v_mov_b32_e32 v1, 15
82+
; GFX12-NEXT: v_mov_b32_e32 v0, 15
8383
; GFX12-NEXT: s_wait_kmcnt 0x0
84-
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
85-
; GFX12-NEXT: s_and_b32 s0, s0, 15
86-
; GFX12-NEXT: v_mov_b32_e32 v0, s1
84+
; GFX12-NEXT: s_and_b32 s1, s0, 15
8785
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
88-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
89-
; GFX12-NEXT: v_mov_b32_e32 v2, s0
90-
; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
86+
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87+
; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88+
; GFX12-NEXT: s_add_co_i32 s1, s1, 0
89+
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
9190
; GFX12-NEXT: s_wait_storecnt 0x0
92-
; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS
91+
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
9392
; GFX12-NEXT: s_wait_loadcnt 0x0
9493
; GFX12-NEXT: s_endpgm
9594
bb:
@@ -378,44 +377,44 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
378377
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
379378
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
380379
; GFX940-NEXT: s_waitcnt vmcnt(0)
381-
; GFX940-NEXT: v_mov_b32_e32 v0, s0
382-
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1
380+
; GFX940-NEXT: s_addk_i32 s0, 0x100
381+
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
383382
; GFX940-NEXT: s_waitcnt vmcnt(0)
384383
; GFX940-NEXT: s_endpgm
385384
;
386385
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
387386
; GFX11: ; %bb.0: ; %bb
388387
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
389-
; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc
390-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
388+
; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
389+
; GFX11-NEXT: s_waitcnt vmcnt(0)
390+
; GFX11-NEXT: v_mov_b32_e32 v0, 15
391+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
391392
; GFX11-NEXT: s_and_b32 s1, s0, 15
392393
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
393394
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
394-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
395-
; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
396395
; GFX11-NEXT: s_addk_i32 s0, 0x100
396+
; GFX11-NEXT: s_addk_i32 s1, 0x100
397397
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
398398
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
399-
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc
399+
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
400400
; GFX11-NEXT: s_waitcnt vmcnt(0)
401401
; GFX11-NEXT: s_endpgm
402402
;
403403
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
404404
; GFX12: ; %bb.0: ; %bb
405405
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
406-
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
406+
; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
407407
; GFX12-NEXT: s_wait_loadcnt 0x0
408-
; GFX12-NEXT: v_mov_b32_e32 v1, 15
408+
; GFX12-NEXT: v_mov_b32_e32 v0, 15
409409
; GFX12-NEXT: s_wait_kmcnt 0x0
410-
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
411-
; GFX12-NEXT: s_and_b32 s0, s0, 15
412-
; GFX12-NEXT: v_mov_b32_e32 v0, s1
410+
; GFX12-NEXT: s_and_b32 s1, s0, 15
413411
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
414-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
415-
; GFX12-NEXT: v_mov_b32_e32 v2, s0
416-
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
412+
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
413+
; GFX12-NEXT: s_addk_co_i32 s0, 0x100
414+
; GFX12-NEXT: s_addk_co_i32 s1, 0x100
415+
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
417416
; GFX12-NEXT: s_wait_storecnt 0x0
418-
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
417+
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
419418
; GFX12-NEXT: s_wait_loadcnt 0x0
420419
; GFX12-NEXT: s_endpgm
421420
bb:
@@ -692,46 +691,44 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
692691
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
693692
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
694693
; GFX940-NEXT: s_waitcnt vmcnt(0)
695-
; GFX940-NEXT: v_mov_b32_e32 v0, s0
696-
; GFX940-NEXT: s_movk_i32 s0, 0x4004
697-
; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
694+
; GFX940-NEXT: s_addk_i32 s0, 0x4004
695+
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
698696
; GFX940-NEXT: s_waitcnt vmcnt(0)
699697
; GFX940-NEXT: s_endpgm
700698
;
701699
; GFX11-LABEL: store_load_sindex_large_offset_kernel:
702700
; GFX11: ; %bb.0: ; %bb
703701
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
704-
; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
705-
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
702+
; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
703+
; GFX11-NEXT: s_waitcnt vmcnt(0)
704+
; GFX11-NEXT: v_mov_b32_e32 v0, 15
705+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
706706
; GFX11-NEXT: s_and_b32 s1, s0, 15
707707
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
708708
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
709-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
710-
; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
711709
; GFX11-NEXT: s_addk_i32 s0, 0x4004
710+
; GFX11-NEXT: s_addk_i32 s1, 0x4004
712711
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
713712
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
714-
; GFX11-NEXT: s_movk_i32 s0, 0x4004
715-
; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
713+
; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
716714
; GFX11-NEXT: s_waitcnt vmcnt(0)
717715
; GFX11-NEXT: s_endpgm
718716
;
719717
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
720718
; GFX12: ; %bb.0: ; %bb
721719
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
722-
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
720+
; GFX12-NEXT: scratch_load_b32 v0, off, off scope:SCOPE_SYS
723721
; GFX12-NEXT: s_wait_loadcnt 0x0
724-
; GFX12-NEXT: v_mov_b32_e32 v1, 15
722+
; GFX12-NEXT: v_mov_b32_e32 v0, 15
725723
; GFX12-NEXT: s_wait_kmcnt 0x0
726-
; GFX12-NEXT: s_lshl_b32 s1, s0, 2
727-
; GFX12-NEXT: s_and_b32 s0, s0, 15
728-
; GFX12-NEXT: v_mov_b32_e32 v0, s1
724+
; GFX12-NEXT: s_and_b32 s1, s0, 15
729725
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
730-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
731-
; GFX12-NEXT: v_mov_b32_e32 v2, s0
732-
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
726+
; GFX12-NEXT: s_lshl_b32 s1, s1, 2
727+
; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
728+
; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
729+
; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
733730
; GFX12-NEXT: s_wait_storecnt 0x0
734-
; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
731+
; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
735732
; GFX12-NEXT: s_wait_loadcnt 0x0
736733
; GFX12-NEXT: s_endpgm
737734
bb:
@@ -995,25 +992,28 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
995992
; GFX940-LABEL: store_load_large_imm_offset_kernel:
996993
; GFX940: ; %bb.0: ; %bb
997994
; GFX940-NEXT: v_mov_b32_e32 v0, 13
995+
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
998996
; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
999997
; GFX940-NEXT: s_waitcnt vmcnt(0)
1000-
; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
1001-
; GFX940-NEXT: v_mov_b32_e32 v1, 15
1002-
; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
998+
; GFX940-NEXT: v_mov_b32_e32 v0, 15
999+
; GFX940-NEXT: s_add_i32 s0, s0, 4
1000+
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10031001
; GFX940-NEXT: s_waitcnt vmcnt(0)
1004-
; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
1002+
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
10051003
; GFX940-NEXT: s_waitcnt vmcnt(0)
10061004
; GFX940-NEXT: s_endpgm
10071005
;
10081006
; GFX11-LABEL: store_load_large_imm_offset_kernel:
10091007
; GFX11: ; %bb.0: ; %bb
1010-
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
1011-
; GFX11-NEXT: v_mov_b32_e32 v2, 15
1008+
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1009+
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1010+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1011+
; GFX11-NEXT: s_add_i32 s0, s0, 4
10121012
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
10131013
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1014-
; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:4 dlc
1014+
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
10151015
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1016-
; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
1016+
; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
10171017
; GFX11-NEXT: s_waitcnt vmcnt(0)
10181018
; GFX11-NEXT: s_endpgm
10191019
;
@@ -1075,26 +1075,31 @@ define void @store_load_large_imm_offset_foo() {
10751075
; GFX940: ; %bb.0: ; %bb
10761076
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10771077
; GFX940-NEXT: v_mov_b32_e32 v0, 13
1078+
; GFX940-NEXT: s_movk_i32 s0, 0x3e80
1079+
; GFX940-NEXT: s_add_i32 s1, s32, 4
10781080
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
10791081
; GFX940-NEXT: s_waitcnt vmcnt(0)
1080-
; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
1081-
; GFX940-NEXT: v_mov_b32_e32 v1, 15
1082-
; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
1082+
; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083+
; GFX940-NEXT: s_add_i32 s0, s0, s1
1084+
; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
10831085
; GFX940-NEXT: s_waitcnt vmcnt(0)
1084-
; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
1086+
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
10851087
; GFX940-NEXT: s_waitcnt vmcnt(0)
10861088
; GFX940-NEXT: s_setpc_b64 s[30:31]
10871089
;
10881090
; GFX11-LABEL: store_load_large_imm_offset_foo:
10891091
; GFX11: ; %bb.0: ; %bb
10901092
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1091-
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
1092-
; GFX11-NEXT: v_mov_b32_e32 v2, 15
1093+
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1094+
; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1095+
; GFX11-NEXT: s_add_i32 s1, s32, 4
1096+
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1097+
; GFX11-NEXT: s_add_i32 s0, s0, s1
10931098
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
10941099
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1095-
; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:4 dlc
1100+
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
10961101
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1097-
; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc
1102+
; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
10981103
; GFX11-NEXT: s_waitcnt vmcnt(0)
10991104
; GFX11-NEXT: s_setpc_b64 s[30:31]
11001105
;

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -11,12 +11,11 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
1111
; GCN-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x10
1212
; GCN-NEXT: s_add_u32 s0, s0, s13
1313
; GCN-NEXT: s_addc_u32 s1, s1, 0
14-
; GCN-NEXT: v_mov_b32_e32 v16, 0
14+
; GCN-NEXT: v_mov_b32_e32 v64, 0
1515
; GCN-NEXT: s_waitcnt lgkmcnt(0)
1616
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
1717
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
1818
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
19-
; GCN-NEXT: v_mov_b32_e32 v64, 0
2019
; GCN-NEXT: s_waitcnt lgkmcnt(0)
2120
; GCN-NEXT: v_mov_b32_e32 v0, s36
2221
; GCN-NEXT: v_mov_b32_e32 v1, s37
@@ -143,16 +142,17 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
143142
; GCN-NEXT: v_mov_b32_e32 v0, s48
144143
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240
145144
; GCN-NEXT: v_mov_b32_e32 v0, s49
145+
; GCN-NEXT: s_and_b32 s4, s25, 63
146146
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244
147147
; GCN-NEXT: v_mov_b32_e32 v0, s50
148-
; GCN-NEXT: s_and_b32 s4, s25, 63
148+
; GCN-NEXT: s_lshl_b32 s4, s4, 2
149149
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248
150150
; GCN-NEXT: v_mov_b32_e32 v0, s51
151-
; GCN-NEXT: s_lshl_b32 s4, s4, 2
151+
; GCN-NEXT: s_add_u32 s4, 0, s4
152152
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252
153-
; GCN-NEXT: v_add_u32_e32 v0, s4, v16
154-
; GCN-NEXT: v_mov_b32_e32 v1, s24
155-
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
153+
; GCN-NEXT: v_mov_b32_e32 v0, s24
154+
; GCN-NEXT: v_mov_b32_e32 v1, s4
155+
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
156156
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
157157
; GCN-NEXT: s_nop 0
158158
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4

0 commit comments

Comments
 (0)