@@ -55,41 +55,40 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
55
55
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
56
56
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
57
57
; GFX940-NEXT: s_waitcnt vmcnt(0)
58
- ; GFX940-NEXT: v_mov_b32_e32 v0 , s0
59
- ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
58
+ ; GFX940-NEXT: s_add_i32 s0 , s0, 0
59
+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
60
60
; GFX940-NEXT: s_waitcnt vmcnt(0)
61
61
; GFX940-NEXT: s_endpgm
62
62
;
63
63
; GFX11-LABEL: store_load_sindex_kernel:
64
64
; GFX11: ; %bb.0: ; %bb
65
65
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
66
+ ; GFX11-NEXT: v_mov_b32_e32 v0, 15
66
67
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
67
68
; GFX11-NEXT: s_and_b32 s1, s0, 15
68
69
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
69
70
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
70
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
71
- ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
72
71
; GFX11-NEXT: s_add_i32 s0, s0, 0
72
+ ; GFX11-NEXT: s_add_i32 s1, s1, 0
73
73
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
74
74
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
75
- ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
75
+ ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
76
76
; GFX11-NEXT: s_waitcnt vmcnt(0)
77
77
; GFX11-NEXT: s_endpgm
78
78
;
79
79
; GFX12-LABEL: store_load_sindex_kernel:
80
80
; GFX12: ; %bb.0: ; %bb
81
81
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
82
- ; GFX12-NEXT: v_mov_b32_e32 v1 , 15
82
+ ; GFX12-NEXT: v_mov_b32_e32 v0 , 15
83
83
; GFX12-NEXT: s_wait_kmcnt 0x0
84
- ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
85
- ; GFX12-NEXT: s_and_b32 s0, s0, 15
86
- ; GFX12-NEXT: v_mov_b32_e32 v0, s1
84
+ ; GFX12-NEXT: s_and_b32 s1, s0, 15
87
85
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
88
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
89
- ; GFX12-NEXT: v_mov_b32_e32 v2, s0
90
- ; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS
86
+ ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
87
+ ; GFX12-NEXT: s_add_co_i32 s0, s0, 0
88
+ ; GFX12-NEXT: s_add_co_i32 s1, s1, 0
89
+ ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
91
90
; GFX12-NEXT: s_wait_storecnt 0x0
92
- ; GFX12-NEXT: scratch_load_b32 v0, v2, off scope:SCOPE_SYS
91
+ ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
93
92
; GFX12-NEXT: s_wait_loadcnt 0x0
94
93
; GFX12-NEXT: s_endpgm
95
94
bb:
@@ -378,44 +377,44 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
378
377
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
379
378
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
380
379
; GFX940-NEXT: s_waitcnt vmcnt(0)
381
- ; GFX940-NEXT: v_mov_b32_e32 v0, s0
382
- ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:256 sc0 sc1
380
+ ; GFX940-NEXT: s_addk_i32 s0, 0x100
381
+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
383
382
; GFX940-NEXT: s_waitcnt vmcnt(0)
384
383
; GFX940-NEXT: s_endpgm
385
384
;
386
385
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
387
386
; GFX11: ; %bb.0: ; %bb
388
387
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
389
- ; GFX11-NEXT: scratch_load_b32 v2, off, off glc dlc
390
- ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
388
+ ; GFX11-NEXT: scratch_load_b32 v0, off, off glc dlc
389
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
390
+ ; GFX11-NEXT: v_mov_b32_e32 v0, 15
391
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
391
392
; GFX11-NEXT: s_and_b32 s1, s0, 15
392
393
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
393
394
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
394
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
395
- ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
396
395
; GFX11-NEXT: s_addk_i32 s0, 0x100
396
+ ; GFX11-NEXT: s_addk_i32 s1, 0x100
397
397
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
398
398
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
399
- ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:256 glc dlc
399
+ ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
400
400
; GFX11-NEXT: s_waitcnt vmcnt(0)
401
401
; GFX11-NEXT: s_endpgm
402
402
;
403
403
; GFX12-LABEL: store_load_sindex_small_offset_kernel:
404
404
; GFX12: ; %bb.0: ; %bb
405
405
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
406
- ; GFX12-NEXT: scratch_load_b32 v3 , off, off scope:SCOPE_SYS
406
+ ; GFX12-NEXT: scratch_load_b32 v0 , off, off scope:SCOPE_SYS
407
407
; GFX12-NEXT: s_wait_loadcnt 0x0
408
- ; GFX12-NEXT: v_mov_b32_e32 v1 , 15
408
+ ; GFX12-NEXT: v_mov_b32_e32 v0 , 15
409
409
; GFX12-NEXT: s_wait_kmcnt 0x0
410
- ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
411
- ; GFX12-NEXT: s_and_b32 s0, s0, 15
412
- ; GFX12-NEXT: v_mov_b32_e32 v0, s1
410
+ ; GFX12-NEXT: s_and_b32 s1, s0, 15
413
411
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
414
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
415
- ; GFX12-NEXT: v_mov_b32_e32 v2, s0
416
- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
412
+ ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
413
+ ; GFX12-NEXT: s_addk_co_i32 s0, 0x100
414
+ ; GFX12-NEXT: s_addk_co_i32 s1, 0x100
415
+ ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
417
416
; GFX12-NEXT: s_wait_storecnt 0x0
418
- ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
417
+ ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
419
418
; GFX12-NEXT: s_wait_loadcnt 0x0
420
419
; GFX12-NEXT: s_endpgm
421
420
bb:
@@ -692,46 +691,44 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
692
691
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
693
692
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
694
693
; GFX940-NEXT: s_waitcnt vmcnt(0)
695
- ; GFX940-NEXT: v_mov_b32_e32 v0, s0
696
- ; GFX940-NEXT: s_movk_i32 s0, 0x4004
697
- ; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
694
+ ; GFX940-NEXT: s_addk_i32 s0, 0x4004
695
+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
698
696
; GFX940-NEXT: s_waitcnt vmcnt(0)
699
697
; GFX940-NEXT: s_endpgm
700
698
;
701
699
; GFX11-LABEL: store_load_sindex_large_offset_kernel:
702
700
; GFX11: ; %bb.0: ; %bb
703
701
; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x24
704
- ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
705
- ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
702
+ ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
703
+ ; GFX11-NEXT: s_waitcnt vmcnt(0)
704
+ ; GFX11-NEXT: v_mov_b32_e32 v0, 15
705
+ ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
706
706
; GFX11-NEXT: s_and_b32 s1, s0, 15
707
707
; GFX11-NEXT: s_lshl_b32 s0, s0, 2
708
708
; GFX11-NEXT: s_lshl_b32 s1, s1, 2
709
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
710
- ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
711
709
; GFX11-NEXT: s_addk_i32 s0, 0x4004
710
+ ; GFX11-NEXT: s_addk_i32 s1, 0x4004
712
711
; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
713
712
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
714
- ; GFX11-NEXT: s_movk_i32 s0, 0x4004
715
- ; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
713
+ ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
716
714
; GFX11-NEXT: s_waitcnt vmcnt(0)
717
715
; GFX11-NEXT: s_endpgm
718
716
;
719
717
; GFX12-LABEL: store_load_sindex_large_offset_kernel:
720
718
; GFX12: ; %bb.0: ; %bb
721
719
; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x24
722
- ; GFX12-NEXT: scratch_load_b32 v3 , off, off scope:SCOPE_SYS
720
+ ; GFX12-NEXT: scratch_load_b32 v0 , off, off scope:SCOPE_SYS
723
721
; GFX12-NEXT: s_wait_loadcnt 0x0
724
- ; GFX12-NEXT: v_mov_b32_e32 v1 , 15
722
+ ; GFX12-NEXT: v_mov_b32_e32 v0 , 15
725
723
; GFX12-NEXT: s_wait_kmcnt 0x0
726
- ; GFX12-NEXT: s_lshl_b32 s1, s0, 2
727
- ; GFX12-NEXT: s_and_b32 s0, s0, 15
728
- ; GFX12-NEXT: v_mov_b32_e32 v0, s1
724
+ ; GFX12-NEXT: s_and_b32 s1, s0, 15
729
725
; GFX12-NEXT: s_lshl_b32 s0, s0, 2
730
- ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
731
- ; GFX12-NEXT: v_mov_b32_e32 v2, s0
732
- ; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
726
+ ; GFX12-NEXT: s_lshl_b32 s1, s1, 2
727
+ ; GFX12-NEXT: s_addk_co_i32 s0, 0x4000
728
+ ; GFX12-NEXT: s_addk_co_i32 s1, 0x4000
729
+ ; GFX12-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
733
730
; GFX12-NEXT: s_wait_storecnt 0x0
734
- ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
731
+ ; GFX12-NEXT: scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
735
732
; GFX12-NEXT: s_wait_loadcnt 0x0
736
733
; GFX12-NEXT: s_endpgm
737
734
bb:
@@ -995,25 +992,28 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
995
992
; GFX940-LABEL: store_load_large_imm_offset_kernel:
996
993
; GFX940: ; %bb.0: ; %bb
997
994
; GFX940-NEXT: v_mov_b32_e32 v0, 13
995
+ ; GFX940-NEXT: s_movk_i32 s0, 0x3e80
998
996
; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
999
997
; GFX940-NEXT: s_waitcnt vmcnt(0)
1000
- ; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
1001
- ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1002
- ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
998
+ ; GFX940-NEXT: v_mov_b32_e32 v0, 15
999
+ ; GFX940-NEXT: s_add_i32 s0, s0, 4
1000
+ ; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
1003
1001
; GFX940-NEXT: s_waitcnt vmcnt(0)
1004
- ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
1002
+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
1005
1003
; GFX940-NEXT: s_waitcnt vmcnt(0)
1006
1004
; GFX940-NEXT: s_endpgm
1007
1005
;
1008
1006
; GFX11-LABEL: store_load_large_imm_offset_kernel:
1009
1007
; GFX11: ; %bb.0: ; %bb
1010
- ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
1011
- ; GFX11-NEXT: v_mov_b32_e32 v2, 15
1008
+ ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1009
+ ; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1010
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1011
+ ; GFX11-NEXT: s_add_i32 s0, s0, 4
1012
1012
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
1013
1013
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1014
- ; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:4 dlc
1014
+ ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
1015
1015
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1016
- ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
1016
+ ; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
1017
1017
; GFX11-NEXT: s_waitcnt vmcnt(0)
1018
1018
; GFX11-NEXT: s_endpgm
1019
1019
;
@@ -1075,26 +1075,31 @@ define void @store_load_large_imm_offset_foo() {
1075
1075
; GFX940: ; %bb.0: ; %bb
1076
1076
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1077
1077
; GFX940-NEXT: v_mov_b32_e32 v0, 13
1078
+ ; GFX940-NEXT: s_movk_i32 s0, 0x3e80
1079
+ ; GFX940-NEXT: s_add_i32 s1, s32, 4
1078
1080
; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
1079
1081
; GFX940-NEXT: s_waitcnt vmcnt(0)
1080
- ; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
1081
- ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1082
- ; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
1082
+ ; GFX940-NEXT: v_mov_b32_e32 v0, 15
1083
+ ; GFX940-NEXT: s_add_i32 s0, s0, s1
1084
+ ; GFX940-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
1083
1085
; GFX940-NEXT: s_waitcnt vmcnt(0)
1084
- ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
1086
+ ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
1085
1087
; GFX940-NEXT: s_waitcnt vmcnt(0)
1086
1088
; GFX940-NEXT: s_setpc_b64 s[30:31]
1087
1089
;
1088
1090
; GFX11-LABEL: store_load_large_imm_offset_foo:
1089
1091
; GFX11: ; %bb.0: ; %bb
1090
1092
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1091
- ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
1092
- ; GFX11-NEXT: v_mov_b32_e32 v2, 15
1093
+ ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1094
+ ; GFX11-NEXT: s_movk_i32 s0, 0x3e80
1095
+ ; GFX11-NEXT: s_add_i32 s1, s32, 4
1096
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1097
+ ; GFX11-NEXT: s_add_i32 s0, s0, s1
1093
1098
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
1094
1099
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1095
- ; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:4 dlc
1100
+ ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
1096
1101
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1097
- ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc
1102
+ ; GFX11-NEXT: scratch_load_b32 v0, off, s0 glc dlc
1098
1103
; GFX11-NEXT: s_waitcnt vmcnt(0)
1099
1104
; GFX11-NEXT: s_setpc_b64 s[30:31]
1100
1105
;
0 commit comments