Skip to content

Commit f0c57c9

Browse files
committed
AMDGPU: Default to selecting frame indexes to SGPRs
Only select to a VGPR if it's trivially used in VGPR-only contexts. This fixes mishandling of frame indexes used in SGPR-only contexts, like inline assembly constraints. This is suboptimal in the common case where the frame index is transitively used by only VALU ops. We make up for this by later folding the copy to VALU plus scalar op in SIFoldOperands.
1 parent 4fb43c4 commit f0c57c9

19 files changed

+884
-988
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2175,8 +2175,11 @@ foreach vt = [i32, p3, p5, p6, p2] in {
21752175
>;
21762176
}
21772177

2178+
// FIXME: The register bank of the frame index should depend on the
2179+
// users, and transitive users of the add. We may require an
2180+
// unnecessary copy from SGPR to VGPR.
21782181
def : GCNPat <
2179-
(p5 frameindex:$fi),
2182+
(VGPRImm<(p5 frameindex)>:$fi),
21802183
(V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
21812184
>;
21822185

llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -364,9 +364,10 @@ entry:
364364

365365
; FUNC-LABEL: ptrtoint:
366366
; SI-NOT: ds_write
367+
; SI: s_add_i32 [[S_ADD_OFFSET:s[0-9]+]], s{{[0-9]+}}, 5
367368
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
368-
; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5,
369-
; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ;
369+
; SI: v_mov_b32_e32 [[V_ADD_OFFSET:v[0-9]+]], [[S_ADD_OFFSET]]
370+
; SI: buffer_load_dword v{{[0-9]+}}, [[V_ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ;
370371
define amdgpu_kernel void @ptrtoint(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
371372
%alloca = alloca [16 x i32], addrspace(5)
372373
%tmp0 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a

llvm/test/CodeGen/AMDGPU/captured-frame-index.ll

Lines changed: 31 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -147,19 +147,14 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(ptr addrspace(1)
147147

148148
; GCN-LABEL: {{^}}kernel_stored_fi_to_global_huge_frame_offset:
149149
; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
150-
; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
151150

152-
; FIXME: Re-initialize
153-
; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}}
151+
; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
154152

155153
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
156-
; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
157-
154+
; GCN-DAG: v_mov_b32_e32 [[V_BASE_1_OFF:v[0-9]+]], 0x4000{{$}}
155+
; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
158156

159-
; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]]
160-
; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
161-
162-
; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
157+
; GCN: buffer_store_dword [[V_BASE_1_OFF]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
163158
define amdgpu_kernel void @kernel_stored_fi_to_global_huge_frame_offset(ptr addrspace(1) %ptr) #0 {
164159
%tmp0 = alloca [4096 x i32], addrspace(5)
165160
%tmp1 = alloca [4096 x i32], addrspace(5)
@@ -171,20 +166,20 @@ define amdgpu_kernel void @kernel_stored_fi_to_global_huge_frame_offset(ptr addr
171166
ret void
172167
}
173168

169+
; FIXME: Shift of SP repeated twice
174170
; GCN-LABEL: {{^}}func_stored_fi_to_global_huge_frame_offset:
175-
; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
171+
; GCN-DAG: v_lshr_b32_e64 [[FI_TMP_0:v[0-9]+]], s32, 6
172+
; GCN-DAG: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
176173
; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4{{$}}
177174

178-
; GCN: v_lshr_b32_e64 [[FI_TMP:v[0-9]+]], s32, 6
179-
; GCN: v_add_i32_e32 [[BASE_0_1:v[0-9]+]], vcc, 4, [[FI_TMP]]{{$}}
180175

176+
; GCN-DAG: v_add_i32_e32 [[FI_0:v[0-9]+]], vcc, 0x4000, [[FI_TMP_0]]{{$}}
181177
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
182-
; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
183178

184-
; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]]
185-
; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
186-
187-
; GCN: buffer_store_dword [[BASE_1_OFF_2]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
179+
; GCN: buffer_store_dword [[K]], [[FI_0]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
180+
; GCN: v_lshr_b32_e64 [[FI_TMP_1:v[0-9]+]], s32, 6
181+
; GCN: v_add_i32_e32 [[BASE_0_1:v[0-9]+]], vcc, 60, [[FI_TMP_1]]{{$}}
182+
; GCN: buffer_store_dword [[BASE_0_1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
188183
define void @func_stored_fi_to_global_huge_frame_offset(ptr addrspace(1) %ptr) #0 {
189184
%tmp0 = alloca [4096 x i32], addrspace(5)
190185
%tmp1 = alloca [4096 x i32], addrspace(5)
@@ -217,9 +212,9 @@ entry:
217212
ret void
218213
}
219214

220-
; FIXME: This is broken, and the sgpr input just gets replaced with a VGPR
221215
; GCN-LABEL: {{^}}func_alloca_offset0__use_asm_sgpr:
222-
; GCN: v_lshr_b32_e64 [[FI:v[0-9]+]], s32, 6
216+
; GCN: s_lshr_b32 [[FI:s[0-9]+]], s32, 6
217+
; GCN-NOT: [[FI]]
223218
; GCN: ; use [[FI]]
224219
define void @func_alloca_offset0__use_asm_sgpr() {
225220
%alloca = alloca i32, addrspace(5)
@@ -238,9 +233,9 @@ define void @func_alloca_offset0__use_asm_vgpr() {
238233
}
239234

240235
; GCN-LABEL: {{^}}func_alloca_offset0__use_asm_phys_sgpr:
241-
; GCN: s_lshr_b32 s8, s32, 6
236+
; GCN: s_lshr_b32 [[FI:s[0-9]+]], s32, 6
242237
; GCN-NEXT: ;;#ASMSTART
243-
; GCN-NEXT: ; use s8
238+
; GCN-NEXT: ; use [[FI]]
244239
define void @func_alloca_offset0__use_asm_phys_sgpr() {
245240
%alloca = alloca i32, addrspace(5)
246241
call void asm sideeffect "; use $0", "{s8}"(ptr addrspace(5) %alloca)
@@ -258,12 +253,11 @@ define void @func_alloca_offset0__use_asm_phys_vgpr() {
258253
}
259254

260255
; GCN-LABEL: {{^}}func_alloca_offset_use_asm_sgpr:
261-
; GCN: v_lshr_b32_e64 [[FI0_TMP0:v[0-9]+]], s32, 6
262-
; GCN-NEXT: v_add_i32_e32 [[FI0:v[0-9]+]], vcc, 16, [[FI0_TMP0]]
256+
; GCN: s_lshr_b32 [[FI0_TMP0:s[0-9]+]], s32, 6
257+
; GCN-NEXT: s_add_i32 [[FI0:s[0-9]+]], [[FI0_TMP0]], 16
263258

264-
; GCN: v_lshr_b32_e64 [[TMP:v[0-9]+]], s32, 6
265-
; GCN-NEXT: s_movk_i32 vcc_lo, 0x4010
266-
; GCN-NEXT: v_add_i32_e32 [[TMP]], vcc, vcc_lo, [[TMP]]
259+
; GCN: s_lshr_b32 [[TMP:s[0-9]+]], s32, 6
260+
; GCN-NEXT: s_addk_i32 [[TMP]], 0x4010
267261
; GCN-NEXT: ;;#ASMSTART
268262
; GCN: ; use [[TMP]]
269263
define void @func_alloca_offset_use_asm_sgpr() {
@@ -274,19 +268,17 @@ define void @func_alloca_offset_use_asm_sgpr() {
274268
ret void
275269
}
276270

277-
; FIXME: Shouldn't need to materialize constant
278271
; GCN-LABEL: {{^}}func_alloca_offset_use_asm_vgpr:
279-
; GCN: v_lshr_b32_e64 [[FI0_TMP:v[0-9]+]], s32, 6
280-
; GCN-NEXT: v_add_i32_e32 [[FI0:v[0-9]+]], vcc, 16, [[FI0_TMP]]
272+
; GCN: s_lshr_b32 [[S_FI:s[0-9]+]], s32, 6
273+
; GCN: v_lshr_b32_e64 [[V_FI:v[0-9]+]], s32, 6
274+
; GCN: s_movk_i32 vcc_lo, 0x4010
275+
; GCN: s_add_i32 [[S_FI]], [[S_FI]], 16
281276
; GCN-NEXT: ;;#ASMSTART
282-
; GCN-NEXT: ; use [[FI0]]
277+
; GCN-NEXT: ; use [[S_FI]]
283278
; GCN-NEXT: ;;#ASMEND
284-
285-
; GCN: v_lshr_b32_e64 [[FI1_TMP:v[0-9]+]], s32, 6
286-
; GCN-NEXT: s_movk_i32 vcc_lo, 0x4010
287-
; GCN-NEXT: v_add_i32_e32 [[FI1:v[0-9]+]], vcc, vcc_lo, [[FI1_TMP]]
279+
; GCN-NEXT: v_add_i32_e32 [[V_FI:v[0-9]+]], vcc, vcc_lo, [[V_FI]]
288280
; GCN-NEXT: ;;#ASMSTART
289-
; GCN-NEXT: ; use [[FI1]]
281+
; GCN-NEXT: ; use [[V_FI]]
290282
; GCN-NEXT: ;;#ASMEND
291283
define void @func_alloca_offset_use_asm_vgpr() {
292284
%alloca0 = alloca [4096 x i32], align 16, addrspace(5)
@@ -296,17 +288,15 @@ define void @func_alloca_offset_use_asm_vgpr() {
296288
ret void
297289
}
298290

299-
; FIXME: Using VGPR for SGPR input
300291
; GCN-LABEL: {{^}}kernel_alloca_offset_use_asm_sgpr:
301-
; GCN: v_mov_b32_e32 v0, 16
292+
; GCN: s_mov_b32 [[FI0:s[0-9]+]], 16
302293
; GCN-NOT: v0
303294
; GCN: ;;#ASMSTART
304-
; GCN-NEXT: ; use v0
295+
; GCN-NEXT: ; use [[FI0]]
305296
; GCN-NEXT: ;;#ASMEND
306-
307-
; GCN: v_mov_b32_e32 v0, 0x4010
297+
; GCN: s_movk_i32 [[FI1:s[0-9]+]], 0x4010
308298
; GCN-NEXT: ;;#ASMSTART
309-
; GCN-NEXT: ; use v0
299+
; GCN-NEXT: ; use [[FI1]]
310300
; GCN-NEXT: ;;#ASMEND
311301
define amdgpu_kernel void @kernel_alloca_offset_use_asm_sgpr() {
312302
%alloca0 = alloca [4096 x i32], align 16, addrspace(5)

llvm/test/CodeGen/AMDGPU/commute-compares.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -699,8 +699,8 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp
699699
; GCN-LABEL: {{^}}commute_frameindex:
700700
; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
701701

702-
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
703-
; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
702+
; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}}
703+
; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
704704
define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 {
705705
entry:
706706
%stack0 = alloca i32, addrspace(5)

llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll

Lines changed: 55 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,16 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
1515
; GFX940-SDAG-LABEL: soff1_voff1:
1616
; GFX940-SDAG: ; %bb.0: ; %bb
1717
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
18-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
1918
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
20-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
19+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
2120
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
22-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
23-
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0
24-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0
25-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
21+
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
22+
; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
23+
; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
24+
; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
2625
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
27-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
28-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
29-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
26+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2
27+
; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1
3028
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
3129
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
3230
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
@@ -145,18 +143,17 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
145143
; GFX940-SDAG-LABEL: soff1_voff2:
146144
; GFX940-SDAG: ; %bb.0: ; %bb
147145
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
148-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
149146
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
150-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
147+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
151148
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
152-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
153-
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1
154-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0
155-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
149+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0
150+
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2
151+
; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
152+
; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
153+
; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
156154
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
157-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
158-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
159-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
155+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2
156+
; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1
160157
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
161158
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
162159
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
@@ -282,18 +279,17 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
282279
; GFX940-SDAG-LABEL: soff1_voff4:
283280
; GFX940-SDAG: ; %bb.0: ; %bb
284281
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
285-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
286282
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
287-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
283+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
288284
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
289-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
290-
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1
291-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0
292-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
285+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0
286+
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2
287+
; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
288+
; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
289+
; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
293290
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
294-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
295-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
296-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
291+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2
292+
; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1
297293
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
298294
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
299295
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
@@ -419,19 +415,17 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
419415
; GFX940-SDAG-LABEL: soff2_voff1:
420416
; GFX940-SDAG: ; %bb.0: ; %bb
421417
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
422-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
423418
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
424-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
419+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
425420
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
426421
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1
427-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
428-
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0
429-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0
430-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
422+
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
423+
; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
424+
; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
425+
; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
431426
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
432-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
433-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
434-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
427+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2
428+
; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1
435429
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
436430
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
437431
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
@@ -556,14 +550,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
556550
; GFX940-SDAG-LABEL: soff2_voff2:
557551
; GFX940-SDAG: ; %bb.0: ; %bb
558552
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
559-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
560553
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
561-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
554+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
562555
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
563556
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1
564-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
565-
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1
566-
; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1
557+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0
558+
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2
559+
; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
567560
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
568561
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
569562
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
@@ -698,14 +691,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
698691
; GFX940-SDAG-LABEL: soff2_voff4:
699692
; GFX940-SDAG: ; %bb.0: ; %bb
700693
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
701-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
702694
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
703-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
695+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
704696
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
705697
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1
706-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
707-
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1
708-
; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1
698+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0
699+
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2
700+
; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
709701
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
710702
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
711703
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
@@ -840,19 +832,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
840832
; GFX940-SDAG-LABEL: soff4_voff1:
841833
; GFX940-SDAG: ; %bb.0: ; %bb
842834
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
843-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
844835
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
845-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
836+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
846837
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
847838
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2
848-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
849-
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0
850-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0
851-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
839+
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0
840+
; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
841+
; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
842+
; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
852843
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
853-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
854-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
855-
; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
844+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2
845+
; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1
856846
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
857847
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
858848
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4
@@ -977,14 +967,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
977967
; GFX940-SDAG-LABEL: soff4_voff2:
978968
; GFX940-SDAG: ; %bb.0: ; %bb
979969
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
980-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
981970
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
982-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
971+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
983972
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
984973
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2
985-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
986-
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1
987-
; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1
974+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0
975+
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2
976+
; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
988977
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
989978
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
990979
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
@@ -1119,17 +1108,16 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
11191108
; GFX940-SDAG-LABEL: soff4_voff4:
11201109
; GFX940-SDAG: ; %bb.0: ; %bb
11211110
; GFX940-SDAG-NEXT: s_load_dword s0, s[2:3], 0x24
1122-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0
11231111
; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1124-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1
1112+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1
1113+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2
11251114
; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0)
11261115
; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2
1127-
; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1
1128-
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1
1129-
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2
1130-
; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1
1116+
; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s0
1117+
; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v3
1118+
; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
11311119
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
1132-
; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:2 sc0 sc1
1120+
; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1
11331121
; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0)
11341122
; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
11351123
; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4

0 commit comments

Comments
 (0)