Skip to content

Commit fb27867

Browse files
authored
[AMDGPU] SIFoldOperands: Delay foldCopyToVGPROfScalarAddOfFrameIndex (#141558)
foldCopyToVGPROfScalarAddOfFrameIndex transforms s_adds whose results are copied to vector registers into v_adds. We don't want to do that if foldInstOperand (which, before this patch, ran later) can fold the sreg->vreg copy away instead. This patch therefore delays foldCopyToVGPROfScalarAddOfFrameIndex until after foldInstOperand. This avoids unnecessary v_mov_b32 instructions in the flat-scratch-svs.ll test and also avoids regressions in an upcoming patch to enable ISD::PTRADD nodes.
1 parent bf6cd24 commit fb27867

File tree

2 files changed

+17
-20
lines changed

2 files changed

+17
-20
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1909,10 +1909,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
19091909
if (!DstReg.isVirtual())
19101910
return false;
19111911

1912-
if (OpToFold.isReg() &&
1913-
foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
1914-
return true;
1915-
19161912
// Fold copy to AGPR through reg_sequence
19171913
// TODO: Handle with subregister extract
19181914
if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
@@ -1947,7 +1943,14 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
19471943
Changed = true;
19481944
}
19491945

1950-
return Changed;
1946+
if (Changed)
1947+
return true;
1948+
1949+
// Run this after foldInstOperand to avoid turning scalar additions into
1950+
// vector additions when the scalar result could just be folded into
1951+
// the user(s).
1952+
return OpToFold.isReg() &&
1953+
foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
19511954
}
19521955

19531956
// Clamp patterns are canonically selected to v_max_* instructions, so only

llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll

Lines changed: 9 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -182,8 +182,7 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
182182
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
183183
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
184184
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
185-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
186-
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
185+
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
187186
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
188187
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
189188
; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -356,8 +355,7 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
356355
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
357356
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
358357
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
359-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
360-
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2
358+
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
361359
; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 1, v0
362360
; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 2, v0
363361
; GFX942-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1
@@ -701,14 +699,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
701699
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
702700
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
703701
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
702+
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
704703
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
705704
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
706-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
707-
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
705+
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
708706
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
709707
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
710708
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
711-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
712709
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
713710
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
714711
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -884,14 +881,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
884881
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
885882
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
886883
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
884+
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
887885
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
888886
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 1
889-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
890-
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v2
887+
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
891888
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
892889
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
893890
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
894-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
895891
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
896892
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
897893
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -1239,14 +1235,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
12391235
; GFX942-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24
12401236
; GFX942-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
12411237
; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
1238+
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
12421239
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
12431240
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
1244-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, s0
1245-
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, v2
1241+
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 2, s0
12461242
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
12471243
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
12481244
; GFX942-SDAG-NEXT: v_add_u32_e32 v1, 2, v0
1249-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
12501245
; GFX942-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1
12511246
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
12521247
; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 4, v0
@@ -1425,8 +1420,7 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
14251420
; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
14261421
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
14271422
; GFX942-SDAG-NEXT: s_lshl_b32 s0, s0, 2
1428-
; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, s0
1429-
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, v3
1423+
; GFX942-SDAG-NEXT: v_mad_u32_u24 v0, v0, 4, s0
14301424
; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1
14311425
; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
14321426
; GFX942-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1

0 commit comments

Comments
 (0)