Skip to content

Commit 5911fbb

Browse files
authored
AMDGPU: Do not fold copy to physreg from operation on frame index (#115977)
1 parent 2baead0 commit 5911fbb

File tree

3 files changed

+88
-15
lines changed

3 files changed

+88
-15
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1599,10 +1599,6 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
15991599
if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
16001600
return false;
16011601

1602-
if (OpToFold.isReg() &&
1603-
foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
1604-
return true;
1605-
16061602
// Prevent folding operands backwards in the function. For example,
16071603
// the COPY opcode must not be replaced by 1 in this example:
16081604
//
@@ -1612,6 +1608,10 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
16121608
if (!DstReg.isVirtual())
16131609
return false;
16141610

1611+
if (OpToFold.isReg() &&
1612+
foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
1613+
return true;
1614+
16151615
bool Changed = foldInstOperand(MI, OpToFold);
16161616

16171617
// If we managed to fold all uses of this copy then we might as well

llvm/test/CodeGen/AMDGPU/captured-frame-index.ll

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,23 @@ define amdgpu_kernel void @kernel_alloca_offset_use_asm_vgpr() {
325325
ret void
326326
}
327327

328+
; GCN-LABEL: {{^}}live_out_physreg_copy_add_fi:
329+
; GCN: s_or_b32 [[FI:s[0-9]+]], s{{[0-9]+}}, 4
330+
; GCN: v_mov_b32_e32 v0, [[FI]]
331+
; GCN: v_mov_b32_e32 v1
332+
; GCN: s_swappc_b64
333+
define void @live_out_physreg_copy_add_fi(ptr %fptr) #2 {
334+
bb:
335+
%alloca = alloca [4 x i32], align 16, addrspace(5)
336+
%addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
337+
%getelementptr = getelementptr i8, ptr %addrspacecast, i64 4
338+
call void %fptr(ptr %getelementptr) #2
339+
ret void
340+
}
341+
328342
declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #1
329343
declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #1
330344

331345
attributes #0 = { nounwind }
332346
attributes #1 = { argmemonly nounwind }
347+
attributes #2 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

llvm/test/CodeGen/AMDGPU/fold-operands-s-add-copy-to-vgpr.mir

Lines changed: 69 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,17 +120,10 @@ stack:
120120
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
121121
body: |
122122
bb.0:
123-
; GFX8-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
124-
; GFX8: $vgpr0 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def dead $vcc, implicit $exec
125-
; GFX8-NEXT: SI_RETURN implicit $vgpr0
126-
;
127-
; GFX9-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
128-
; GFX9: $vgpr0 = V_ADD_U32_e32 128, %stack.0, implicit $exec
129-
; GFX9-NEXT: SI_RETURN implicit $vgpr0
130-
;
131-
; GFX10-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
132-
; GFX10: $vgpr0 = V_ADD_U32_e32 128, %stack.0, implicit $exec
133-
; GFX10-NEXT: SI_RETURN implicit $vgpr0
123+
; CHECK-LABEL: name: fold_s_add_i32__mov_fi_const_copy_to_phys_vgpr
124+
; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def dead $scc
125+
; CHECK-NEXT: $vgpr0 = COPY [[S_ADD_I32_]]
126+
; CHECK-NEXT: SI_RETURN implicit $vgpr0
134127
%0:sreg_32 = S_MOV_B32 %stack.0
135128
%1:sreg_32 = S_ADD_I32 %0, 128, implicit-def dead $scc
136129
$vgpr0 = COPY %1
@@ -535,3 +528,68 @@ body: |
535528
%2:vgpr_32 = COPY %1
536529
SI_RETURN implicit %2
537530
...
531+
532+
# Physreg copy of %2 to $vgpr0 should not be erased
533+
---
534+
name: fold_fi_into_s_or_b32_user_is_physreg_copy
535+
tracksRegLiveness: true
536+
stack:
537+
- { id: 0, size: 16, alignment: 16 }
538+
machineFunctionInfo:
539+
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
540+
frameOffsetReg: '$sgpr33'
541+
stackPtrOffsetReg: '$sgpr32'
542+
body: |
543+
; CHECK-LABEL: name: fold_fi_into_s_or_b32_user_is_physreg_copy
544+
; CHECK: bb.0:
545+
; CHECK-NEXT: successors: %bb.1(0x80000000)
546+
; CHECK-NEXT: liveins: $vgpr0_vgpr1
547+
; CHECK-NEXT: {{ $}}
548+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
549+
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 4, implicit-def dead $scc
550+
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
551+
; CHECK-NEXT: {{ $}}
552+
; CHECK-NEXT: bb.1:
553+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
554+
; CHECK-NEXT: {{ $}}
555+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]].sub0, implicit $exec
556+
; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[COPY]].sub1, implicit $exec
557+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
558+
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE]], [[COPY]], implicit $exec
559+
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U64_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
560+
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
561+
; CHECK-NEXT: $vgpr0 = COPY [[S_ADD_I32_]]
562+
; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL [[REG_SEQUENCE]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
563+
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
564+
; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
565+
; CHECK-NEXT: SI_WATERFALL_LOOP %bb.1, implicit $exec
566+
; CHECK-NEXT: {{ $}}
567+
; CHECK-NEXT: bb.2:
568+
; CHECK-NEXT: $exec = S_MOV_B64 [[S_MOV_B64_]]
569+
; CHECK-NEXT: SI_RETURN
570+
bb.0:
571+
liveins: $vgpr0_vgpr1
572+
573+
%0:vreg_64 = COPY $vgpr0_vgpr1
574+
%1:sreg_32 = S_MOV_B32 %stack.0
575+
%2:sreg_32 = S_ADD_I32 killed %1, 4, implicit-def dead $scc
576+
%3:sreg_64_xexec = S_MOV_B64 $exec
577+
578+
bb.1:
579+
%4:sgpr_32 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec
580+
%5:sgpr_32 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec
581+
%6:sgpr_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1
582+
%7:sreg_64_xexec = V_CMP_EQ_U64_e64 %6, %0, implicit $exec
583+
%8:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed %7, implicit-def $exec, implicit-def $scc, implicit $exec
584+
ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
585+
$vgpr0 = COPY %2
586+
$sgpr30_sgpr31 = SI_CALL %6, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0
587+
ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
588+
$exec = S_XOR_B64_term $exec, %8, implicit-def $scc
589+
SI_WATERFALL_LOOP %bb.1, implicit $exec
590+
591+
bb.2:
592+
$exec = S_MOV_B64 %3
593+
SI_RETURN
594+
595+
...

0 commit comments

Comments
 (0)