Skip to content

Commit 56ad6d1

Browse files
[MachineLICM] Hoist COPY instruction only when user can be hoisted (#81735)
befa925 introduced preliminary hoisting of COPY instructions when the user of the COPY is inside the same loop. That optimization appeared to be too aggressive and hoisted too many COPY's greatly increasing register pressure causing performance regressions for AMDGPU target. This is intended to fix the regression by hoisting COPY instruction only if either: - User of COPY can be hoisted (other args are invariant) or - Hoisting COPY doesn't bring high register pressure
1 parent d273a19 commit 56ad6d1

File tree

4 files changed

+118
-6
lines changed

4 files changed

+118
-6
lines changed

llvm/include/llvm/CodeGen/MachineLoopInfo.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ class MachineLoop : public LoopBase<MachineBasicBlock, MachineLoop> {
7979
/// I.e., all virtual register operands are defined outside of the loop,
8080
/// physical registers aren't accessed explicitly, and there are no side
8181
/// effects that aren't captured by the operands or other flags.
82-
bool isLoopInvariant(MachineInstr &I) const;
82+
/// ExcludeReg can be used to exclude the given register from the check
83+
/// i.e. when we're considering hoisting it's definition but not hoisted it
84+
/// yet
85+
bool isLoopInvariant(MachineInstr &I, const Register ExcludeReg = 0) const;
8386

8487
void dump() const;
8588

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,13 +1264,24 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
12641264

12651265
// If we have a COPY with other uses in the loop, hoist to allow the users to
12661266
// also be hoisted.
1267+
Register DefReg;
12671268
if (MI.isCopy() && MI.getOperand(0).isReg() &&
1268-
MI.getOperand(0).getReg().isVirtual() && MI.getOperand(1).isReg() &&
1269-
MI.getOperand(1).getReg().isVirtual() &&
1269+
(DefReg = MI.getOperand(0).getReg()).isVirtual() &&
1270+
MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
12701271
IsLoopInvariantInst(MI, CurLoop) &&
12711272
any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
1272-
[&CurLoop](MachineInstr &UseMI) {
1273-
return CurLoop->contains(&UseMI);
1273+
[&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
1274+
if (!CurLoop->contains(&UseMI))
1275+
return false;
1276+
1277+
// COPY is a cheap instruction, but if moving it won't cause high
1278+
// RP we're fine to hoist it even if the user can't be hoisted
1279+
// later Otherwise we want to check the user if it's hoistable
1280+
if (CanCauseHighRegPressure(Cost, false) &&
1281+
!CurLoop->isLoopInvariant(UseMI, DefReg))
1282+
return false;
1283+
1284+
return true;
12741285
}))
12751286
return true;
12761287

llvm/lib/CodeGen/MachineLoopInfo.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ MDNode *MachineLoop::getLoopID() const {
198198
return LoopID;
199199
}
200200

201-
bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
201+
bool MachineLoop::isLoopInvariant(MachineInstr &I,
202+
const Register ExcludeReg) const {
202203
MachineFunction *MF = I.getParent()->getParent();
203204
MachineRegisterInfo *MRI = &MF->getRegInfo();
204205
const TargetSubtargetInfo &ST = MF->getSubtarget();
@@ -213,6 +214,9 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
213214
Register Reg = MO.getReg();
214215
if (Reg == 0) continue;
215216

217+
if (ExcludeReg == Reg)
218+
continue;
219+
216220
// An instruction that uses or defines a physical register can't e.g. be
217221
// hoisted, so mark this as not invariant.
218222
if (Reg.isPhysical()) {
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
; NOTE: There must be no spill reload inside the loop starting with LBB0_1:
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s
3+
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32"
4+
target triple = "amdgcn-amd-amdhsa"
5+
6+
define amdgpu_kernel void @foo(ptr %.sroa.1.0.copyload, ptr %0, ptr %1, ptr %2, ptr %3, ptr %4, ptr %5, ptr %6, ptr %7, ptr %8, ptr %9, ptr %10, ptr %11, ptr %12, ptr %13, ptr %14, ptr %15, ptr %16, ptr %17, ptr %18, ptr %19, ptr %20, ptr %21, ptr %22, ptr %23, ptr %24, ptr %25, ptr %26, ptr %27, ptr %28, ptr %29, ptr %30, ptr %31, ptr %32, ptr %33, double %34, double %35, double %36, float %37, float %38, float %39, float %40, ptr %41) {
7+
; CHECK-LABEL: foo:
8+
; CHECK-LABEL: .LBB0_1
9+
; CHECK-NOT: buffer_load_dword {{v[0-9]+}}
10+
11+
.lr.ph:
12+
%.pre = load double, ptr null, align 8
13+
br label %42
14+
15+
42: ; preds = %42, %.lr.ph
16+
%.0.i4402 = phi i32 [ 1, %.lr.ph ], [ 0, %42 ]
17+
%43 = zext i32 %.0.i4402 to i64
18+
%44 = load double, ptr %2, align 8
19+
%45 = load double, ptr %4, align 8
20+
%46 = load double, ptr %7, align 8
21+
%47 = load double, ptr %13, align 8
22+
%48 = load double, ptr %15, align 8
23+
%49 = load double, ptr %17, align 8
24+
%50 = load double, ptr %19, align 8
25+
%51 = load double, ptr %18, align 8
26+
%52 = load double, ptr %27, align 8
27+
%53 = load double, ptr %23, align 8
28+
%54 = load double, ptr %31, align 8
29+
%55 = load double, ptr %33, align 8
30+
%56 = load double, ptr %25, align 8
31+
%57 = load double, ptr %16, align 8
32+
%58 = fpext float %40 to double
33+
%59 = fmul double %52, %58
34+
%60 = fadd double %59, %51
35+
%61 = fsub double %60, %48
36+
%62 = fmul double 0.000000e+00, %36
37+
%63 = fsub double %61, %62
38+
%64 = fadd double %49, %63
39+
%65 = fptrunc double %64 to float
40+
%66 = fsub double 0.000000e+00, %34
41+
%67 = fpext float %39 to double
42+
%68 = fmul double %53, %67
43+
%69 = fsub double %66, %68
44+
%70 = fadd double %50, %69
45+
%71 = fptrunc double %70 to float
46+
store float 0.000000e+00, ptr %30, align 4
47+
store float 0.000000e+00, ptr %26, align 4
48+
%72 = getelementptr float, ptr %41, i64 %43
49+
store float %38, ptr %72, align 4
50+
store float %65, ptr %29, align 4
51+
store float %71, ptr %14, align 4
52+
store float %39, ptr %3, align 4
53+
store float %39, ptr %11, align 4
54+
%73 = fsub double %46, %44
55+
%74 = fptrunc double %73 to float
56+
%75 = fsub double %47, %45
57+
%76 = fptrunc double %75 to float
58+
%77 = fadd float %74, %76
59+
%78 = fpext float %37 to double
60+
%79 = fmul contract double %56, 0.000000e+00
61+
%80 = fsub contract double %34, %79
62+
%81 = fpext float %77 to double
63+
%82 = fmul double %.pre, %81
64+
%83 = fsub double %80, %82
65+
%84 = fpext float %38 to double
66+
%85 = fmul double %57, %84
67+
%86 = fsub double %83, %85
68+
%87 = fptrunc double %86 to float
69+
%88 = fmul double %34, 0.000000e+00
70+
%89 = fmul double %54, %78
71+
%90 = fadd double %89, %88
72+
%91 = fsub double %90, %55
73+
%92 = fmul double 0.000000e+00, %35
74+
%93 = fsub double %91, %92
75+
%94 = fmul double %34, %34
76+
%95 = fadd double %93, %94
77+
%96 = fptrunc double %95 to float
78+
store float %87, ptr %1, align 4
79+
store float %37, ptr %21, align 4
80+
store float %96, ptr %0, align 4
81+
store float 0.000000e+00, ptr %9, align 4
82+
store float 0.000000e+00, ptr %32, align 4
83+
store float 0.000000e+00, ptr %20, align 4
84+
store float 0.000000e+00, ptr %22, align 4
85+
store float 0.000000e+00, ptr %5, align 4
86+
store float 0.000000e+00, ptr %28, align 4
87+
store float 0.000000e+00, ptr %12, align 4
88+
store float 0.000000e+00, ptr %6, align 4
89+
store float 0.000000e+00, ptr %8, align 4
90+
store float 0.000000e+00, ptr %.sroa.1.0.copyload, align 4
91+
store float %37, ptr %10, align 4
92+
store float 0.000000e+00, ptr %24, align 4
93+
br label %42
94+
}

0 commit comments

Comments
 (0)