Skip to content

Commit 39e24bd

Browse files
authored
MachineLICM: Allow hoisting REG_SEQUENCE (#90638)
1 parent e83c6dd commit 39e24bd

File tree

5 files changed

+168
-26
lines changed

5 files changed

+168
-26
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,25 +1264,32 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
12641264

12651265
// If we have a COPY with other uses in the loop, hoist to allow the users to
12661266
// also be hoisted.
1267-
Register DefReg;
1268-
if (MI.isCopy() && (DefReg = MI.getOperand(0).getReg()).isVirtual() &&
1269-
MI.getOperand(1).getReg().isVirtual() &&
1270-
IsLoopInvariantInst(MI, CurLoop) &&
1271-
any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
1272-
[&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
1273-
if (!CurLoop->contains(&UseMI))
1274-
return false;
1275-
1276-
// COPY is a cheap instruction, but if moving it won't cause high
1277-
// RP we're fine to hoist it even if the user can't be hoisted
1278-
// later Otherwise we want to check the user if it's hoistable
1279-
if (CanCauseHighRegPressure(Cost, false) &&
1280-
!CurLoop->isLoopInvariant(UseMI, DefReg))
1281-
return false;
1282-
1283-
return true;
1284-
}))
1285-
return true;
1267+
// TODO: Handle all isCopyLike?
1268+
if (MI.isCopy() || MI.isRegSequence()) {
1269+
Register DefReg = MI.getOperand(0).getReg();
1270+
if (DefReg.isVirtual() &&
1271+
all_of(MI.uses(),
1272+
[](const MachineOperand &UseOp) {
1273+
return !UseOp.isReg() || UseOp.getReg().isVirtual();
1274+
}) &&
1275+
IsLoopInvariantInst(MI, CurLoop) &&
1276+
any_of(MRI->use_nodbg_instructions(DefReg),
1277+
[&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) {
1278+
if (!CurLoop->contains(&UseMI))
1279+
return false;
1280+
1281+
// COPY is a cheap instruction, but if moving it won't cause
1282+
// high RP we're fine to hoist it even if the user can't be
1283+
// hoisted later. Otherwise we want to check the user if it's
1284+
// hoistable
1285+
if (CanCauseHighRegPressure(Cost, false) &&
1286+
!CurLoop->isLoopInvariant(UseMI, DefReg))
1287+
return false;
1288+
1289+
return true;
1290+
}))
1291+
return true;
1292+
}
12861293

12871294
// High register pressure situation, only hoist if the instruction is going
12881295
// to be remat'ed.

llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8907,17 +8907,17 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
89078907
; SI: ; %bb.0: ; %entry
89088908
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
89098909
; SI-NEXT: s_waitcnt lgkmcnt(0)
8910-
; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x0
8910+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
89118911
; SI-NEXT: s_mov_b64 s[8:9], 0
89128912
; SI-NEXT: s_mov_b32 s7, 0xf000
89138913
; SI-NEXT: v_mov_b32_e32 v4, s3
89148914
; SI-NEXT: v_mov_b32_e32 v5, s2
8915-
; SI-NEXT: s_mov_b32 s5, s1
8916-
; SI-NEXT: s_mov_b32 s4, s0
89178915
; SI-NEXT: s_waitcnt lgkmcnt(0)
8918-
; SI-NEXT: v_mov_b32_e32 v2, s10
8919-
; SI-NEXT: v_mov_b32_e32 v3, s11
8916+
; SI-NEXT: v_mov_b32_e32 v2, s4
8917+
; SI-NEXT: v_mov_b32_e32 v3, s5
89208918
; SI-NEXT: s_mov_b32 s6, -1
8919+
; SI-NEXT: s_mov_b32 s4, s0
8920+
; SI-NEXT: s_mov_b32 s5, s1
89218921
; SI-NEXT: .LBB127_1: ; %atomicrmw.start
89228922
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
89238923
; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
2+
# RUN: llc -mtriple=amdgcn -run-pass=early-machinelicm -simplify-mir -o - %s | FileCheck %s
3+
4+
# Test that machine LICM hoists a loop-invariant REG_SEQUENCE (and
5+
# INSERT_SUBREG) out of the loop when all of its uses are virtual registers.
6+
7+
---
8+
name: licm_reg_sequence
9+
body: |
10+
; CHECK-LABEL: name: licm_reg_sequence
11+
; CHECK: bb.0:
12+
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
15+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
16+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
17+
; CHECK-NEXT: {{ $}}
18+
; CHECK-NEXT: bb.1:
19+
; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
20+
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
21+
; CHECK-NEXT: S_BRANCH %bb.2
22+
; CHECK-NEXT: {{ $}}
23+
; CHECK-NEXT: bb.2:
24+
; CHECK-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
25+
; CHECK-NEXT: S_ENDPGM 0
26+
bb.0:
27+
liveins: $vgpr0, $vgpr1
28+
successors: %bb.1
29+
30+
%0:vgpr_32 = COPY $vgpr0
31+
%1:vgpr_32 = COPY $vgpr1
32+
33+
bb.1:
34+
successors: %bb.1, %bb.2
35+
36+
%3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
37+
S_NOP 0, implicit %3
38+
S_CBRANCH_SCC1 %bb.1, implicit undef $scc
39+
S_BRANCH %bb.2
40+
41+
bb.2:
42+
$vgpr0 = COPY %3
43+
S_ENDPGM 0
44+
45+
...
46+
47+
# Don't bother handling reg_sequence with physreg uses (is there any
48+
# reason for these to be legal?)
49+
---
50+
name: licm_reg_sequence_physreg_use
51+
body: |
52+
; CHECK-LABEL: name: licm_reg_sequence_physreg_use
53+
; CHECK: bb.0:
54+
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
55+
; CHECK-NEXT: {{ $}}
56+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
57+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
58+
; CHECK-NEXT: {{ $}}
59+
; CHECK-NEXT: bb.1:
60+
; CHECK-NEXT: liveins: $vgpr0
61+
; CHECK-NEXT: {{ $}}
62+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, $vgpr1, %subreg.sub1
63+
; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
64+
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
65+
; CHECK-NEXT: S_BRANCH %bb.2
66+
; CHECK-NEXT: {{ $}}
67+
; CHECK-NEXT: bb.2:
68+
; CHECK-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
69+
; CHECK-NEXT: S_ENDPGM 0
70+
bb.0:
71+
liveins: $vgpr0, $vgpr1
72+
successors: %bb.1
73+
74+
%0:vgpr_32 = COPY $vgpr0
75+
%1:vgpr_32 = COPY $vgpr1
76+
77+
bb.1:
78+
successors: %bb.1, %bb.2
79+
liveins: $vgpr0
80+
81+
%3:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, $vgpr1, %subreg.sub1
82+
S_NOP 0, implicit %3
83+
S_CBRANCH_SCC1 %bb.1, implicit undef $scc
84+
S_BRANCH %bb.2
85+
86+
bb.2:
87+
$vgpr0 = COPY %3
88+
S_ENDPGM 0
89+
90+
...
91+
92+
---
93+
name: licm_insert_subreg
94+
body: |
95+
; CHECK-LABEL: name: licm_insert_subreg
96+
; CHECK: bb.0:
97+
; CHECK-NEXT: liveins: $vgpr0, $vgpr1
98+
; CHECK-NEXT: {{ $}}
99+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
100+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
101+
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
102+
; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.sub0
103+
; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vreg_64 = INSERT_SUBREG [[INSERT_SUBREG]], [[COPY1]], %subreg.sub1
104+
; CHECK-NEXT: {{ $}}
105+
; CHECK-NEXT: bb.1:
106+
; CHECK-NEXT: S_NOP 0, implicit [[INSERT_SUBREG1]]
107+
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
108+
; CHECK-NEXT: S_BRANCH %bb.2
109+
; CHECK-NEXT: {{ $}}
110+
; CHECK-NEXT: bb.2:
111+
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[INSERT_SUBREG1]]
112+
; CHECK-NEXT: S_ENDPGM 0
113+
bb.0:
114+
liveins: $vgpr0, $vgpr1
115+
successors: %bb.1
116+
117+
%0:vgpr_32 = COPY $vgpr0
118+
%1:vgpr_32 = COPY $vgpr1
119+
120+
bb.1:
121+
successors: %bb.1, %bb.2
122+
123+
%3:vreg_64 = IMPLICIT_DEF
124+
%4:vreg_64 = INSERT_SUBREG %3, %0, %subreg.sub0
125+
%5:vreg_64 = INSERT_SUBREG %4, %1, %subreg.sub1
126+
S_NOP 0, implicit %5
127+
S_CBRANCH_SCC1 %bb.1, implicit undef $scc
128+
S_BRANCH %bb.2
129+
130+
bb.2:
131+
$vgpr0_vgpr1 = COPY %5
132+
S_ENDPGM 0
133+
134+
...

llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
88
; GCN-NEXT: s_mov_b32 s7, 0xf000
99
; GCN-NEXT: s_mov_b32 s10, -1
1010
; GCN-NEXT: s_mov_b32 s6, 0
11+
; GCN-NEXT: s_mov_b32 s11, s7
1112
; GCN-NEXT: s_waitcnt lgkmcnt(0)
12-
; GCN-NEXT: s_mov_b32 s9, s5
1313
; GCN-NEXT: s_mov_b32 s8, s4
14+
; GCN-NEXT: s_mov_b32 s9, s5
1415
; GCN-NEXT: v_mov_b32_e32 v0, 0
1516
; GCN-NEXT: s_branch .LBB0_2
1617
; GCN-NEXT: .LBB0_1: ; %loop.exit.guard
@@ -20,7 +21,6 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
2021
; GCN-NEXT: .LBB0_2: ; %bb1
2122
; GCN-NEXT: ; =>This Loop Header: Depth=1
2223
; GCN-NEXT: ; Child Loop BB0_4 Depth 2
23-
; GCN-NEXT: s_mov_b32 s11, s7
2424
; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0
2525
; GCN-NEXT: s_waitcnt vmcnt(0)
2626
; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v1

llvm/test/CodeGen/Hexagon/expand-vstorerw-undef.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ b18: ; preds = %b16, %b7
6969
br label %b22
7070

7171
b21: ; preds = %b22
72+
store volatile <64 x i32> %v20, ptr null
7273
tail call void @sammy() #3
7374
br label %b7
7475

0 commit comments

Comments
 (0)