Skip to content

Commit fb59ac6

Browse files
committed
[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare
Whilst reviewing PR #109289 and doing some analysis with various tests involving predicated blocks I noticed that we're making codegen and performance worse by sinking vector comparisons multiple times into blocks. It looks like the sinkCmpExpression in CodeGenPrepare was written for scalar comparisons where there is only a single condition register, whereas vector comparisons typically produce a vector result and register pressure is much lower. Given they are likely to be more expensive than scalar comparisons it makes sense to avoid sinking too many. The CodeGen/SystemZ/vec-perm-14.ll test does rely upon sinking a vector comparison so I've kept that behaviour by permitting one sink. Alternatively, I could also introduce a TLI hook to query the target if this is a preferred solution?
1 parent deba620 commit fb59ac6

File tree

4 files changed

+296
-312
lines changed

4 files changed

+296
-312
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1779,29 +1779,35 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
17791779
if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
17801780
return false;
17811781

1782-
// Only insert a cmp in each block once.
1783-
DenseMap<BasicBlock *, CmpInst *> InsertedCmps;
1782+
// Collect a list of non-phis users that are in blocks that are different to
1783+
// the definition block.
1784+
BasicBlock *DefBB = Cmp->getParent();
1785+
SmallSet<User *, 4> Users;
1786+
for (auto *U : Cmp->users()) {
1787+
Instruction *User = cast<Instruction>(U);
1788+
if (isa<PHINode>(User))
1789+
continue;
17841790

1785-
bool MadeChange = false;
1786-
for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
1787-
UI != E;) {
1788-
Use &TheUse = UI.getUse();
1789-
Instruction *User = cast<Instruction>(*UI);
1791+
if (User->getParent() == DefBB)
1792+
continue;
17901793

1791-
// Preincrement use iterator so we don't invalidate it.
1792-
++UI;
1794+
Users.insert(User);
1795+
}
17931796

1794-
// Don't bother for PHI nodes.
1795-
if (isa<PHINode>(User))
1796-
continue;
1797+
// If this is a vector comparison the result will likely not depend upon
1798+
// setting a condition register, and it's probably too expensive to sink too
1799+
// many times.
1800+
VectorType *VecType = dyn_cast<VectorType>(Cmp->getType());
1801+
if (VecType && VecType->getElementCount().isVector() && Users.size() > 1)
1802+
return false;
17971803

1798-
// Figure out which BB this cmp is used in.
1799-
BasicBlock *UserBB = User->getParent();
1800-
BasicBlock *DefBB = Cmp->getParent();
1804+
// Only insert a cmp in each block once.
1805+
DenseMap<BasicBlock *, CmpInst *> InsertedCmps;
18011806

1802-
// If this user is in the same block as the cmp, don't change the cmp.
1803-
if (UserBB == DefBB)
1804-
continue;
1807+
bool MadeChange = false;
1808+
for (auto *U : Users) {
1809+
Instruction *UI = cast<Instruction>(U);
1810+
BasicBlock *UserBB = UI->getParent();
18051811

18061812
// If we have already inserted a cmp into this block, use it.
18071813
CmpInst *&InsertedCmp = InsertedCmps[UserBB];
@@ -1816,10 +1822,15 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
18161822
InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
18171823
}
18181824

1819-
// Replace a use of the cmp with a use of the new cmp.
1820-
TheUse = InsertedCmp;
1825+
// Replace all uses of the cmp with a use of the new cmp and update the
1826+
// number of uses.
1827+
for (unsigned I = 0; I < U->getNumOperands(); I++)
1828+
if (U->getOperand(I) == Cmp) {
1829+
U->setOperand(I, InsertedCmp);
1830+
NumCmpUses++;
1831+
}
1832+
18211833
MadeChange = true;
1822-
++NumCmpUses;
18231834
}
18241835

18251836
// If we removed all uses, nuke the cmp.

llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll

Lines changed: 39 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,68 +6,64 @@ target triple = "aarch64-unknown-linux-gnu"
66
define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
77
; CHECK-LABEL: vector_loop_with_icmp:
88
; CHECK: // %bb.0: // %entry
9-
; CHECK-NEXT: mov w8, #15 // =0xf
9+
; CHECK-NEXT: mov w9, #15 // =0xf
1010
; CHECK-NEXT: mov w10, #4 // =0x4
11-
; CHECK-NEXT: adrp x9, .LCPI0_0
11+
; CHECK-NEXT: adrp x8, .LCPI0_0
1212
; CHECK-NEXT: adrp x11, .LCPI0_1
13-
; CHECK-NEXT: dup v0.2d, x8
13+
; CHECK-NEXT: dup v0.2d, x9
1414
; CHECK-NEXT: dup v1.2d, x10
15-
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
15+
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
1616
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI0_1]
17-
; CHECK-NEXT: add x9, x0, #8
18-
; CHECK-NEXT: mov w10, #16 // =0x10
19-
; CHECK-NEXT: mov w11, #1 // =0x1
17+
; CHECK-NEXT: add x8, x0, #8
18+
; CHECK-NEXT: mov w9, #16 // =0x10
19+
; CHECK-NEXT: mov w10, #1 // =0x1
2020
; CHECK-NEXT: b .LBB0_2
2121
; CHECK-NEXT: .LBB0_1: // %pred.store.continue18
2222
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
2323
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
2424
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
25-
; CHECK-NEXT: subs x10, x10, #4
26-
; CHECK-NEXT: add x9, x9, #16
25+
; CHECK-NEXT: subs x9, x9, #4
26+
; CHECK-NEXT: add x8, x8, #16
2727
; CHECK-NEXT: b.eq .LBB0_10
2828
; CHECK-NEXT: .LBB0_2: // %vector.body
2929
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
30-
; CHECK-NEXT: cmhi v4.2d, v0.2d, v3.2d
31-
; CHECK-NEXT: xtn v4.2s, v4.2d
32-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
33-
; CHECK-NEXT: umov w12, v4.h[0]
34-
; CHECK-NEXT: tbz w12, #0, .LBB0_4
35-
; CHECK-NEXT: // %bb.3: // %pred.store.if
30+
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
31+
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
32+
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
33+
; CHECK-NEXT: xtn v4.4h, v4.4s
34+
; CHECK-NEXT: umov w11, v4.h[0]
35+
; CHECK-NEXT: tbnz w11, #0, .LBB0_6
36+
; CHECK-NEXT: // %bb.3: // %pred.store.continue
3637
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
37-
; CHECK-NEXT: stur w11, [x9, #-8]
38-
; CHECK-NEXT: .LBB0_4: // %pred.store.continue
38+
; CHECK-NEXT: umov w11, v4.h[1]
39+
; CHECK-NEXT: tbnz w11, #0, .LBB0_7
40+
; CHECK-NEXT: .LBB0_4: // %pred.store.continue6
3941
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
40-
; CHECK-NEXT: dup v4.2d, x8
41-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v3.2d
42-
; CHECK-NEXT: xtn v4.2s, v4.2d
43-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
44-
; CHECK-NEXT: umov w12, v4.h[1]
45-
; CHECK-NEXT: tbz w12, #0, .LBB0_6
46-
; CHECK-NEXT: // %bb.5: // %pred.store.if5
42+
; CHECK-NEXT: umov w11, v4.h[2]
43+
; CHECK-NEXT: tbnz w11, #0, .LBB0_8
44+
; CHECK-NEXT: .LBB0_5: // %pred.store.continue8
4745
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
48-
; CHECK-NEXT: stur w11, [x9, #-4]
49-
; CHECK-NEXT: .LBB0_6: // %pred.store.continue6
46+
; CHECK-NEXT: umov w11, v4.h[3]
47+
; CHECK-NEXT: tbz w11, #0, .LBB0_1
48+
; CHECK-NEXT: b .LBB0_9
49+
; CHECK-NEXT: .LBB0_6: // %pred.store.if
5050
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
51-
; CHECK-NEXT: dup v4.2d, x8
52-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
53-
; CHECK-NEXT: xtn v4.2s, v4.2d
54-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
55-
; CHECK-NEXT: umov w12, v4.h[2]
56-
; CHECK-NEXT: tbz w12, #0, .LBB0_8
57-
; CHECK-NEXT: // %bb.7: // %pred.store.if7
51+
; CHECK-NEXT: stur w10, [x8, #-8]
52+
; CHECK-NEXT: umov w11, v4.h[1]
53+
; CHECK-NEXT: tbz w11, #0, .LBB0_4
54+
; CHECK-NEXT: .LBB0_7: // %pred.store.if5
5855
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
59-
; CHECK-NEXT: str w11, [x9]
60-
; CHECK-NEXT: .LBB0_8: // %pred.store.continue8
56+
; CHECK-NEXT: stur w10, [x8, #-4]
57+
; CHECK-NEXT: umov w11, v4.h[2]
58+
; CHECK-NEXT: tbz w11, #0, .LBB0_5
59+
; CHECK-NEXT: .LBB0_8: // %pred.store.if7
6160
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
62-
; CHECK-NEXT: dup v4.2d, x8
63-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
64-
; CHECK-NEXT: xtn v4.2s, v4.2d
65-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
66-
; CHECK-NEXT: umov w12, v4.h[3]
67-
; CHECK-NEXT: tbz w12, #0, .LBB0_1
68-
; CHECK-NEXT: // %bb.9: // %pred.store.if9
61+
; CHECK-NEXT: str w10, [x8]
62+
; CHECK-NEXT: umov w11, v4.h[3]
63+
; CHECK-NEXT: tbz w11, #0, .LBB0_1
64+
; CHECK-NEXT: .LBB0_9: // %pred.store.if9
6965
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
70-
; CHECK-NEXT: str w11, [x9, #4]
66+
; CHECK-NEXT: str w10, [x8, #4]
7167
; CHECK-NEXT: b .LBB0_1
7268
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
7369
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)