Skip to content

Commit 4de421e

Browse files
committed
[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare
Whilst reviewing PR #109289 and doing some analysis with various tests involving predicated blocks, I noticed that we're making codegen and performance worse by sinking vector comparisons multiple times into blocks. It looks like sinkCmpExpression in CodeGenPrepare was written for scalar comparisons, where there is only a single condition register, whereas vector comparisons typically produce a vector result. For some targets, such as NEON or SVE, there are multiple allocatable vector registers that can store the result, and so we should avoid sinking in that case.
1 parent 6b93bd0 commit 4de421e

File tree

9 files changed

+72
-79
lines changed

9 files changed

+72
-79
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -497,10 +497,10 @@ class TargetLoweringBase {
497497
return true;
498498
}
499499

500-
/// Return true if multiple condition registers are available.
501-
bool hasMultipleConditionRegisters() const {
502-
return HasMultipleConditionRegisters;
503-
}
500+
/// Return true if multiple (allocatable) predicate registers are available
501+
/// for \p VT. If there is only a single register the code generator will
502+
/// sink comparisons into the blocks of their users.
503+
virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }
504504

505505
/// Return true if the target has BitExtract instructions.
506506
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
23892389
EVT VT) const {
23902390
// If a target has multiple condition registers, then it likely has logical
23912391
// operations on those registers.
2392-
if (hasMultipleConditionRegisters())
2392+
if (hasMultiplePredicateRegisters(VT))
23932393
return false;
23942394
// Only do the transform if the value won't be split into multiple
23952395
// registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
24962496
StackPointerRegisterToSaveRestore = R;
24972497
}
24982498

2499-
/// Tells the code generator that the target has multiple (allocatable)
2500-
/// condition registers that can be used to store the results of comparisons
2501-
/// for use by selects and conditional branches. With multiple condition
2502-
/// registers, the code generator will not aggressively sink comparisons into
2503-
/// the blocks of their users.
2504-
void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
2505-
HasMultipleConditionRegisters = hasManyRegs;
2506-
}
2507-
25082499
/// Tells the code generator that the target has BitExtract instructions.
25092500
/// The code generator will aggressively sink "shift"s into the blocks of
25102501
/// their users if the users will generate "and" instructions which can be
@@ -3470,13 +3461,6 @@ class TargetLoweringBase {
34703461
private:
34713462
const TargetMachine &TM;
34723463

3473-
/// Tells the code generator that the target has multiple (allocatable)
3474-
/// condition registers that can be used to store the results of comparisons
3475-
/// for use by selects and conditional branches. With multiple condition
3476-
/// registers, the code generator will not aggressively sink comparisons into
3477-
/// the blocks of their users.
3478-
bool HasMultipleConditionRegisters;
3479-
34803464
/// Tells the code generator that the target has BitExtract instructions.
34813465
/// The code generator will aggressively sink "shift"s into the blocks of
34823466
/// their users if the users will generate "and" instructions which can be

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,8 +1771,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
17711771
/// lose; some adjustment may be wanted there.
17721772
///
17731773
/// Return true if any changes are made.
1774-
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
1775-
if (TLI.hasMultipleConditionRegisters())
1774+
static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
1775+
const TargetLowering &TLI) {
1776+
EVT ResVT = TLI.getValueType(DL, Cmp->getType());
1777+
if (TLI.hasMultiplePredicateRegisters(ResVT))
17761778
return false;
17771779

17781780
// Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2137,7 +2139,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
21372139
}
21382140

21392141
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
2140-
if (sinkCmpExpression(Cmp, *TLI))
2142+
if (sinkCmpExpression(*DL, Cmp, *TLI))
21412143
return true;
21422144

21432145
if (combineToUAddWithOverflow(Cmp, ModifiedDT))

llvm/lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
625625
MaxGluedStoresPerMemcpy = 0;
626626
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
627627
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
628-
HasMultipleConditionRegisters = false;
629628
HasExtractBitsInsn = false;
630629
JumpIsExpensive = JumpIsExpensiveOverride;
631630
PredictableSelectIsExpensive = false;

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,6 +1358,10 @@ class AArch64TargetLowering : public TargetLowering {
13581358
unsigned getMinimumJumpTableEntries() const override;
13591359

13601360
bool softPromoteHalfType() const override { return true; }
1361+
1362+
virtual bool hasMultiplePredicateRegisters(EVT VT) const {
1363+
return VT.isVector();
1364+
}
13611365
};
13621366

13631367
namespace AArch64 {

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
583583
setSchedulingPreference(Sched::RegPressure);
584584
setJumpIsExpensive(true);
585585

586-
// FIXME: This is only partially true. If we have to do vector compares, any
587-
// SGPR pair can be a condition register. If we have a uniform condition, we
588-
// are better off doing SALU operations, where there is only one SCC. For now,
589-
// we don't have a way of knowing during instruction selection if a condition
590-
// will be uniform and we always use vector compares. Assume we are using
591-
// vector compares until that is fixed.
592-
setHasMultipleConditionRegisters(true);
593-
594586
setMinCmpXchgSizeInBits(32);
595587
setSupportsUnalignedAtomics(false);
596588

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
387387
MVT getFenceOperandTy(const DataLayout &DL) const override {
388388
return MVT::i32;
389389
}
390+
391+
virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
392+
// FIXME: This is only partially true. If we have to do vector compares,
393+
// any SGPR pair can be a condition register. If we have a uniform
394+
// condition, we are better off doing SALU operations, where there is only
395+
// one SCC. For now, we don't have a way of knowing during instruction
396+
// selection if a condition will be uniform and we always use vector
397+
// compares. Assume we are using vector compares until that is fixed.
398+
return true;
399+
}
390400
};
391401

392402
namespace AMDGPUISD {

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,10 +1454,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
14541454

14551455
// With 32 condition bits, we don't need to sink (and duplicate) compares
14561456
// aggressively in CodeGenPrep.
1457-
if (Subtarget.useCRBits()) {
1458-
setHasMultipleConditionRegisters();
1457+
if (Subtarget.useCRBits())
14591458
setJumpIsExpensive();
1460-
}
14611459

14621460
// TODO: The default entry number is set to 64. This stops most jump table
14631461
// generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19044,3 +19042,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
1904419042
return Builder.CreateOr(
1904519043
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
1904619044
}
19045+
19046+
bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
19047+
// With 32 condition bits, we don't need to sink (and duplicate) compares
19048+
// aggressively in CodeGenPrep.
19049+
return Subtarget.useCRBits();
19050+
}

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1490,6 +1490,8 @@ namespace llvm {
14901490
/// through to determine the optimal load/store instruction format.
14911491
unsigned computeMOFlags(const SDNode *Parent, SDValue N,
14921492
SelectionDAG &DAG) const;
1493+
1494+
virtual bool hasMultiplePredicateRegisters(EVT VT) const override;
14931495
}; // end class PPCTargetLowering
14941496

14951497
namespace PPC {

llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll

Lines changed: 39 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,68 +6,64 @@ target triple = "aarch64-unknown-linux-gnu"
66
define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
77
; CHECK-LABEL: vector_loop_with_icmp:
88
; CHECK: // %bb.0: // %entry
9-
; CHECK-NEXT: mov w8, #15 // =0xf
9+
; CHECK-NEXT: mov w9, #15 // =0xf
1010
; CHECK-NEXT: mov w10, #4 // =0x4
11-
; CHECK-NEXT: adrp x9, .LCPI0_0
11+
; CHECK-NEXT: adrp x8, .LCPI0_0
1212
; CHECK-NEXT: adrp x11, .LCPI0_1
13-
; CHECK-NEXT: dup v0.2d, x8
13+
; CHECK-NEXT: dup v0.2d, x9
1414
; CHECK-NEXT: dup v1.2d, x10
15-
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_0]
15+
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
1616
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI0_1]
17-
; CHECK-NEXT: add x9, x0, #8
18-
; CHECK-NEXT: mov w10, #16 // =0x10
19-
; CHECK-NEXT: mov w11, #1 // =0x1
17+
; CHECK-NEXT: add x8, x0, #8
18+
; CHECK-NEXT: mov w9, #16 // =0x10
19+
; CHECK-NEXT: mov w10, #1 // =0x1
2020
; CHECK-NEXT: b .LBB0_2
2121
; CHECK-NEXT: .LBB0_1: // %pred.store.continue18
2222
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
2323
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
2424
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
25-
; CHECK-NEXT: subs x10, x10, #4
26-
; CHECK-NEXT: add x9, x9, #16
25+
; CHECK-NEXT: subs x9, x9, #4
26+
; CHECK-NEXT: add x8, x8, #16
2727
; CHECK-NEXT: b.eq .LBB0_10
2828
; CHECK-NEXT: .LBB0_2: // %vector.body
2929
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
30-
; CHECK-NEXT: cmhi v4.2d, v0.2d, v3.2d
31-
; CHECK-NEXT: xtn v4.2s, v4.2d
32-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
33-
; CHECK-NEXT: umov w12, v4.h[0]
34-
; CHECK-NEXT: tbz w12, #0, .LBB0_4
35-
; CHECK-NEXT: // %bb.3: // %pred.store.if
30+
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
31+
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
32+
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
33+
; CHECK-NEXT: xtn v4.4h, v4.4s
34+
; CHECK-NEXT: umov w11, v4.h[0]
35+
; CHECK-NEXT: tbnz w11, #0, .LBB0_6
36+
; CHECK-NEXT: // %bb.3: // %pred.store.continue
3637
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
37-
; CHECK-NEXT: stur w11, [x9, #-8]
38-
; CHECK-NEXT: .LBB0_4: // %pred.store.continue
38+
; CHECK-NEXT: umov w11, v4.h[1]
39+
; CHECK-NEXT: tbnz w11, #0, .LBB0_7
40+
; CHECK-NEXT: .LBB0_4: // %pred.store.continue6
3941
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
40-
; CHECK-NEXT: dup v4.2d, x8
41-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v3.2d
42-
; CHECK-NEXT: xtn v4.2s, v4.2d
43-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
44-
; CHECK-NEXT: umov w12, v4.h[1]
45-
; CHECK-NEXT: tbz w12, #0, .LBB0_6
46-
; CHECK-NEXT: // %bb.5: // %pred.store.if5
42+
; CHECK-NEXT: umov w11, v4.h[2]
43+
; CHECK-NEXT: tbnz w11, #0, .LBB0_8
44+
; CHECK-NEXT: .LBB0_5: // %pred.store.continue8
4745
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
48-
; CHECK-NEXT: stur w11, [x9, #-4]
49-
; CHECK-NEXT: .LBB0_6: // %pred.store.continue6
46+
; CHECK-NEXT: umov w11, v4.h[3]
47+
; CHECK-NEXT: tbz w11, #0, .LBB0_1
48+
; CHECK-NEXT: b .LBB0_9
49+
; CHECK-NEXT: .LBB0_6: // %pred.store.if
5050
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
51-
; CHECK-NEXT: dup v4.2d, x8
52-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
53-
; CHECK-NEXT: xtn v4.2s, v4.2d
54-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
55-
; CHECK-NEXT: umov w12, v4.h[2]
56-
; CHECK-NEXT: tbz w12, #0, .LBB0_8
57-
; CHECK-NEXT: // %bb.7: // %pred.store.if7
51+
; CHECK-NEXT: stur w10, [x8, #-8]
52+
; CHECK-NEXT: umov w11, v4.h[1]
53+
; CHECK-NEXT: tbz w11, #0, .LBB0_4
54+
; CHECK-NEXT: .LBB0_7: // %pred.store.if5
5855
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
59-
; CHECK-NEXT: str w11, [x9]
60-
; CHECK-NEXT: .LBB0_8: // %pred.store.continue8
56+
; CHECK-NEXT: stur w10, [x8, #-4]
57+
; CHECK-NEXT: umov w11, v4.h[2]
58+
; CHECK-NEXT: tbz w11, #0, .LBB0_5
59+
; CHECK-NEXT: .LBB0_8: // %pred.store.if7
6160
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
62-
; CHECK-NEXT: dup v4.2d, x8
63-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
64-
; CHECK-NEXT: xtn v4.2s, v4.2d
65-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
66-
; CHECK-NEXT: umov w12, v4.h[3]
67-
; CHECK-NEXT: tbz w12, #0, .LBB0_1
68-
; CHECK-NEXT: // %bb.9: // %pred.store.if9
61+
; CHECK-NEXT: str w10, [x8]
62+
; CHECK-NEXT: umov w11, v4.h[3]
63+
; CHECK-NEXT: tbz w11, #0, .LBB0_1
64+
; CHECK-NEXT: .LBB0_9: // %pred.store.if9
6965
; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1
70-
; CHECK-NEXT: str w11, [x9, #4]
66+
; CHECK-NEXT: str w10, [x8, #4]
7167
; CHECK-NEXT: b .LBB0_1
7268
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
7369
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)