Skip to content

Commit e4f714d

Browse files
committed
[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare
Whilst reviewing PR #109289 and doing some analysis with various tests involving predicated blocks I noticed that we're making codegen and performance worse by sinking vector comparisons multiple times into blocks. It looks like the sinkCmpExpression in CodeGenPrepare was written for scalar comparisons where there is only a single condition register, whereas vector comparisons typically produce a vector result. For some targets, such a NEON or SVE, there are multiple allocatable vector registers that can store the result and so we should avoid sinking in that case.
1 parent 608fee8 commit e4f714d

File tree

9 files changed

+76
-81
lines changed

9 files changed

+76
-81
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -497,10 +497,10 @@ class TargetLoweringBase {
497497
return true;
498498
}
499499

500-
/// Return true if multiple condition registers are available.
501-
bool hasMultipleConditionRegisters() const {
502-
return HasMultipleConditionRegisters;
503-
}
500+
/// Return true if multiple (allocatable) predicate registers are available
501+
/// for \p VT. If there is only a single register the code generator will
502+
/// sink comparisons into the blocks of their users.
503+
virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }
504504

505505
/// Return true if the target has BitExtract instructions.
506506
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
23892389
EVT VT) const {
23902390
// If a target has multiple condition registers, then it likely has logical
23912391
// operations on those registers.
2392-
if (hasMultipleConditionRegisters())
2392+
if (hasMultiplePredicateRegisters(MVT::i1))
23932393
return false;
23942394
// Only do the transform if the value won't be split into multiple
23952395
// registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
24962496
StackPointerRegisterToSaveRestore = R;
24972497
}
24982498

2499-
/// Tells the code generator that the target has multiple (allocatable)
2500-
/// condition registers that can be used to store the results of comparisons
2501-
/// for use by selects and conditional branches. With multiple condition
2502-
/// registers, the code generator will not aggressively sink comparisons into
2503-
/// the blocks of their users.
2504-
void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
2505-
HasMultipleConditionRegisters = hasManyRegs;
2506-
}
2507-
25082499
/// Tells the code generator that the target has BitExtract instructions.
25092500
/// The code generator will aggressively sink "shift"s into the blocks of
25102501
/// their users if the users will generate "and" instructions which can be
@@ -3473,13 +3464,6 @@ class TargetLoweringBase {
34733464
private:
34743465
const TargetMachine &TM;
34753466

3476-
/// Tells the code generator that the target has multiple (allocatable)
3477-
/// condition registers that can be used to store the results of comparisons
3478-
/// for use by selects and conditional branches. With multiple condition
3479-
/// registers, the code generator will not aggressively sink comparisons into
3480-
/// the blocks of their users.
3481-
bool HasMultipleConditionRegisters;
3482-
34833467
/// Tells the code generator that the target has BitExtract instructions.
34843468
/// The code generator will aggressively sink "shift"s into the blocks of
34853469
/// their users if the users will generate "and" instructions which can be

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1770,8 +1770,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
17701770
/// lose; some adjustment may be wanted there.
17711771
///
17721772
/// Return true if any changes are made.
1773-
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
1774-
if (TLI.hasMultipleConditionRegisters())
1773+
static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
1774+
const TargetLowering &TLI) {
1775+
EVT ResVT = TLI.getValueType(DL, Cmp->getType());
1776+
if (TLI.hasMultiplePredicateRegisters(ResVT))
17751777
return false;
17761778

17771779
// Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2176,7 +2178,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
21762178
}
21772179

21782180
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
2179-
if (sinkCmpExpression(Cmp, *TLI))
2181+
if (sinkCmpExpression(*DL, Cmp, *TLI))
21802182
return true;
21812183

21822184
if (combineToUAddWithOverflow(Cmp, ModifiedDT))

llvm/lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
625625
MaxGluedStoresPerMemcpy = 0;
626626
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
627627
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
628-
HasMultipleConditionRegisters = false;
629628
HasExtractBitsInsn = false;
630629
JumpIsExpensive = JumpIsExpensiveOverride;
631630
PredictableSelectIsExpensive = false;

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,6 +1346,10 @@ class AArch64TargetLowering : public TargetLowering {
13461346
unsigned getMinimumJumpTableEntries() const override;
13471347

13481348
bool softPromoteHalfType() const override { return true; }
1349+
1350+
virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
1351+
return VT.isVector();
1352+
}
13491353
};
13501354

13511355
namespace AArch64 {

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
583583
setSchedulingPreference(Sched::RegPressure);
584584
setJumpIsExpensive(true);
585585

586-
// FIXME: This is only partially true. If we have to do vector compares, any
587-
// SGPR pair can be a condition register. If we have a uniform condition, we
588-
// are better off doing SALU operations, where there is only one SCC. For now,
589-
// we don't have a way of knowing during instruction selection if a condition
590-
// will be uniform and we always use vector compares. Assume we are using
591-
// vector compares until that is fixed.
592-
setHasMultipleConditionRegisters(true);
593-
594586
setMinCmpXchgSizeInBits(32);
595587
setSupportsUnalignedAtomics(false);
596588

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
387387
MVT getFenceOperandTy(const DataLayout &DL) const override {
388388
return MVT::i32;
389389
}
390+
391+
virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
392+
// FIXME: This is only partially true. If we have to do vector compares,
393+
// any SGPR pair can be a condition register. If we have a uniform
394+
// condition, we are better off doing SALU operations, where there is only
395+
// one SCC. For now, we don't have a way of knowing during instruction
396+
// selection if a condition will be uniform and we always use vector
397+
// compares. Assume we are using vector compares until that is fixed.
398+
return true;
399+
}
390400
};
391401

392402
namespace AMDGPUISD {

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1461,10 +1461,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
14611461

14621462
// With 32 condition bits, we don't need to sink (and duplicate) compares
14631463
// aggressively in CodeGenPrep.
1464-
if (Subtarget.useCRBits()) {
1465-
setHasMultipleConditionRegisters();
1464+
if (Subtarget.useCRBits())
14661465
setJumpIsExpensive();
1467-
}
14681466

14691467
// TODO: The default entry number is set to 64. This stops most jump table
14701468
// generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19137,3 +19135,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
1913719135
return Builder.CreateOr(
1913819136
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
1913919137
}
19138+
19139+
bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
19140+
// With 32 condition bits, we don't need to sink (and duplicate) compares
19141+
// aggressively in CodeGenPrep.
19142+
return Subtarget.useCRBits();
19143+
}

llvm/lib/Target/PowerPC/PPCISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,8 @@ namespace llvm {
14921492
/// through to determine the optimal load/store instruction format.
14931493
unsigned computeMOFlags(const SDNode *Parent, SDValue N,
14941494
SelectionDAG &DAG) const;
1495+
1496+
virtual bool hasMultiplePredicateRegisters(EVT VT) const override;
14951497
}; // end class PPCTargetLowering
14961498

14971499
namespace PPC {

llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll

Lines changed: 43 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ target triple = "aarch64-unknown-linux-gnu"
66
define <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
77
; CHECK-LABEL: no_sink_simple:
88
; CHECK: // %bb.0:
9+
; CHECK-NEXT: cmgt v2.4s, v1.4s, v0.4s
10+
; CHECK-NEXT: xtn v2.4h, v2.4s
911
; CHECK-NEXT: tbz w0, #0, .LBB0_2
1012
; CHECK-NEXT: // %bb.1: // %s
11-
; CHECK-NEXT: cmgt v1.4s, v1.4s, v0.4s
13+
; CHECK-NEXT: sshll v1.4s, v2.4h, #0
1214
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
1315
; CHECK-NEXT: str q0, [x1]
1416
; CHECK-NEXT: ret
1517
; CHECK-NEXT: .LBB0_2: // %t
16-
; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s
18+
; CHECK-NEXT: sshll v0.4s, v2.4h, #0
1719
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
1820
; CHECK-NEXT: ret
1921
%d = icmp slt <4 x i32> %a, %b
@@ -32,68 +34,64 @@ t:
3234
define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
3335
; CHECK-LABEL: vector_loop_with_icmp:
3436
; CHECK: // %bb.0: // %entry
35-
; CHECK-NEXT: mov w8, #15 // =0xf
37+
; CHECK-NEXT: mov w9, #15 // =0xf
3638
; CHECK-NEXT: mov w10, #4 // =0x4
37-
; CHECK-NEXT: adrp x9, .LCPI1_0
39+
; CHECK-NEXT: adrp x8, .LCPI1_0
3840
; CHECK-NEXT: adrp x11, .LCPI1_1
39-
; CHECK-NEXT: dup v0.2d, x8
41+
; CHECK-NEXT: dup v0.2d, x9
4042
; CHECK-NEXT: dup v1.2d, x10
41-
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_0]
43+
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
4244
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI1_1]
43-
; CHECK-NEXT: add x9, x0, #8
44-
; CHECK-NEXT: mov w10, #16 // =0x10
45-
; CHECK-NEXT: mov w11, #1 // =0x1
45+
; CHECK-NEXT: add x8, x0, #8
46+
; CHECK-NEXT: mov w9, #16 // =0x10
47+
; CHECK-NEXT: mov w10, #1 // =0x1
4648
; CHECK-NEXT: b .LBB1_2
4749
; CHECK-NEXT: .LBB1_1: // %pred.store.continue18
4850
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
4951
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
5052
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
51-
; CHECK-NEXT: subs x10, x10, #4
52-
; CHECK-NEXT: add x9, x9, #16
53+
; CHECK-NEXT: subs x9, x9, #4
54+
; CHECK-NEXT: add x8, x8, #16
5355
; CHECK-NEXT: b.eq .LBB1_10
5456
; CHECK-NEXT: .LBB1_2: // %vector.body
5557
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
56-
; CHECK-NEXT: cmhi v4.2d, v0.2d, v3.2d
57-
; CHECK-NEXT: xtn v4.2s, v4.2d
58-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
59-
; CHECK-NEXT: umov w12, v4.h[0]
60-
; CHECK-NEXT: tbz w12, #0, .LBB1_4
61-
; CHECK-NEXT: // %bb.3: // %pred.store.if
58+
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
59+
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
60+
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
61+
; CHECK-NEXT: xtn v4.4h, v4.4s
62+
; CHECK-NEXT: umov w11, v4.h[0]
63+
; CHECK-NEXT: tbnz w11, #0, .LBB1_6
64+
; CHECK-NEXT: // %bb.3: // %pred.store.continue
6265
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
63-
; CHECK-NEXT: stur w11, [x9, #-8]
64-
; CHECK-NEXT: .LBB1_4: // %pred.store.continue
66+
; CHECK-NEXT: umov w11, v4.h[1]
67+
; CHECK-NEXT: tbnz w11, #0, .LBB1_7
68+
; CHECK-NEXT: .LBB1_4: // %pred.store.continue6
6569
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
66-
; CHECK-NEXT: dup v4.2d, x8
67-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v3.2d
68-
; CHECK-NEXT: xtn v4.2s, v4.2d
69-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
70-
; CHECK-NEXT: umov w12, v4.h[1]
71-
; CHECK-NEXT: tbz w12, #0, .LBB1_6
72-
; CHECK-NEXT: // %bb.5: // %pred.store.if5
70+
; CHECK-NEXT: umov w11, v4.h[2]
71+
; CHECK-NEXT: tbnz w11, #0, .LBB1_8
72+
; CHECK-NEXT: .LBB1_5: // %pred.store.continue8
7373
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
74-
; CHECK-NEXT: stur w11, [x9, #-4]
75-
; CHECK-NEXT: .LBB1_6: // %pred.store.continue6
74+
; CHECK-NEXT: umov w11, v4.h[3]
75+
; CHECK-NEXT: tbz w11, #0, .LBB1_1
76+
; CHECK-NEXT: b .LBB1_9
77+
; CHECK-NEXT: .LBB1_6: // %pred.store.if
7678
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
77-
; CHECK-NEXT: dup v4.2d, x8
78-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
79-
; CHECK-NEXT: xtn v4.2s, v4.2d
80-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
81-
; CHECK-NEXT: umov w12, v4.h[2]
82-
; CHECK-NEXT: tbz w12, #0, .LBB1_8
83-
; CHECK-NEXT: // %bb.7: // %pred.store.if7
79+
; CHECK-NEXT: stur w10, [x8, #-8]
80+
; CHECK-NEXT: umov w11, v4.h[1]
81+
; CHECK-NEXT: tbz w11, #0, .LBB1_4
82+
; CHECK-NEXT: .LBB1_7: // %pred.store.if5
8483
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
85-
; CHECK-NEXT: str w11, [x9]
86-
; CHECK-NEXT: .LBB1_8: // %pred.store.continue8
84+
; CHECK-NEXT: stur w10, [x8, #-4]
85+
; CHECK-NEXT: umov w11, v4.h[2]
86+
; CHECK-NEXT: tbz w11, #0, .LBB1_5
87+
; CHECK-NEXT: .LBB1_8: // %pred.store.if7
8788
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
88-
; CHECK-NEXT: dup v4.2d, x8
89-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v2.2d
90-
; CHECK-NEXT: xtn v4.2s, v4.2d
91-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
92-
; CHECK-NEXT: umov w12, v4.h[3]
93-
; CHECK-NEXT: tbz w12, #0, .LBB1_1
94-
; CHECK-NEXT: // %bb.9: // %pred.store.if9
89+
; CHECK-NEXT: str w10, [x8]
90+
; CHECK-NEXT: umov w11, v4.h[3]
91+
; CHECK-NEXT: tbz w11, #0, .LBB1_1
92+
; CHECK-NEXT: .LBB1_9: // %pred.store.if9
9593
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
96-
; CHECK-NEXT: str w11, [x9, #4]
94+
; CHECK-NEXT: str w10, [x8, #4]
9795
; CHECK-NEXT: b .LBB1_1
9896
; CHECK-NEXT: .LBB1_10: // %for.cond.cleanup
9997
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)