[CodeGen] Avoid sinking vector comparisons during CodeGenPrepare #113158

Status: Closed (wants to merge 2 commits)
26 changes: 5 additions & 21 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -497,10 +497,10 @@ class TargetLoweringBase {
return true;
}

/// Return true if multiple condition registers are available.
bool hasMultipleConditionRegisters() const {
return HasMultipleConditionRegisters;
}
/// Return true if multiple (allocatable) predicate registers are available
/// for \p VT. If there is only a single register, the code generator will
/// sink comparisons into the blocks of their users.
virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }

/// Return true if the target has BitExtract instructions.
bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
EVT VT) const {
// If a target has multiple condition registers, then it likely has logical
// operations on those registers.
if (hasMultipleConditionRegisters())
if (hasMultiplePredicateRegisters(MVT::i1))
return false;
// Only do the transform if the value won't be split into multiple
// registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
StackPointerRegisterToSaveRestore = R;
}

/// Tells the code generator that the target has multiple (allocatable)
/// condition registers that can be used to store the results of comparisons
/// for use by selects and conditional branches. With multiple condition
/// registers, the code generator will not aggressively sink comparisons into
/// the blocks of their users.
void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
HasMultipleConditionRegisters = hasManyRegs;
}

/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
@@ -3473,13 +3464,6 @@
private:
const TargetMachine &TM;

/// Tells the code generator that the target has multiple (allocatable)
/// condition registers that can be used to store the results of comparisons
/// for use by selects and conditional branches. With multiple condition
/// registers, the code generator will not aggressively sink comparisons into
/// the blocks of their users.
bool HasMultipleConditionRegisters;

/// Tells the code generator that the target has BitExtract instructions.
/// The code generator will aggressively sink "shift"s into the blocks of
/// their users if the users will generate "and" instructions which can be
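For a target that previously called the removed setter, the migration is a one-line override. A minimal sketch, assuming a hypothetical out-of-tree MyTargetLowering (the class name is illustrative and not part of this patch):

```cpp
#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {

// Hypothetical downstream target, shown only to illustrate the migration.
class MyTargetLowering : public TargetLowering {
public:
  using TargetLowering::TargetLowering;

  // Before this patch the constructor called
  // setHasMultipleConditionRegisters(true); the same all-or-nothing answer
  // is now returned per value type through the virtual hook.
  bool hasMultiplePredicateRegisters(EVT VT) const override { return true; }
};

} // namespace llvm
```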
8 changes: 5 additions & 3 deletions llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1770,8 +1770,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
/// lose; some adjustment may be wanted there.
///
/// Return true if any changes are made.
static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
if (TLI.hasMultipleConditionRegisters())
static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
const TargetLowering &TLI) {
EVT ResVT = TLI.getValueType(DL, Cmp->getType());
if (TLI.hasMultiplePredicateRegisters(ResVT))
return false;

// Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2176,7 +2178,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
}

bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
if (sinkCmpExpression(*DL, Cmp, *TLI))
return true;

if (combineToUAddWithOverflow(Cmp, ModifiedDT))
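Since the hook is now keyed on the comparison's result type, the decision differs between scalar and vector compares on a target whose override returns VT.isVector(), as AArch64's does below. A rough sketch of the types involved; the helper function is illustrative, not code from the patch:

```cpp
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Sketch: mirrors the per-type decision sinkCmpExpression now makes.
static void traceSinkDecision(const TargetLowering &TLI, const DataLayout &DL,
                              LLVMContext &Ctx) {
  // A scalar icmp produces i1; a 4-wide vector icmp produces <4 x i1>.
  EVT ScalarResVT = TLI.getValueType(DL, Type::getInt1Ty(Ctx));
  EVT VectorResVT =
      TLI.getValueType(DL, FixedVectorType::get(Type::getInt1Ty(Ctx), 4));

  // With an override that returns VT.isVector():
  bool SinkScalar = !TLI.hasMultiplePredicateRegisters(ScalarResVT); // true
  bool SinkVector = !TLI.hasMultiplePredicateRegisters(VectorResVT); // false
  (void)SinkScalar;
  (void)SinkVector;
}
```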
1 change: 0 additions & 1 deletion llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
MaxGluedStoresPerMemcpy = 0;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1346,6 +1346,10 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMinimumJumpTableEntries() const override;

bool softPromoteHalfType() const override { return true; }

virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
return VT.isVector();
}
};

namespace AArch64 {
8 changes: 0 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);

// FIXME: This is only partially true. If we have to do vector compares, any
// SGPR pair can be a condition register. If we have a uniform condition, we
// are better off doing SALU operations, where there is only one SCC. For now,
// we don't have a way of knowing during instruction selection if a condition
// will be uniform and we always use vector compares. Assume we are using
// vector compares until that is fixed.
setHasMultipleConditionRegisters(true);

setMinCmpXchgSizeInBits(32);
setSupportsUnalignedAtomics(false);

10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
}

virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
// FIXME: This is only partially true. If we have to do vector compares,
// any SGPR pair can be a condition register. If we have a uniform
// condition, we are better off doing SALU operations, where there is only
// one SCC. For now, we don't have a way of knowing during instruction
// selection if a condition will be uniform and we always use vector
// compares. Assume we are using vector compares until that is fixed.
return true;
}
};

namespace AMDGPUISD {

Contributor review comment: AMDGPU part looks fine. In future we will probably want to address this FIXME by passing in a node, not just a type.
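The review suggestion above would mean passing the comparison itself, not just its type, so AMDGPU could answer per instruction instead of pessimistically assuming a vector compare. A purely speculative sketch of such a variant; neither this signature nor the isUniform() query exist in this patch:

```cpp
// Speculative future shape of the hook: with the comparison available, a
// uniformity query (e.g. UniformityInfo) could distinguish the two cases
// described in the FIXME. isUniform() is a stand-in, not an existing API.
bool hasMultiplePredicateRegisters(const Instruction *Cmp, EVT VT) const {
  if (isUniform(Cmp))
    return false; // uniform condition: SALU compare, only the single SCC
  return true;    // divergent condition: any SGPR pair can hold the result
}
```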
10 changes: 7 additions & 3 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1461,10 +1461,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,

// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
if (Subtarget.useCRBits()) {
setHasMultipleConditionRegisters();
if (Subtarget.useCRBits())
setJumpIsExpensive();
}

// TODO: The default entry number is set to 64. This stops most jump table
// generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19137,3 +19135,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}

bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
return Subtarget.useCRBits();
}
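For PowerPC the change is behaviour-preserving: the constructor no longer latches a flag, and the same subtarget predicate is evaluated on demand, independent of the value type since all 32 CR bits are allocatable either way. A side-by-side sketch (the commented-out lines show the pre-patch code):

```cpp
// Before: a one-time flag, latched while constructing the lowering object.
//   if (Subtarget.useCRBits()) {
//     setHasMultipleConditionRegisters();
//     setJumpIsExpensive();
//   }

// After: the same answer, computed on demand for any value type.
bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
  return Subtarget.useCRBits();
}
```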
2 changes: 2 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1492,6 +1492,8 @@ namespace llvm {
/// through to determine the optimal load/store instruction format.
unsigned computeMOFlags(const SDNode *Parent, SDValue N,
SelectionDAG &DAG) const;

virtual bool hasMultiplePredicateRegisters(EVT VT) const override;
}; // end class PPCTargetLowering

namespace PPC {
151 changes: 151 additions & 0 deletions llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
@@ -0,0 +1,151 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
; CHECK-LABEL: no_sink_simple:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v2.4s, v1.4s, v0.4s
; CHECK-NEXT: xtn v2.4h, v2.4s
; CHECK-NEXT: tbz w0, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %s
; CHECK-NEXT: sshll v1.4s, v2.4h, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_2: // %t
; CHECK-NEXT: sshll v0.4s, v2.4h, #0
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
%d = icmp slt <4 x i32> %a, %b
br i1 %c, label %s, label %t

s:
%s1 = select <4 x i1> %d, <4 x i32> %a, <4 x i32> zeroinitializer
store <4 x i32> %s1, ptr %p
ret <4 x i32> %s1

t:
%s2 = select <4 x i1> %d, <4 x i32> %b, <4 x i32> zeroinitializer
ret <4 x i32> %s2
}

define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
; CHECK-LABEL: vector_loop_with_icmp:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w9, #15 // =0xf
; CHECK-NEXT: mov w10, #4 // =0x4
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: adrp x11, .LCPI1_1
; CHECK-NEXT: dup v0.2d, x9
; CHECK-NEXT: dup v1.2d, x10
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: ldr q3, [x11, :lo12:.LCPI1_1]
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: mov w10, #1 // =0x1
; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_1: // %pred.store.continue18
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
; CHECK-NEXT: subs x9, x9, #4
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: b.eq .LBB1_10
; CHECK-NEXT: .LBB1_2: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmhi v4.2d, v0.2d, v2.2d
; CHECK-NEXT: cmhi v5.2d, v0.2d, v3.2d
; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
; CHECK-NEXT: xtn v4.4h, v4.4s
; CHECK-NEXT: umov w11, v4.h[0]
; CHECK-NEXT: tbnz w11, #0, .LBB1_6
; CHECK-NEXT: // %bb.3: // %pred.store.continue
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbnz w11, #0, .LBB1_7
; CHECK-NEXT: .LBB1_4: // %pred.store.continue6
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbnz w11, #0, .LBB1_8
; CHECK-NEXT: .LBB1_5: // %pred.store.continue8
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB1_1
; CHECK-NEXT: b .LBB1_9
; CHECK-NEXT: .LBB1_6: // %pred.store.if
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: stur w10, [x8, #-8]
; CHECK-NEXT: umov w11, v4.h[1]
; CHECK-NEXT: tbz w11, #0, .LBB1_4
; CHECK-NEXT: .LBB1_7: // %pred.store.if5
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: stur w10, [x8, #-4]
; CHECK-NEXT: umov w11, v4.h[2]
; CHECK-NEXT: tbz w11, #0, .LBB1_5
; CHECK-NEXT: .LBB1_8: // %pred.store.if7
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: str w10, [x8]
; CHECK-NEXT: umov w11, v4.h[3]
; CHECK-NEXT: tbz w11, #0, .LBB1_1
; CHECK-NEXT: .LBB1_9: // %pred.store.if9
; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: str w10, [x8, #4]
; CHECK-NEXT: b .LBB1_1
; CHECK-NEXT: .LBB1_10: // %for.cond.cleanup
; CHECK-NEXT: ret
entry:
br label %vector.body

vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
%vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
%0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
%1 = extractelement <4 x i1> %0, i64 0
br i1 %1, label %pred.store.if, label %pred.store.continue

pred.store.if:
%2 = getelementptr inbounds i32, ptr %dest, i64 %index
store i32 1, ptr %2, align 4
br label %pred.store.continue

pred.store.continue:
%3 = extractelement <4 x i1> %0, i64 1
br i1 %3, label %pred.store.if5, label %pred.store.continue6

pred.store.if5:
%4 = or disjoint i64 %index, 1
%5 = getelementptr inbounds i32, ptr %dest, i64 %4
store i32 1, ptr %5, align 4
br label %pred.store.continue6

pred.store.continue6:
%6 = extractelement <4 x i1> %0, i64 2
br i1 %6, label %pred.store.if7, label %pred.store.continue8

pred.store.if7:
%7 = or disjoint i64 %index, 2
%8 = getelementptr inbounds i32, ptr %dest, i64 %7
store i32 1, ptr %8, align 4
br label %pred.store.continue8

pred.store.continue8:
%9 = extractelement <4 x i1> %0, i64 3
br i1 %9, label %pred.store.if9, label %pred.store.continue18

pred.store.if9:
%10 = or disjoint i64 %index, 3
%11 = getelementptr inbounds i32, ptr %dest, i64 %10
store i32 1, ptr %11, align 4
br label %pred.store.continue18

pred.store.continue18:
%index.next = add i64 %index, 4
%vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
%24 = icmp eq i64 %index.next, 16
br i1 %24, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
ret void
}