Skip to content

Commit 8a0aa53

Browse files
committed
Temporarily Revert "Temporarily Revert "[SLP] allow forming 2-way reduction patterns""
as there were testcase changes made after that commit which also need to be reverted. This reverts commit cd8748a.
1 parent a0841df commit 8a0aa53

File tree

4 files changed

+43
-20
lines changed

4 files changed

+43
-20
lines changed

llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,12 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
114114

115115
/// Try to find horizontal reduction or otherwise vectorize a chain of binary
116116
/// operators.
117+
/// \p Try2WayRdx specializes the analysis to only attempt a 2-element
118+
/// reduction.
117119
bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
118120
slpvectorizer::BoUpSLP &R,
119-
TargetTransformInfo *TTI);
121+
TargetTransformInfo *TTI,
122+
bool Try2WayRdx = false);
120123

121124
/// Try to vectorize trees that start at insertvalue instructions.
122125
bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6653,19 +6653,22 @@ class HorizontalReduction {
66536653

66546654
/// Attempt to vectorize the tree found by
66556655
/// matchAssociativeReduction.
6656-
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
6656+
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) {
66576657
if (ReducedVals.empty())
66586658
return false;
66596659

66606660
// If there is a sufficient number of reduction values, reduce
66616661
// to a nearby power-of-2. Can safely generate oversized
66626662
// vectors and rely on the backend to split them to legal sizes.
66636663
unsigned NumReducedVals = ReducedVals.size();
6664-
if (NumReducedVals < 4)
6664+
if (Try2WayRdx && NumReducedVals != 2)
6665+
return false;
6666+
unsigned MinRdxVals = Try2WayRdx ? 2 : 4;
6667+
if (NumReducedVals < MinRdxVals)
66656668
return false;
66666669

66676670
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
6668-
6671+
unsigned MinRdxWidth = Log2_32(MinRdxVals);
66696672
Value *VectorizedTree = nullptr;
66706673

66716674
// FIXME: Fast-math-flags should be set based on the instructions in the
@@ -6701,7 +6704,7 @@ class HorizontalReduction {
67016704
SmallVector<Value *, 16> IgnoreList;
67026705
for (auto &V : ReductionOps)
67036706
IgnoreList.append(V.begin(), V.end());
6704-
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
6707+
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) {
67056708
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
67066709
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
67076710
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
@@ -7045,7 +7048,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
70457048
/// performed.
70467049
static bool tryToVectorizeHorReductionOrInstOperands(
70477050
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
7048-
TargetTransformInfo *TTI,
7051+
TargetTransformInfo *TTI, bool Try2WayRdx,
70497052
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
70507053
if (!ShouldVectorizeHor)
70517054
return false;
@@ -7076,7 +7079,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
70767079
if (BI || SI) {
70777080
HorizontalReduction HorRdx;
70787081
if (HorRdx.matchAssociativeReduction(P, Inst)) {
7079-
if (HorRdx.tryToReduce(R, TTI)) {
7082+
if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) {
70807083
Res = true;
70817084
// Set P to nullptr to avoid re-analysis of phi node in
70827085
// matchAssociativeReduction function unless this is the root node.
@@ -7119,7 +7122,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
71197122

71207123
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
71217124
BasicBlock *BB, BoUpSLP &R,
7122-
TargetTransformInfo *TTI) {
7125+
TargetTransformInfo *TTI,
7126+
bool Try2WayRdx) {
71237127
if (!V)
71247128
return false;
71257129
auto *I = dyn_cast<Instruction>(V);
@@ -7132,7 +7136,7 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
71327136
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
71337137
return tryToVectorize(I, R);
71347138
};
7135-
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
7139+
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx,
71367140
ExtraVectorization);
71377141
}
71387142

@@ -7328,6 +7332,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
73287332
PostProcessInstructions.push_back(&*it);
73297333
}
73307334

7335+
// Make a final attempt to match a 2-way reduction if nothing else worked.
7336+
// We do not try this above because it may interfere with other vectorization
7337+
// attempts.
7338+
// TODO: The constraints are copied from the above call to
7339+
// vectorizeRootInstruction(), but that might be too restrictive?
7340+
BasicBlock::iterator LastInst = --BB->end();
7341+
if (!Changed && LastInst->use_empty() &&
7342+
(LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) ||
7343+
isa<InvokeInst>(LastInst))) {
7344+
if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) {
7345+
for (auto *V : LastInst->operand_values()) {
7346+
Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI,
7347+
/* Try2WayRdx */ true);
7348+
}
7349+
}
7350+
}
7351+
73317352
return Changed;
73327353
}
73337354

llvm/test/Feature/weak_constant.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; RUN: opt < %s -O3 -S > %t
2-
; RUN: grep undef %t | count 1
2+
; RUN: grep undef %t | count 2
33
; RUN: grep 5 %t | count 1
44
; RUN: grep 7 %t | count 1
55
; RUN: grep 9 %t | count 1

llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ define double @foo(double* nocapture %D) {
5454
define i1 @two_wide_fcmp_reduction(<2 x double> %a0) {
5555
; CHECK-LABEL: @two_wide_fcmp_reduction(
5656
; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00>
57-
; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0
58-
; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1
59-
; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]]
60-
; CHECK-NEXT: ret i1 [[D]]
57+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
58+
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]]
59+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
60+
; CHECK-NEXT: ret i1 [[TMP1]]
6161
;
6262
%a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
6363
%b = extractelement <2 x i1> %a, i32 0
@@ -96,12 +96,11 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) {
9696
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
9797
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
9898
; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
99-
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
100-
; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
101-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
102-
; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
103-
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
104-
; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
99+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
100+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
101+
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]]
102+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
103+
; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
105104
; CHECK: lor.lhs.false:
106105
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
107106
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0

0 commit comments

Comments
 (0)