Skip to content

Commit 7ff5770

Browse files
committed
[SLP] allow forming 2-way reduction patterns
We have a vector compare reduction problem seen in PR39665 comment 2:
https://bugs.llvm.org/show_bug.cgi?id=39665#c2

Or slightly reduced here:

  define i1 @Cmp2(<2 x double> %a0) {
    %a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
    %b = extractelement <2 x i1> %a, i32 0
    %c = extractelement <2 x i1> %a, i32 1
    %d = and i1 %b, %c
    ret i1 %d
  }

SLP would not attempt to turn this into a vector reduction because there is an artificial lower limit on that transform. We cannot completely remove that limit without inducing regressions, though, so this patch just hacks an extra attempt at creating a 2-way reduction onto the end of the analysis.

As shown in the test file, we are still not getting some of the motivating cases, so follow-on patches will be needed to solve those cases.

Differential Revision: https://reviews.llvm.org/D59710
1 parent 55b4451 commit 7ff5770

File tree

4 files changed

+43
-20
lines changed

4 files changed

+43
-20
lines changed

llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,12 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
113113

114114
/// Try to find horizontal reduction or otherwise vectorize a chain of binary
115115
/// operators.
116+
/// \p Try2WayRdx specializes the analysis to only attempt a 2-element
117+
/// reduction.
116118
bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
117119
slpvectorizer::BoUpSLP &R,
118-
TargetTransformInfo *TTI);
120+
TargetTransformInfo *TTI,
121+
bool Try2WayRdx = false);
119122

120123
/// Try to vectorize trees that start at insertvalue instructions.
121124
bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6397,19 +6397,22 @@ class HorizontalReduction {
63976397

63986398
/// Attempt to vectorize the tree found by
63996399
/// matchAssociativeReduction.
6400-
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
6400+
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) {
64016401
if (ReducedVals.empty())
64026402
return false;
64036403

64046404
// If there is a sufficient number of reduction values, reduce
64056405
// to a nearby power-of-2. Can safely generate oversized
64066406
// vectors and rely on the backend to split them to legal sizes.
64076407
unsigned NumReducedVals = ReducedVals.size();
6408-
if (NumReducedVals < 4)
6408+
if (Try2WayRdx && NumReducedVals != 2)
6409+
return false;
6410+
unsigned MinRdxVals = Try2WayRdx ? 2 : 4;
6411+
if (NumReducedVals < MinRdxVals)
64096412
return false;
64106413

64116414
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
6412-
6415+
unsigned MinRdxWidth = Log2_32(MinRdxVals);
64136416
Value *VectorizedTree = nullptr;
64146417

64156418
// FIXME: Fast-math-flags should be set based on the instructions in the
@@ -6433,7 +6436,7 @@ class HorizontalReduction {
64336436
SmallVector<Value *, 16> IgnoreList;
64346437
for (auto &V : ReductionOps)
64356438
IgnoreList.append(V.begin(), V.end());
6436-
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
6439+
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) {
64376440
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
64386441
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
64396442
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
@@ -6759,7 +6762,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
67596762
/// performed.
67606763
static bool tryToVectorizeHorReductionOrInstOperands(
67616764
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
6762-
TargetTransformInfo *TTI,
6765+
TargetTransformInfo *TTI, bool Try2WayRdx,
67636766
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
67646767
if (!ShouldVectorizeHor)
67656768
return false;
@@ -6790,7 +6793,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
67906793
if (BI || SI) {
67916794
HorizontalReduction HorRdx;
67926795
if (HorRdx.matchAssociativeReduction(P, Inst)) {
6793-
if (HorRdx.tryToReduce(R, TTI)) {
6796+
if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) {
67946797
Res = true;
67956798
// Set P to nullptr to avoid re-analysis of phi node in
67966799
// matchAssociativeReduction function unless this is the root node.
@@ -6833,7 +6836,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
68336836

68346837
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
68356838
BasicBlock *BB, BoUpSLP &R,
6836-
TargetTransformInfo *TTI) {
6839+
TargetTransformInfo *TTI,
6840+
bool Try2WayRdx) {
68376841
if (!V)
68386842
return false;
68396843
auto *I = dyn_cast<Instruction>(V);
@@ -6846,7 +6850,7 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
68466850
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
68476851
return tryToVectorize(I, R);
68486852
};
6849-
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
6853+
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx,
68506854
ExtraVectorization);
68516855
}
68526856

@@ -7042,6 +7046,23 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
70427046
PostProcessInstructions.push_back(&*it);
70437047
}
70447048

7049+
// Make a final attempt to match a 2-way reduction if nothing else worked.
7050+
// We do not try this above because it may interfere with other vectorization
7051+
// attempts.
7052+
// TODO: The constraints are copied from the above call to
7053+
// vectorizeRootInstruction(), but that might be too restrictive?
7054+
BasicBlock::iterator LastInst = --BB->end();
7055+
if (!Changed && LastInst->use_empty() &&
7056+
(LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) ||
7057+
isa<InvokeInst>(LastInst))) {
7058+
if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) {
7059+
for (auto *V : LastInst->operand_values()) {
7060+
Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI,
7061+
/* Try2WayRdx */ true);
7062+
}
7063+
}
7064+
}
7065+
70457066
return Changed;
70467067
}
70477068

llvm/test/Feature/weak_constant.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; RUN: opt < %s -O3 -S > %t
2-
; RUN: grep undef %t | count 1
2+
; RUN: grep undef %t | count 2
33
; RUN: grep 5 %t | count 1
44
; RUN: grep 7 %t | count 1
55
; RUN: grep 9 %t | count 1

llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ define double @foo(double* nocapture %D) {
5454
define i1 @two_wide_fcmp_reduction(<2 x double> %a0) {
5555
; CHECK-LABEL: @two_wide_fcmp_reduction(
5656
; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00>
57-
; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0
58-
; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1
59-
; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]]
60-
; CHECK-NEXT: ret i1 [[D]]
57+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
58+
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]]
59+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
60+
; CHECK-NEXT: ret i1 [[TMP1]]
6161
;
6262
%a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
6363
%b = extractelement <2 x i1> %a, i32 0
@@ -96,12 +96,11 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) {
9696
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
9797
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
9898
; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
99-
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
100-
; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
101-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
102-
; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
103-
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
104-
; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
99+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
100+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
101+
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]]
102+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
103+
; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
105104
; CHECK: lor.lhs.false:
106105
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
107106
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0

0 commit comments

Comments
 (0)