Skip to content

Commit 714aaba

Browse files
committed
Temporarily Revert "[SLP] allow forming 2-way reduction patterns" and update testcases.
After speaking with Sanjay - seeing a number of miscompiles and working on tracking down a testcase. None of the follow on patches seem to have helped so far. This reverts commit 8a0aa53.
1 parent 8a0aa53 commit 714aaba

File tree

5 files changed

+29
-53
lines changed

5 files changed

+29
-53
lines changed

llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,12 +114,9 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
114114

115115
/// Try to find horizontal reduction or otherwise vectorize a chain of binary
116116
/// operators.
117-
/// \p Try2WayRdx specializes the analysis to only attempt a 2-element
118-
/// reduction.
119117
bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
120118
slpvectorizer::BoUpSLP &R,
121-
TargetTransformInfo *TTI,
122-
bool Try2WayRdx = false);
119+
TargetTransformInfo *TTI);
123120

124121
/// Try to vectorize trees that start at insertvalue instructions.
125122
bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 8 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6653,22 +6653,19 @@ class HorizontalReduction {
66536653

66546654
/// Attempt to vectorize the tree found by
66556655
/// matchAssociativeReduction.
6656-
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) {
6656+
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
66576657
if (ReducedVals.empty())
66586658
return false;
66596659

66606660
// If there is a sufficient number of reduction values, reduce
66616661
// to a nearby power-of-2. Can safely generate oversized
66626662
// vectors and rely on the backend to split them to legal sizes.
66636663
unsigned NumReducedVals = ReducedVals.size();
6664-
if (Try2WayRdx && NumReducedVals != 2)
6665-
return false;
6666-
unsigned MinRdxVals = Try2WayRdx ? 2 : 4;
6667-
if (NumReducedVals < MinRdxVals)
6664+
if (NumReducedVals < 4)
66686665
return false;
66696666

66706667
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
6671-
unsigned MinRdxWidth = Log2_32(MinRdxVals);
6668+
66726669
Value *VectorizedTree = nullptr;
66736670

66746671
// FIXME: Fast-math-flags should be set based on the instructions in the
@@ -6704,7 +6701,7 @@ class HorizontalReduction {
67046701
SmallVector<Value *, 16> IgnoreList;
67056702
for (auto &V : ReductionOps)
67066703
IgnoreList.append(V.begin(), V.end());
6707-
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) {
6704+
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
67086705
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
67096706
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
67106707
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
@@ -7048,7 +7045,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
70487045
/// performed.
70497046
static bool tryToVectorizeHorReductionOrInstOperands(
70507047
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
7051-
TargetTransformInfo *TTI, bool Try2WayRdx,
7048+
TargetTransformInfo *TTI,
70527049
const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
70537050
if (!ShouldVectorizeHor)
70547051
return false;
@@ -7079,7 +7076,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
70797076
if (BI || SI) {
70807077
HorizontalReduction HorRdx;
70817078
if (HorRdx.matchAssociativeReduction(P, Inst)) {
7082-
if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) {
7079+
if (HorRdx.tryToReduce(R, TTI)) {
70837080
Res = true;
70847081
// Set P to nullptr to avoid re-analysis of phi node in
70857082
// matchAssociativeReduction function unless this is the root node.
@@ -7122,8 +7119,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
71227119

71237120
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
71247121
BasicBlock *BB, BoUpSLP &R,
7125-
TargetTransformInfo *TTI,
7126-
bool Try2WayRdx) {
7122+
TargetTransformInfo *TTI) {
71277123
if (!V)
71287124
return false;
71297125
auto *I = dyn_cast<Instruction>(V);
@@ -7136,7 +7132,7 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
71367132
auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
71377133
return tryToVectorize(I, R);
71387134
};
7139-
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx,
7135+
return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
71407136
ExtraVectorization);
71417137
}
71427138

@@ -7332,23 +7328,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
73327328
PostProcessInstructions.push_back(&*it);
73337329
}
73347330

7335-
// Make a final attempt to match a 2-way reduction if nothing else worked.
7336-
// We do not try this above because it may interfere with other vectorization
7337-
// attempts.
7338-
// TODO: The constraints are copied from the above call to
7339-
// vectorizeRootInstruction(), but that might be too restrictive?
7340-
BasicBlock::iterator LastInst = --BB->end();
7341-
if (!Changed && LastInst->use_empty() &&
7342-
(LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) ||
7343-
isa<InvokeInst>(LastInst))) {
7344-
if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) {
7345-
for (auto *V : LastInst->operand_values()) {
7346-
Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI,
7347-
/* Try2WayRdx */ true);
7348-
}
7349-
}
7350-
}
7351-
73527331
return Changed;
73537332
}
73547333

llvm/test/Feature/weak_constant.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; RUN: opt < %s -O3 -S > %t
2-
; RUN: grep undef %t | count 2
2+
; RUN: grep undef %t | count 1
33
; RUN: grep 5 %t | count 1
44
; RUN: grep 7 %t | count 1
55
; RUN: grep 9 %t | count 1

llvm/test/Transforms/SLPVectorizer/X86/reduction.ll

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,15 @@ define i32 @horiz_max_multiple_uses([32 x i32]* %x, i32* %p) {
129129
define i1 @bad_insertpoint_rdx([8 x i32]* %p) #0 {
130130
; CHECK-LABEL: @bad_insertpoint_rdx(
131131
; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[P:%.*]], i64 0, i64 0
132-
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX22]] to <2 x i32>*
133-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16
134-
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
135-
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF]]
136-
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF]]
137-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0
138-
; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], 0
139-
; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 0
140-
; CHECK-NEXT: [[SPEC_STORE_SELECT87:%.*]] = zext i1 [[TMP4]] to i32
141-
; CHECK-NEXT: [[CMP23_2:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT87]], [[OP_EXTRA]]
132+
; CHECK-NEXT: [[T0:%.*]] = load i32, i32* [[ARRAYIDX22]], align 16
133+
; CHECK-NEXT: [[CMP23:%.*]] = icmp sgt i32 [[T0]], 0
134+
; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP23]], i32 [[T0]], i32 0
135+
; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[P]], i64 0, i64 1
136+
; CHECK-NEXT: [[T1:%.*]] = load i32, i32* [[ARRAYIDX22_1]], align 4
137+
; CHECK-NEXT: [[CMP23_1:%.*]] = icmp sgt i32 [[T1]], [[SPEC_SELECT]]
138+
; CHECK-NEXT: [[SPEC_STORE_SELECT87:%.*]] = zext i1 [[CMP23_1]] to i32
139+
; CHECK-NEXT: [[SPEC_SELECT88:%.*]] = select i1 [[CMP23_1]], i32 [[T1]], i32 [[SPEC_SELECT]]
140+
; CHECK-NEXT: [[CMP23_2:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT87]], [[SPEC_SELECT88]]
142141
; CHECK-NEXT: ret i1 [[CMP23_2]]
143142
;
144143
%arrayidx22 = getelementptr inbounds [8 x i32], [8 x i32]* %p, i64 0, i64 0

llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ define double @foo(double* nocapture %D) {
5454
define i1 @two_wide_fcmp_reduction(<2 x double> %a0) {
5555
; CHECK-LABEL: @two_wide_fcmp_reduction(
5656
; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00>
57-
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
58-
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]]
59-
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
60-
; CHECK-NEXT: ret i1 [[TMP1]]
57+
; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0
58+
; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1
59+
; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]]
60+
; CHECK-NEXT: ret i1 [[D]]
6161
;
6262
%a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0>
6363
%b = extractelement <2 x i1> %a, i32 0
@@ -96,11 +96,12 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) {
9696
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0
9797
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1
9898
; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
99-
; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D>
100-
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef>
101-
; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]]
102-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0
103-
; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
99+
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
100+
; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D
101+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
102+
; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D
103+
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]]
104+
; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
104105
; CHECK: lor.lhs.false:
105106
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00>
106107
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0

0 commit comments

Comments
 (0)