Skip to content

Commit ad9909d

Browse files
[SLP]Fix perfect diamond match with extractelements in scalars
All previous estimations/vectorizations need to be dropped when a perfect diamond match is found. This improves both cost estimation and code emission. Also, the getScalarizationOverhead cost needs to be adjusted for a non-poison input vector. Currently, that cost cannot be estimated correctly, so a conservative element-by-element insertelement cost is used instead for each unique scalar. Reviewers: RKSimon, hiraditya. Reviewed By: RKSimon. Pull Request: #132466
1 parent 03d8529 commit ad9909d

File tree

3 files changed

+69
-62
lines changed

3 files changed

+69
-62
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5310,12 +5310,11 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
53105310
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
53115311
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
53125312
/// instead of a scalar.
5313-
static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
5314-
Type *ScalarTy, VectorType *Ty,
5315-
const APInt &DemandedElts,
5316-
bool Insert, bool Extract,
5317-
TTI::TargetCostKind CostKind,
5318-
ArrayRef<Value *> VL = {}) {
5313+
static InstructionCost
5314+
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
5315+
VectorType *Ty, const APInt &DemandedElts, bool Insert,
5316+
bool Extract, TTI::TargetCostKind CostKind,
5317+
bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
53195318
assert(!isa<ScalableVectorType>(Ty) &&
53205319
"ScalableVectorType is not supported.");
53215320
assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
@@ -5339,8 +5338,26 @@ static InstructionCost getScalarizationOverhead(const TargetTransformInfo &TTI,
53395338
}
53405339
return Cost;
53415340
}
5342-
return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
5343-
CostKind, VL);
5341+
APInt NewDemandedElts = DemandedElts;
5342+
InstructionCost Cost = 0;
5343+
if (!ForPoisonSrc && Insert) {
5344+
// Handle insert into non-poison vector.
5345+
// TODO: Need to teach getScalarizationOverhead about insert elements into
5346+
// non-poison input vector to better handle such cases. Currently, it is
5347+
// very conservative and may "pessimize" the vectorization.
5348+
for (unsigned I : seq(DemandedElts.getBitWidth())) {
5349+
if (!DemandedElts[I])
5350+
continue;
5351+
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, CostKind,
5352+
I, Constant::getNullValue(Ty),
5353+
VL.empty() ? nullptr : VL[I]);
5354+
}
5355+
NewDemandedElts.clearAllBits();
5356+
} else if (!NewDemandedElts.isZero()) {
5357+
Cost += TTI.getScalarizationOverhead(Ty, NewDemandedElts, Insert, Extract,
5358+
CostKind, VL);
5359+
}
5360+
return Cost;
53445361
}
53455362

53465363
/// Correctly creates insert_subvector, checking that the index is multiple of
@@ -11684,6 +11701,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1168411701
// No need to delay the cost estimation during analysis.
1168511702
return std::nullopt;
1168611703
}
11704+
/// Reset the cost estimator to handle a perfect diamond match.
/// Drops everything accumulated so far (mask, input vectors, cost and the
/// recorded vectorized values) so that the caller can re-add the single
/// fully matched entry and finalize again from a clean state.
11705+
void resetForSameNode() {
11706+
IsFinalized = false; // allow finalize() to be invoked again after the reset
11707+
CommonMask.clear(); // forget the shuffle mask built by earlier add() calls
11708+
InVectors.clear(); // forget previously registered input vectors
11709+
Cost = 0; // discard the cost estimated before the match was found
11710+
VectorizedVals.clear();
11711+
SameNodesEstimated = true;
11712+
}
1168711713
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
1168811714
if (&E1 == &E2) {
1168911715
assert(all_of(Mask,
@@ -14890,15 +14916,18 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1489014916
ShuffledElements.setBit(I);
1489114917
ShuffleMask[I] = Res.first->second;
1489214918
}
14893-
if (!DemandedElements.isZero())
14894-
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
14895-
/*Insert=*/true,
14896-
/*Extract=*/false, CostKind, VL);
14897-
if (ForPoisonSrc)
14919+
if (ForPoisonSrc) {
1489814920
Cost = getScalarizationOverhead(*TTI, ScalarTy, VecTy,
1489914921
/*DemandedElts*/ ~ShuffledElements,
1490014922
/*Insert*/ true,
14901-
/*Extract*/ false, CostKind, VL);
14923+
/*Extract*/ false, CostKind,
14924+
/*ForPoisonSrc=*/true, VL);
14925+
} else if (!DemandedElements.isZero()) {
14926+
Cost += getScalarizationOverhead(*TTI, ScalarTy, VecTy, DemandedElements,
14927+
/*Insert=*/true,
14928+
/*Extract=*/false, CostKind,
14929+
/*ForPoisonSrc=*/false, VL);
14930+
}
1490214931
if (DuplicateNonConst)
1490314932
Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
1490414933
VecTy, ShuffleMask);
@@ -15556,6 +15585,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
1555615585
PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
1555715586
MaybeAlign());
1555815587
}
15588+
/// Reset the instruction builder to handle a perfect diamond match.
/// Drops the mask and input vectors accumulated so far so that the caller
/// can re-add the single fully matched entry and finalize again.
15589+
void resetForSameNode() {
15590+
IsFinalized = false; // allow finalize() to be invoked again after the reset
15591+
CommonMask.clear(); // forget the shuffle mask built by earlier add() calls
15592+
InVectors.clear(); // forget previously registered input vectors
15593+
}
1555915594
/// Adds 2 input vectors (in form of tree entries) and the mask for their
1556015595
/// shuffling.
1556115596
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
@@ -16111,6 +16146,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
1611116146
Mask[I] = FrontTE->findLaneForValue(V);
1611216147
}
1611316148
}
16149+
// Reset the builder(s) to correctly handle perfect diamond matched
16150+
// nodes.
16151+
ShuffleBuilder.resetForSameNode();
1611416152
ShuffleBuilder.add(*FrontTE, Mask);
1611516153
// Full matched entry found, no need to insert subvectors.
1611616154
Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});

llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,15 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) {
1010
; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8
1111
; CHECK-NEXT: [[I4277:%.*]] = load double, ptr [[IE]], align 8
1212
; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8
13-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> zeroinitializer
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I4238]], i32 0
15-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I4252]], i32 1
16-
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]]
17-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
18-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[I4264]], i32 0
19-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[I4277]], i32 1
20-
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP7]]
21-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
22-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
23-
; CHECK-NEXT: [[I44281:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
24-
; CHECK-NEXT: ret <4 x double> [[I44281]]
13+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> <i32 0, i32 poison>
14+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
15+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
16+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[I4238]], i32 0
17+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[I4252]], i32 1
18+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[I4264]], i32 2
19+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[I4277]], i32 3
20+
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP7]]
21+
; CHECK-NEXT: ret <4 x double> [[TMP8]]
2522
;
2623
%i4238 = load double, ptr %ia, align 8
2724
%i4252 = load double, ptr %ib, align 8

llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -49,24 +49,10 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
4949
;
5050
; AVX512-LABEL: @reduce_and4(
5151
; AVX512-NEXT: entry:
52-
; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
53-
; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
54-
; AVX512-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
55-
; AVX512-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
56-
; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
57-
; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
58-
; AVX512-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
59-
; AVX512-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
60-
; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
61-
; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[VECEXT8]], i32 8
62-
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT7]], i32 9
63-
; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT10]], i32 10
64-
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT12]], i32 11
65-
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 12
66-
; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT]], i32 13
67-
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT2]], i32 14
68-
; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT4]], i32 15
69-
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP8]])
52+
; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
53+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
54+
; AVX512-NEXT: [[RDX_OP:%.*]] = and <8 x i32> [[TMP0]], [[TMP1]]
55+
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[RDX_OP]])
7056
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
7157
; AVX512-NEXT: ret i32 [[OP_RDX1]]
7258
;
@@ -144,24 +130,10 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
144130
; AVX2-NEXT: ret i32 [[OP_RDX]]
145131
;
146132
; AVX512-LABEL: @reduce_and4_transpose(
147-
; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
148-
; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
149-
; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
150-
; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
151-
; AVX512-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
152-
; AVX512-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
153-
; AVX512-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
154-
; AVX512-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
155-
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
156-
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT24]], i32 8
157-
; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT16]], i32 9
158-
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT8]], i32 10
159-
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 11
160-
; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT23]], i32 12
161-
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT15]], i32 13
162-
; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT7]], i32 14
163-
; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[VECEXT]], i32 15
164-
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP9]])
133+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
134+
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
135+
; AVX512-NEXT: [[RDX_OP:%.*]] = and <8 x i32> [[TMP1]], [[TMP2]]
136+
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[RDX_OP]])
165137
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
166138
; AVX512-NEXT: ret i32 [[OP_RDX1]]
167139
;

0 commit comments

Comments
 (0)