@@ -436,26 +436,6 @@ static SmallBitVector isUndefVector(const Value *V,
436
436
/// i32 6>
437
437
/// %2 = mul <4 x i8> %1, %1
438
438
/// ret <4 x i8> %2
439
- /// We convert this initially to something like:
440
- /// %x0 = extractelement <4 x i8> %x, i32 0
441
- /// %x3 = extractelement <4 x i8> %x, i32 3
442
- /// %y1 = extractelement <4 x i8> %y, i32 1
443
- /// %y2 = extractelement <4 x i8> %y, i32 2
444
- /// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
445
- /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
446
- /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
447
- /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
448
- /// %5 = mul <4 x i8> %4, %4
449
- /// %6 = extractelement <4 x i8> %5, i32 0
450
- /// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
451
- /// %7 = extractelement <4 x i8> %5, i32 1
452
- /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
453
- /// %8 = extractelement <4 x i8> %5, i32 2
454
- /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
455
- /// %9 = extractelement <4 x i8> %5, i32 3
456
- /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
457
- /// ret <4 x i8> %ins4
458
- /// InstCombiner transforms this into a shuffle and vector mul
459
439
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
460
440
/// TODO: Can we split off and reuse the shuffle mask detection from
461
441
/// ShuffleVectorInst/getShuffleCost?
@@ -7505,6 +7485,14 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7505
7485
}
7506
7486
return VecBase;
7507
7487
}
7488
+ /// Checks if the given entry needs to be delayed because of its
7489
+ /// dependency nodes. This estimator never delays: it returns std::nullopt.
7490
+ std::optional<InstructionCost>
7491
+ needToDelay(const TreeEntry *,
7492
+ ArrayRef<SmallVector<const TreeEntry *>>) const {
7493
+ // No need to delay the cost estimation during analysis.
7494
+ return std::nullopt;
7495
+ }
7508
7496
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
7509
7497
if (&E1 == &E2) {
7510
7498
assert(all_of(Mask,
@@ -7619,13 +7607,16 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7619
7607
if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
7620
7608
CommonMask[Idx] = Mask[Idx] + VF;
7621
7609
}
7622
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
7610
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
7611
+ Value *Root = nullptr) {
7623
7612
Cost += getBuildVectorCost(VL, Root);
7624
7613
if (!Root) {
7625
- assert(InVectors.empty() && "Unexpected input vectors for buildvector.");
7626
7614
// FIXME: Need to find a way to avoid use of getNullValue here.
7627
7615
SmallVector<Constant *> Vals;
7628
- for (Value *V : VL) {
7616
+ unsigned VF = VL.size();
7617
+ if (MaskVF != 0)
7618
+ VF = std::min(VF, MaskVF);
7619
+ for (Value *V : VL.take_front(VF)) {
7629
7620
if (isa<UndefValue>(V)) {
7630
7621
Vals.push_back(cast<Constant>(V));
7631
7622
continue;
@@ -7635,9 +7626,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7635
7626
return ConstantVector::get(Vals);
7636
7627
}
7637
7628
return ConstantVector::getSplat(
7638
- ElementCount::getFixed(VL.size()),
7629
+ ElementCount::getFixed(
7630
+ cast<FixedVectorType>(Root->getType())->getNumElements()),
7639
7631
getAllOnesValue(*R.DL, VL.front()->getType()));
7640
7632
}
7633
+ InstructionCost createFreeze(InstructionCost Cost) { return Cost; } // Returns Cost unchanged.
7641
7634
/// Finalize emission of the shuffles.
7642
7635
InstructionCost
7643
7636
finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
@@ -7659,8 +7652,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7659
7652
InVectors.front() = V;
7660
7653
}
7661
7654
::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
7662
- if (CommonMask.empty())
7655
+ if (CommonMask.empty()) {
7656
+ assert(InVectors.size() == 1 && "Expected only one vector with no mask");
7663
7657
return Cost;
7658
+ }
7664
7659
return Cost +
7665
7660
createShuffle(InVectors.front(),
7666
7661
InVectors.size() == 2 ? InVectors.back() : nullptr,
@@ -7737,189 +7732,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
7737
7732
return 0;
7738
7733
if (isa<InsertElementInst>(VL[0]))
7739
7734
return InstructionCost::getInvalid();
7740
- ShuffleCostEstimator Estimator(*TTI, VectorizedVals, *this,
7741
- CheckedExtracts);
7742
- unsigned VF = E->getVectorFactor();
7743
- SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
7744
- E->ReuseShuffleIndices.end());
7745
- SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
7746
- // Build a mask out of the reorder indices and reorder scalars per this
7747
- // mask.
7748
- SmallVector<int> ReorderMask;
7749
- inversePermutation(E->ReorderIndices, ReorderMask);
7750
- if (!ReorderMask.empty())
7751
- reorderScalars(GatheredScalars, ReorderMask);
7752
- SmallVector<int> Mask;
7753
- SmallVector<int> ExtractMask;
7754
- Value *ExtractVecBase = nullptr;
7755
- bool UseVecBaseAsInput = false;
7756
- SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
7757
- SmallVector<SmallVector<const TreeEntry *>> Entries;
7758
- SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
7759
- // Check for gathered extracts.
7760
- bool Resized = false;
7761
- unsigned NumParts = TTI->getNumberOfParts(VecTy);
7762
- if (NumParts == 0 || NumParts >= GatheredScalars.size())
7763
- NumParts = 1;
7764
- if (!all_of(GatheredScalars, UndefValue::classof)) {
7765
- ExtractShuffles =
7766
- tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
7767
- if (!ExtractShuffles.empty()) {
7768
- if (Value *VecBase = Estimator.adjustExtracts(
7769
- E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
7770
- if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
7771
- if (VF == VecBaseTy->getNumElements() &&
7772
- GatheredScalars.size() != VF) {
7773
- Resized = true;
7774
- GatheredScalars.append(VF - GatheredScalars.size(),
7775
- PoisonValue::get(ScalarTy));
7776
- }
7777
- }
7778
- }
7779
-
7780
- // Do not try to look for reshuffled loads for gathered loads (they will
7781
- // be handled later), for vectorized scalars, and cases, which are
7782
- // definitely not profitable (splats and small gather nodes.)
7783
- if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
7784
- E->isAltShuffle() ||
7785
- all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
7786
- isSplat(E->Scalars) ||
7787
- (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2))
7788
- GatherShuffles =
7789
- isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
7790
- }
7791
- if (!GatherShuffles.empty()) {
7792
- if (GatherShuffles.size() == 1 &&
7793
- *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
7794
- Entries.front().front()->isSame(E->Scalars)) {
7795
- // Perfect match in the graph, will reuse the previously vectorized
7796
- // node. Cost is 0.
7797
- LLVM_DEBUG(
7798
- dbgs()
7799
- << "SLP: perfect diamond match for gather bundle "
7800
- << shortBundleName(VL) << ".\n");
7801
- // Restore the mask for previous partially matched values.
7802
- Mask.resize(E->Scalars.size());
7803
- const TreeEntry *FrontTE = Entries.front().front();
7804
- if (FrontTE->ReorderIndices.empty() &&
7805
- ((FrontTE->ReuseShuffleIndices.empty() &&
7806
- E->Scalars.size() == FrontTE->Scalars.size()) ||
7807
- (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
7808
- std::iota(Mask.begin(), Mask.end(), 0);
7809
- } else {
7810
- for (auto [I, V] : enumerate(E->Scalars)) {
7811
- if (isa<PoisonValue>(V)) {
7812
- Mask[I] = PoisonMaskElem;
7813
- continue;
7814
- }
7815
- Mask[I] = FrontTE->findLaneForValue(V);
7816
- }
7817
- }
7818
- Estimator.add(*FrontTE, Mask);
7819
- return Estimator.finalize(E->getCommonMask());
7820
- }
7821
- if (!Resized) {
7822
- if (GatheredScalars.size() != VF &&
7823
- any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
7824
- return any_of(TEs, [&](const TreeEntry *TE) {
7825
- return TE->getVectorFactor() == VF;
7826
- });
7827
- }))
7828
- GatheredScalars.append(VF - GatheredScalars.size(),
7829
- PoisonValue::get(ScalarTy));
7830
- }
7831
- // Remove shuffled elements from list of gathers.
7832
- for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
7833
- if (Mask[I] != PoisonMaskElem)
7834
- GatheredScalars[I] = PoisonValue::get(ScalarTy);
7835
- }
7836
- LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
7837
- << " entries for bundle "
7838
- << shortBundleName(VL) << ".\n");
7839
- unsigned SliceSize = E->Scalars.size() / NumParts;
7840
- SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
7841
- for (const auto [I, TEs] : enumerate(Entries)) {
7842
- if (TEs.empty()) {
7843
- assert(!GatherShuffles[I] &&
7844
- "No shuffles with empty entries list expected.");
7845
- continue;
7846
- }
7847
- assert((TEs.size() == 1 || TEs.size() == 2) &&
7848
- "Expected shuffle of 1 or 2 entries.");
7849
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
7850
- VecMask.assign(VecMask.size(), PoisonMaskElem);
7851
- copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
7852
- Estimator.add(*TEs.front(), *TEs.back(), VecMask);
7853
- }
7854
- if (all_of(GatheredScalars, PoisonValue ::classof))
7855
- return Estimator.finalize(E->ReuseShuffleIndices);
7856
- return Estimator.finalize(
7857
- E->ReuseShuffleIndices, E->Scalars.size(),
7858
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
7859
- Vec = Estimator.gather(GatheredScalars,
7860
- Constant::getNullValue(FixedVectorType::get(
7861
- ScalarTy, GatheredScalars.size())));
7862
- });
7863
- }
7864
- if (!ExtractShuffles.empty()) {
7865
- Value *Vec1 = nullptr;
7866
- // Gather of extractelements can be represented as just a shuffle of
7867
- // a single/two vectors the scalars are extracted from.
7868
- // Find input vectors.
7869
- Value *Vec2 = nullptr;
7870
- for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
7871
- if (!Mask.empty() && Mask[I] != PoisonMaskElem)
7872
- ExtractMask[I] = PoisonMaskElem;
7873
- }
7874
- if (UseVecBaseAsInput) {
7875
- Vec1 = ExtractVecBase;
7876
- } else {
7877
- for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
7878
- if (ExtractMask[I] == PoisonMaskElem)
7879
- continue;
7880
- if (isa<UndefValue>(E->Scalars[I]))
7881
- continue;
7882
- auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
7883
- Value *VecOp = EI->getVectorOperand();
7884
- if (const auto *TE = getTreeEntry(VecOp))
7885
- if (TE->VectorizedValue)
7886
- VecOp = TE->VectorizedValue;
7887
- if (!Vec1) {
7888
- Vec1 = VecOp;
7889
- } else if (Vec1 != EI->getVectorOperand()) {
7890
- assert((!Vec2 || Vec2 == EI->getVectorOperand()) &&
7891
- "Expected only 1 or 2 vectors shuffle.");
7892
- Vec2 = VecOp;
7893
- }
7894
- }
7895
- }
7896
- if (Vec2) {
7897
- Estimator.add(Vec1, Vec2, ExtractMask);
7898
- } else if (Vec1) {
7899
- Estimator.add(Vec1, ExtractMask, /*ForExtracts=*/true);
7900
- } else {
7901
- Estimator.add(PoisonValue::get(FixedVectorType::get(
7902
- ScalarTy, GatheredScalars.size())),
7903
- ExtractMask, /*ForExtracts=*/true);
7904
- }
7905
- }
7906
- if (!all_of(GatheredScalars, PoisonValue::classof)) {
7907
- auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size());
7908
- bool SameGathers = VL.equals(Gathers);
7909
- if (!SameGathers)
7910
- return Estimator.finalize(
7911
- E->ReuseShuffleIndices, E->Scalars.size(),
7912
- [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
7913
- Vec = Estimator.gather(
7914
- GatheredScalars, Constant::getNullValue(FixedVectorType::get(
7915
- ScalarTy, GatheredScalars.size())));
7916
- });
7917
- Value *BV = Estimator.gather(Gathers);
7918
- SmallVector<int> ReuseMask(Gathers.size(), PoisonMaskElem);
7919
- std::iota(ReuseMask.begin(), ReuseMask.end(), 0);
7920
- Estimator.add(BV, ReuseMask);
7921
- }
7922
- return Estimator.finalize(E->ReuseShuffleIndices);
7735
+ return processBuildVector<ShuffleCostEstimator, InstructionCost>(
7736
+ E, *TTI, VectorizedVals, *this, CheckedExtracts);
7923
7737
}
7924
7738
InstructionCost CommonCost = 0;
7925
7739
SmallVector<int> Mask;
@@ -10337,6 +10151,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10337
10151
10338
10152
/// Adjusts extractelements after reusing them.
10339
10153
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10154
+ ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10340
10155
unsigned NumParts, bool &UseVecBaseAsInput) {
10341
10156
UseVecBaseAsInput = false;
10342
10157
SmallPtrSet<Value *, 4> UniqueBases;
@@ -10441,14 +10256,15 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10441
10256
}
10442
10257
/// Checks if the specified entry \p E needs to be delayed because of its
10443
10258
/// dependency nodes.
10444
- Value *needToDelay(const TreeEntry *E,
10445
- ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
10259
+ std::optional<Value *>
10260
+ needToDelay(const TreeEntry *E,
10261
+ ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
10446
10262
// No need to delay emission if all deps are ready.
10447
10263
if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
10448
10264
return all_of(
10449
10265
TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
10450
10266
}))
10451
- return nullptr ;
10267
+ return std::nullopt ;
10452
10268
// Postpone gather emission, will be emitted after the end of the
10453
10269
// process to keep correct order.
10454
10270
auto *VecTy = FixedVectorType::get(E->Scalars.front()->getType(),
@@ -10558,7 +10374,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
10558
10374
inversePermutation(Order, NewMask);
10559
10375
add(V1, NewMask);
10560
10376
}
10561
- Value *gather(ArrayRef<Value *> VL, Value *Root = nullptr) {
10377
+ Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, // MaskVF is not read below; only VL and Root reach R.gather.
10378
+ Value *Root = nullptr) {
10562
10379
return R.gather(VL, Root);
10563
10380
}
10564
10381
Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -10819,15 +10636,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
10819
10636
cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
10820
10637
ExtractEntries.push_back(TE);
10821
10638
}
10822
- if (Value *Delayed = ShuffleBuilder.needToDelay(E, ExtractEntries)) {
10639
+ if (std::optional<ResTy> Delayed =
10640
+ ShuffleBuilder.needToDelay(E, ExtractEntries)) {
10823
10641
// Delay emission of gathers which are not ready yet.
10824
10642
PostponedGathers.insert(E);
10825
10643
// Postpone gather emission, will be emitted after the end of the
10826
10644
// process to keep correct order.
10827
- return Delayed;
10645
+ return * Delayed;
10828
10646
}
10829
10647
if (Value *VecBase = ShuffleBuilder.adjustExtracts(
10830
- E, ExtractMask, NumParts, UseVecBaseAsInput)) {
10648
+ E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
10831
10649
ExtractVecBase = VecBase;
10832
10650
if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
10833
10651
if (VF == VecBaseTy->getNumElements() &&
@@ -10848,12 +10666,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
10848
10666
isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
10849
10667
}
10850
10668
if (!GatherShuffles.empty()) {
10851
- if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) {
10669
+ if (std::optional<ResTy> Delayed =
10670
+ ShuffleBuilder.needToDelay(E, Entries)) {
10852
10671
// Delay emission of gathers which are not ready yet.
10853
10672
PostponedGathers.insert(E);
10854
10673
// Postpone gather emission, will be emitted after the end of the
10855
10674
// process to keep correct order.
10856
- return Delayed;
10675
+ return * Delayed;
10857
10676
}
10858
10677
if (GatherShuffles.size() == 1 &&
10859
10678
*GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
@@ -11062,14 +10881,16 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11062
10881
IsUsedInExpr &=
11063
10882
FindReusedSplat(VecMask, TEs.front()->getVectorFactor());
11064
10883
ShuffleBuilder.add(*TEs.front(), VecMask);
11065
- IsNonPoisoned &=
11066
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
10884
+ if (TEs.front()->VectorizedValue)
10885
+ IsNonPoisoned &=
10886
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
11067
10887
} else {
11068
10888
IsUsedInExpr = false;
11069
10889
ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
11070
- IsNonPoisoned &=
11071
- isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
11072
- isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
10890
+ if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
10891
+ IsNonPoisoned &=
10892
+ isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
10893
+ isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
11073
10894
}
11074
10895
}
11075
10896
}
@@ -11128,7 +10949,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11128
10949
if (!all_of(GatheredScalars, PoisonValue::classof)) {
11129
10950
SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
11130
10951
TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
11131
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
10952
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size() );
11132
10953
ShuffleBuilder.add(BV, BVMask);
11133
10954
}
11134
10955
if (all_of(NonConstants, [=](Value *V) {
@@ -11142,13 +10963,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11142
10963
E->ReuseShuffleIndices, E->Scalars.size(),
11143
10964
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
11144
10965
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
11145
- Vec = ShuffleBuilder.gather(NonConstants, Vec);
10966
+ Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
11146
10967
});
11147
10968
} else if (!allConstant(GatheredScalars)) {
11148
10969
// Gather unique scalars and all constants.
11149
10970
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
11150
10971
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
11151
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
10972
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size() );
11152
10973
ShuffleBuilder.add(BV, ReuseMask);
11153
10974
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
11154
10975
} else {
0 commit comments