-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP]Improve/fix extracts calculations for non-power-of-2 elements. #93213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Improve/fix extracts calculations for non-power-of-2 elements. #93213
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesOne of the previous patches introduced initial support for non-power-of-2 Patch is 59.99 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93213.diff 8 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 08ecbe304429e..f044a8cdd2f31 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -255,6 +255,21 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
return isConstant(I->getOperand(2));
}
+/// Returns power-of-2 number of elements in a single register (part), given the
+/// total number of elements \p Size and number of registers (parts) \p
+/// NumParts.
+static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
+ return PowerOf2Ceil(divideCeil(Size, NumParts));
+}
+
+/// Returns correct remaining number of elements, considering total amount \p
+/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
+/// and current register (part) \p Part.
+static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
+ unsigned Part) {
+ return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
+}
+
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
@@ -4081,7 +4096,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
const int VF = GetVF(I);
if (VF == 0)
continue;
- MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+ unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
+ MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
// Shuffle of at least 2 vectors - ignore.
if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
std::fill(Slice.begin(), Slice.end(), NumScalars);
@@ -4091,7 +4107,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
// Try to include as much elements from the mask as possible.
int FirstMin = INT_MAX;
int SecondVecFound = false;
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem) {
Value *V = GatheredScalars[I * PartSz + K];
@@ -4116,7 +4132,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
ShuffledSubMasks.set(I);
continue;
}
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem)
continue;
@@ -4139,14 +4155,15 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
}
};
- int PartSz = NumScalars / NumParts;
+ int PartSz = getPartNumElems(NumScalars, NumParts);
if (!ExtractShuffles.empty())
TransformMaskToOrder(
CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
if (!ExtractShuffles[I])
return 0U;
unsigned VF = 0;
- for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+ unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
+ for (unsigned Idx : seq<unsigned>(Sz)) {
int K = I * PartSz + Idx;
if (ExtractMask[K] == PoisonMaskElem)
continue;
@@ -4777,12 +4794,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
::addMask(ReorderMask, TE.ReuseShuffleIndices);
unsigned VF = ReorderMask.size();
OrdersType ResOrder(VF, VF);
- unsigned NumParts = VF / Sz;
+ unsigned NumParts = divideCeil(VF, Sz);
SmallBitVector UsedVals(NumParts);
for (unsigned I = 0; I < VF; I += Sz) {
int Val = PoisonMaskElem;
unsigned UndefCnt = 0;
- if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+ unsigned Limit = std::min(Sz, VF - I);
+ if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
[&](int Idx) {
if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
Val = Idx;
@@ -8281,19 +8299,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Sz;
return std::max(Sz, VecTy->getNumElements());
});
- unsigned NumSrcRegs =
- TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
- if (NumSrcRegs == 0)
- NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
- unsigned EltsPerVector = PowerOf2Ceil(std::max(
- divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+ unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
auto CheckPerRegistersShuffle =
- [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+ [&](MutableArrayRef<int> Mask,
+ SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
+ if (NumElts <= EltsPerVector)
+ return std::nullopt;
DenseSet<int> RegIndices;
// Check that if trying to permute same single/2 input vectors.
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
int FirstRegId = -1;
+ Indices.assign(1, -1);
for (int &I : Mask) {
if (I == PoisonMaskElem)
continue;
@@ -8303,8 +8320,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
RegIndices.insert(RegId);
if (RegIndices.size() > 2)
return std::nullopt;
- if (RegIndices.size() == 2)
+ if (RegIndices.size() == 2) {
ShuffleKind = TTI::SK_PermuteTwoSrc;
+ if (Indices.size() == 1)
+ Indices.push_back(-1);
+ }
+ if (RegId == FirstRegId)
+ Indices.front() = I % NumElts;
+ else
+ Indices.back() = I % NumElts;
I = (I % NumElts) % EltsPerVector +
(RegId == FirstRegId ? 0 : EltsPerVector);
}
@@ -8315,22 +8339,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ for (unsigned Part : seq<unsigned>(NumParts)) {
if (!ShuffleKinds[Part])
continue;
- ArrayRef<int> MaskSlice =
- Mask.slice(Part * EltsPerVector,
- (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
- ? Mask.size() % EltsPerVector
- : EltsPerVector);
+ ArrayRef<int> MaskSlice = Mask.slice(
+ Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
+ SmallVector<int> Indices;
std::optional<TTI::ShuffleKind> RegShuffleKind =
- CheckPerRegistersShuffle(SubMask);
+ CheckPerRegistersShuffle(SubMask, Indices);
if (!RegShuffleKind) {
- Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
- FixedVectorType::get(ScalarTy, NumElts),
- MaskSlice);
+ if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
+ !ShuffleVectorInst::isIdentityMask(
+ MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
+ Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts),
+ MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
@@ -8339,6 +8364,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
FixedVectorType::get(ScalarTy, EltsPerVector),
SubMask);
}
+ for (int Idx : Indices) {
+ Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
+ FixedVectorType::get(ScalarTy, NumElts),
+ std::nullopt, CostKind, Idx,
+ FixedVectorType::get(ScalarTy, EltsPerVector));
+ }
}
return Cost;
}
@@ -8366,11 +8397,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InVectors.front().get<const TreeEntry *>() == &E1 &&
InVectors.back().get<const TreeEntry *>() == E2) ||
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
- assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
"Expected all poisoned elements.");
- ArrayRef<int> SubMask =
- ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
+ ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
return;
}
@@ -8690,10 +8721,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
});
});
SmallPtrSet<Value *, 4> UniqueBases;
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -8790,7 +8822,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -8807,7 +8839,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -10664,12 +10696,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
// Scan list of gathered scalars for extractelements that can be represented
// as shuffles.
- MutableArrayRef<Value *> SubVL =
- MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+ MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
+ Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVector<int> SubMask;
std::optional<TTI::ShuffleKind> Res =
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
@@ -11073,10 +11105,11 @@ BoUpSLP::isGatherShuffledEntry(
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
- unsigned SliceSize = VL.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ ArrayRef<Value *> SubVL =
+ VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
@@ -11679,11 +11712,12 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- unsigned SliceSize = E->Scalars.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
ArrayRef<Value *> VL =
- ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
- MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
#ifndef NDEBUG
@@ -11720,7 +11754,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
[&](unsigned P) {
ArrayRef<int> SubMask =
- Mask.slice(P * SliceSize, SliceSize);
+ Mask.slice(P * SliceSize,
+ getNumElems(Mask.size(),
+ SliceSize, P));
return all_of(SubMask, [](int Idx) {
return Idx == PoisonMaskElem;
});
@@ -12104,13 +12140,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
- std::iota(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), 0);
+ std::iota(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ 0);
} else {
unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- std::fill(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
+ std::fill(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ IVal);
}
return true;
};
@@ -12370,7 +12412,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
}
if (!GatherShuffles.empty()) {
- unsigned SliceSize = E->Scalars.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
for (const auto [I, TEs] : enumerate(Entries)) {
if (TEs.empty()) {
@@ -12380,7 +12422,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
VecMask.assign(VecMask.size(), PoisonMaskElem);
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index dfa8be9741779..aceee8840bb40 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -56,25 +56,12 @@ define half @reduction_half16(<16 x half> %vec16) {
;
; VI-LABEL: @reduction_half16(
; VI-NEXT: entry:
-; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 8
-; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
-; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
-; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
-; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
-; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
-; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
-; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
-; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP0]])
-; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT8]]
-; VI-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT9]], [[ELT10]]
-; VI-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT11]], [[ELT12]]
-; VI-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT13]], [[ELT14]]
-; VI-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
-; VI-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX2]], [[OP_RDX3]]
-; VI-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX4]], [[OP_RDX5]]
-; VI-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX6]], [[ELT15]]
-; VI-NEXT: ret half [[OP_RDX7]]
+; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP2]])
+; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
+; VI-NEXT: ret half [[OP_RDX]]
;
entry:
%elt0 = extractelement <16 x half> %vec16, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 059e4c38b519b..9608608a18098 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,13 +155,11 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
auto CheckPerRegistersShuffle = | ||
[&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> { | ||
[&](MutableArrayRef<int> Mask, | ||
SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@alexey-bataev Should this be SmallVector<int> &Indices
? I noticed on #94546 that we never add the costs of TTI::SK_ExtractSubvector below
One of the previous patches introduced initial support for non-power-of-2
number of elements but some parts of the SLP vectorizer still were not
adjusted to handle the costs correctly. Patch fixes it by improving
analysis of the non-power-of-2 number of elements and fixes in the cost
of the extractelements instructions.