Commit 9980c99

[SLP]Improve shuffles cost estimation where possible.
Improved/fixed cost modeling for shuffles by providing the actual shuffle masks, and improved the cost model for non-identity insertelements.

Differential Revision: https://reviews.llvm.org/D115462
1 parent 4c1e487 commit 9980c99
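The "providing masks" part refers to the shuffle-cost queries in the diff below, which now pass the concrete lane mask to TTI::getShuffleCost instead of only a shuffle kind, so identity-like rearrangements can be priced as cheap while real permutations are charged. A minimal standalone sketch of that distinction (plain C++, not the LLVM API; -1 marks an undefined lane, as UndefMaskElem does in the patch):

    #include <cstdio>
    #include <vector>

    // -1 marks an undefined lane, mirroring the UndefMaskElem convention below.
    static bool isIdentityMask(const std::vector<int> &Mask) {
      for (int I = 0, E = (int)Mask.size(); I != E; ++I)
        if (Mask[I] != -1 && Mask[I] != I)
          return false;
      return true;
    }

    int main() {
      std::vector<int> Identity = {0, 1, -1, -1}; // no real data movement
      std::vector<int> Rotate = {1, 2, 3, 0};     // a genuine permutation
      std::printf("identity-like: %d\n", (int)isIdentityMask(Identity)); // 1
      std::printf("rotate:        %d\n", (int)isIdentityMask(Rotate));   // 0
    }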

11 files changed: +282 -301 lines changed


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 68 additions & 33 deletions
@@ -5557,17 +5557,17 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
   for (auto *V : VL) {
     ++Idx;
 
-    // Need to exclude undefs from analysis.
-    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
-      continue;
-
     // Reached the start of a new vector registers.
     if (Idx % EltsPerVector == 0) {
       RegMask.assign(EltsPerVector, UndefMaskElem);
       AllConsecutive = true;
       continue;
     }
 
+    // Need to exclude undefs from analysis.
+    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+      continue;
+
     // Check all extracts for a vector register on the target directly
     // extract values in order.
     unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
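The reordering above means the per-register reset now runs before the undef check, so an undef element that happens to sit on a register boundary no longer skips resetting RegMask and AllConsecutive. A minimal sketch of that ordering (plain C++ with made-up data, not the SLP code itself):

    #include <cstdio>
    #include <vector>

    int main() {
      const int EltsPerVector = 4;
      // 'true' marks an element excluded as undef; element 4 is undef and also
      // the first element of the second 4-wide register.
      std::vector<bool> IsUndef = {false, false, false, false,
                                   true,  false, false, false};
      for (int Idx = 0, E = (int)IsUndef.size(); Idx != E; ++Idx) {
        // Reached the start of a new vector register: reset the per-register
        // state *before* deciding whether to skip the element.
        if (Idx % EltsPerVector == 0) {
          std::printf("reset per-register state at element %d\n", Idx);
          continue;
        }
        // Only now exclude undefs from the analysis.
        if (IsUndef[Idx])
          continue;
        // ... per-element consecutiveness analysis would go here ...
      }
    }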
@@ -6012,23 +6012,42 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       assert(E->ReuseShuffleIndices.empty() &&
              "Unique insertelements only are expected.");
       auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
-
       unsigned const NumElts = SrcVecTy->getNumElements();
       unsigned const NumScalars = VL.size();
+
+      unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+      unsigned OffsetBeg = *getInsertIndex(VL.front());
+      unsigned OffsetEnd = OffsetBeg;
+      for (Value *V : VL.drop_front()) {
+        unsigned Idx = *getInsertIndex(V);
+        if (OffsetBeg > Idx)
+          OffsetBeg = Idx;
+        else if (OffsetEnd < Idx)
+          OffsetEnd = Idx;
+      }
+      unsigned VecSz = NumElts;
+      unsigned VecScalarsSz = NumScalars;
+      if (NumOfParts > 0) {
+        VecScalarsSz = (NumElts + NumOfParts - 1) / NumOfParts;
+        VecSz = PowerOf2Ceil(
+            (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+            VecScalarsSz);
+      }
+
       APInt DemandedElts = APInt::getZero(NumElts);
       // TODO: Add support for Instruction::InsertValue.
       SmallVector<int> Mask;
       if (!E->ReorderIndices.empty()) {
         inversePermutation(E->ReorderIndices, Mask);
-        Mask.append(NumElts - NumScalars, UndefMaskElem);
       } else {
-        Mask.assign(NumElts, UndefMaskElem);
-        std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+        Mask.assign(VecSz, UndefMaskElem);
+        std::iota(Mask.begin(), std::next(Mask.begin(), VecSz), 0);
       }
-      unsigned Offset = *getInsertIndex(VL0);
       bool IsIdentity = true;
-      SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+      SmallVector<int> PrevMask(VecSz, UndefMaskElem);
       Mask.swap(PrevMask);
+      unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
       for (unsigned I = 0; I < NumScalars; ++I) {
         unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
         DemandedElts.setBit(InsertIdx);
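In effect, this hunk computes the slice of vector registers that the insertelements actually touch: VecScalarsSz is the per-register element count, OffsetBeg/OffsetEnd bound the inserted lanes, and VecSz/Offset describe the power-of-two-padded slice used by the shuffle-cost queries in the next hunk. A worked example with hypothetical numbers (plain C++, mirroring the arithmetic above):

    #include <cstdio>

    static unsigned powerOf2Ceil(unsigned X) {
      unsigned P = 1;
      while (P < X)
        P *= 2;
      return P;
    }

    int main() {
      // Hypothetical numbers: a 16-element destination vector split by the
      // target into 4 registers, with insertelements at lanes 5..7.
      unsigned NumElts = 16, NumOfParts = 4;
      unsigned OffsetBeg = 5, OffsetEnd = 7;

      unsigned VecScalarsSz = (NumElts + NumOfParts - 1) / NumOfParts; // 4
      unsigned VecSz = powerOf2Ceil(
          (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
          VecScalarsSz);                                               // 4
      unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);     // 4

      // Later shuffle costs are modeled on this 4-wide slice starting at
      // lane 4, not on the full 16-wide destination vector.
      std::printf("VecScalarsSz=%u VecSz=%u Offset=%u\n", VecScalarsSz, VecSz,
                  Offset);
    }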
@@ -6041,32 +6060,45 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                             /*Insert*/ true, /*Extract*/ false);
 
-      if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
-        // FIXME: Replace with SK_InsertSubvector once it is properly supported.
-        unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
-        Cost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc,
-            FixedVectorType::get(SrcVecTy->getElementType(), Sz));
-      } else if (!IsIdentity) {
-        auto *FirstInsert =
-            cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
-              return !is_contained(E->Scalars,
-                                   cast<Instruction>(V)->getOperand(0));
-            }));
-        if (isUndefVector(FirstInsert->getOperand(0))) {
-          Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+      // First cost - resize to actual vector size if not identity shuffle or
+      // need to shift the vector.
+      // Do not calculate the cost if the actual size is the register size and
+      // we can merge this shuffle with the following SK_Select.
+      auto *ActualVecTy =
+          FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+      if ((!IsIdentity || Offset != OffsetBeg) && VecScalarsSz != VecSz)
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    ActualVecTy, Mask);
+      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+      }));
+      // Second cost - permutation with subvector, if some elements are from the
+      // initial vector or inserting a subvector.
+      // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+      // subvector of ActualVecTy.
+      if (!isUndefVector(FirstInsert->getOperand(0)) &&
+          (Offset != OffsetBeg || (OffsetEnd + 1) % VecScalarsSz != 0)) {
+        unsigned InsertVecSz = PowerOf2Ceil(OffsetEnd - OffsetBeg + 1);
+        if (InsertVecSz != VecSz) {
+          Cost += TTI->getShuffleCost(
+              TTI::SK_InsertSubvector,
+              (InsertVecSz < VecScalarsSz && NumOfParts > 0)
+                  ? FixedVectorType::get(SrcVecTy->getElementType(),
+                                         VecScalarsSz)
+                  : ActualVecTy,
+              None, OffsetBeg - Offset,
+              FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz));
         } else {
-          SmallVector<int> InsertMask(NumElts);
-          std::iota(InsertMask.begin(), InsertMask.end(), 0);
-          for (unsigned I = 0; I < NumElts; I++) {
+          for (unsigned I = 0; I < OffsetBeg; ++I)
+            Mask[I] = I;
+          for (unsigned I = OffsetBeg; I <= OffsetEnd; ++I)
             if (Mask[I] != UndefMaskElem)
-              InsertMask[Offset + I] = NumElts + I;
-          }
-          Cost +=
-              TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+              Mask[I] = I + VecSz;
+          for (unsigned I = OffsetEnd + 1; I < VecSz; ++I)
+            Mask[I] = I;
+          Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, ActualVecTy, Mask);
         }
       }
-
       return Cost;
     }
     case Instruction::ZExt:
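The cost is now assembled from up to two shuffles: an optional single-source permute that reshapes the built lanes within the register-sized slice, followed either by an SK_InsertSubvector (when the power-of-two-padded insert range does not fill the whole slice) or by an SK_PermuteTwoSrc whose mask keeps the original vector's lanes outside [OffsetBeg, OffsetEnd] and takes the freshly built lanes inside it. A small sketch of how that two-source mask is laid out (plain C++ with hypothetical sizes):

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical sizes; -1 would mark an undefined lane (UndefMaskElem).
      unsigned VecSz = 8, OffsetBeg = 2, OffsetEnd = 5;
      std::vector<int> Mask(VecSz);
      // Start from the permutation of the built lanes; here simply an identity.
      for (unsigned I = 0; I < VecSz; ++I)
        Mask[I] = (int)I;

      // Lanes before the inserted range keep the original vector (source 0).
      for (unsigned I = 0; I < OffsetBeg; ++I)
        Mask[I] = (int)I;
      // Lanes inside the range take the freshly built vector (source 1),
      // written as I + VecSz in the two-source mask convention.
      for (unsigned I = OffsetBeg; I <= OffsetEnd; ++I)
        if (Mask[I] != -1)
          Mask[I] = (int)(I + VecSz);
      // Trailing lanes keep the original vector again.
      for (unsigned I = OffsetEnd + 1; I < VecSz; ++I)
        Mask[I] = (int)I;

      for (int M : Mask)
        std::printf("%d ", M); // prints: 0 1 10 11 12 13 6 7
      std::printf("\n");
    }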
@@ -6519,7 +6551,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // No need to vectorize inserts of gathered values.
   if (VectorizableTree.size() == 2 &&
       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
-      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+      VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+      (VectorizableTree[1]->getVectorFactor() <= 2 ||
+       !(isSplat(VectorizableTree[1]->Scalars) ||
+         allConstant(VectorizableTree[1]->Scalars))))
     return true;
 
   // We can vectorize the tree if its size is greater than or equal to the
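With the extra clauses above, the two-node tree of insertelements plus a gathered operand list is still dismissed as too small only when the gather is at most two lanes wide or its scalars are neither a splat nor all constants; wider splat or all-constant gathers now fall through to the usual cost comparison. A hypothetical restatement of that predicate (plain C++; the helper name is illustrative, not the SLP API):

    #include <cstdio>

    // Illustrative restatement of the relaxed bail-out; names are
    // hypothetical, not the SLP vectorizer API.
    static bool isTinyInsertPlusGatherTree(unsigned VectorFactor, bool IsSplat,
                                           bool AllConstant) {
      return VectorFactor <= 2 || !(IsSplat || AllConstant);
    }

    int main() {
      // A 4-wide splat gather is no longer dismissed as tiny, so it can reach
      // the normal cost comparison.
      std::printf("%d\n", (int)isTinyInsertPlusGatherTree(4, /*IsSplat=*/true,
                                                          /*AllConstant=*/false));
      // A 4-wide gather of unrelated values is still treated as tiny.
      std::printf("%d\n", (int)isTinyInsertPlusGatherTree(4, /*IsSplat=*/false,
                                                          /*AllConstant=*/false));
    }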

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

Lines changed: 48 additions & 48 deletions
@@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; NOACCELERATE-NEXT:    [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; NOACCELERATE-NEXT:    [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; NOACCELERATE-NEXT:    [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; NOACCELERATE-NEXT:    [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; NOACCELERATE-NEXT:    [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
@@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) {
 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
+; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0
-; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1
-; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
-; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; NOACCELERATE-NEXT:    [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3
-; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0
+; NOACCELERATE-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1
+; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]])
+; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %a, align 16
