@@ -5685,17 +5685,17 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
   for (auto *V : VL) {
     ++Idx;

-    // Need to exclude undefs from analysis.
-    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
-      continue;
-
     // Reached the start of a new vector register.
     if (Idx % EltsPerVector == 0) {
       RegMask.assign(EltsPerVector, UndefMaskElem);
       AllConsecutive = true;
       continue;
     }

+    // Need to exclude undefs from analysis.
+    if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem)
+      continue;
+
     // Check all extracts for a vector register on the target directly
     // extract values in order.
     unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
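Moving the undef check below the register-boundary check changes behavior when an undef element lands exactly on a register boundary: previously the early `continue` fired before the reset, so `RegMask` and `AllConsecutive` leaked stale state into the next register; now the reset always runs first. A minimal standalone sketch of the fixed control flow, assuming `EltsPerVector` is 4 and using a plain `int` mask with -1 for `UndefMaskElem` (not the pass's real types):

```cpp
#include <cstdio>
#include <vector>

// Sketch only: -1 stands in for UndefMaskElem and the register width
// EltsPerVector is assumed to be 4.
constexpr int UndefElem = -1;
constexpr unsigned EltsPerVector = 4;

int main() {
  // Lane 4, the first lane of the second register, is undef.
  std::vector<int> Mask = {0, 1, 2, 3, UndefElem, 5, 6, 7};
  int Idx = -1;
  for (int Elem : Mask) {
    ++Idx;
    // Boundary reset first: it runs even when the lane is undef, so the
    // per-register state can no longer leak from the previous register.
    if (Idx % EltsPerVector == 0) {
      std::printf("register %u: per-register state reset\n",
                  Idx / EltsPerVector);
      continue;
    }
    // Undefs are excluded from the analysis only after that reset.
    if (Elem == UndefElem)
      continue;
    std::printf("  lane %d analyzed\n", Idx);
  }
  return 0;
}
```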
@@ -6145,61 +6145,102 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       assert(E->ReuseShuffleIndices.empty() &&
              "Unique insertelements only are expected.");
       auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
-
       unsigned const NumElts = SrcVecTy->getNumElements();
       unsigned const NumScalars = VL.size();
+
+      unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+      unsigned OffsetBeg = *getInsertIndex(VL.front());
+      unsigned OffsetEnd = OffsetBeg;
+      for (Value *V : VL.drop_front()) {
+        unsigned Idx = *getInsertIndex(V);
+        if (OffsetBeg > Idx)
+          OffsetBeg = Idx;
+        else if (OffsetEnd < Idx)
+          OffsetEnd = Idx;
+      }
+      unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
+      if (NumOfParts > 0)
+        VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
+      unsigned VecSz =
+          (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+          VecScalarsSz;
+      unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
+      unsigned InsertVecSz = std::min<unsigned>(
+          PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
+          ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
+              VecScalarsSz);
+      bool IsWholeSubvector =
+          OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
+      // Check if we can safely insert a subvector. If it is not possible, just
+      // generate a whole-sized vector and shuffle the source vector and the new
+      // subvector.
+      if (OffsetBeg + InsertVecSz > VecSz) {
+        // Align OffsetBeg to generate correct mask.
+        OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
+        InsertVecSz = VecSz;
+      }
+
       APInt DemandedElts = APInt::getZero(NumElts);
       // TODO: Add support for Instruction::InsertValue.
       SmallVector<int> Mask;
       if (!E->ReorderIndices.empty()) {
         inversePermutation(E->ReorderIndices, Mask);
-        Mask.append(NumElts - NumScalars, UndefMaskElem);
+        Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
       } else {
-        Mask.assign(NumElts, UndefMaskElem);
-        std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+        Mask.assign(VecSz, UndefMaskElem);
+        std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
       }
-      unsigned Offset = *getInsertIndex(VL0);
       bool IsIdentity = true;
-      SmallVector<int> PrevMask(NumElts, UndefMaskElem);
+      SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
       Mask.swap(PrevMask);
       for (unsigned I = 0; I < NumScalars; ++I) {
         unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
         DemandedElts.setBit(InsertIdx);
-        IsIdentity &= InsertIdx - Offset == I;
-        Mask[InsertIdx - Offset] = I;
+        IsIdentity &= InsertIdx - OffsetBeg == I;
+        Mask[InsertIdx - OffsetBeg] = I;
       }
       assert(Offset < NumElts && "Failed to find vector index offset");

       InstructionCost Cost = 0;
       Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
                                             /*Insert*/ true, /*Extract*/ false);

-      if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
-        // FIXME: Replace with SK_InsertSubvector once it is properly supported.
-        unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
-        Cost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc,
-            FixedVectorType::get(SrcVecTy->getElementType(), Sz));
-      } else if (!IsIdentity) {
-        auto *FirstInsert =
-            cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
-              return !is_contained(E->Scalars,
-                                   cast<Instruction>(V)->getOperand(0));
-            }));
-        if (isUndefVector(FirstInsert->getOperand(0))) {
-          Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask);
+      // First cost - resize to actual vector size if not identity shuffle or
+      // need to shift the vector.
+      // Do not calculate the cost if the actual size is the register size and
+      // we can merge this shuffle with the following SK_Select.
+      auto *InsertVecTy =
+          FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+      if (!IsIdentity)
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                    InsertVecTy, Mask);
+      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+      }));
+      // Second cost - permutation with subvector, if some elements are from the
+      // initial vector or inserting a subvector.
+      // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+      // subvector of ActualVecTy.
+      if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts &&
+          !IsWholeSubvector) {
+        if (InsertVecSz != VecSz) {
+          auto *ActualVecTy =
+              FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+          Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
+                                      None, OffsetBeg - Offset, InsertVecTy);
         } else {
-          SmallVector<int> InsertMask(NumElts);
-          std::iota(InsertMask.begin(), InsertMask.end(), 0);
-          for (unsigned I = 0; I < NumElts; I++) {
+          for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
+            Mask[I] = I;
+          for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
+               I <= End; ++I)
             if (Mask[I] != UndefMaskElem)
-              InsertMask[Offset + I] = NumElts + I;
-          }
-          Cost +=
-              TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask);
+              Mask[I] = I + VecSz;
+          for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
+            Mask[I] = I;
+          Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
         }
       }
-
       return Cost;
     }
     case Instruction::ZExt:
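The rewritten InsertElement cost model sizes its shuffles to the registers the inserts actually touch instead of the full destination vector. A worked example with made-up numbers, as a hedged standalone sketch: `powerOf2Ceil` is a local stand-in for `llvm::PowerOf2Ceil`, and the `alignDown` realignment branch is elided because it does not fire with these inputs:

```cpp
#include <algorithm>
#include <cstdio>

// Local stand-in for llvm::PowerOf2Ceil (sketch only).
unsigned powerOf2Ceil(unsigned X) {
  unsigned P = 1;
  while (P < X)
    P <<= 1;
  return P;
}

int main() {
  // Assumed inputs: a <16 x i32> destination that the target splits into
  // NumOfParts = 2 registers, with insertelements covering indices 3..6.
  unsigned NumElts = 16, NumOfParts = 2;
  unsigned OffsetBeg = 3, OffsetEnd = 6;

  // Scalars per register: PowerOf2Ceil((16 + 2 - 1) / 2) = 8.
  unsigned VecScalarsSz = powerOf2Ceil(NumElts);
  if (NumOfParts > 0)
    VecScalarsSz = powerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
  // Lanes spanned by the inserts, rounded out to whole registers:
  // (1 + 6/8 - 3/8) * 8 = 8.
  unsigned VecSz =
      (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * VecScalarsSz;
  // Start of the first touched register: 8 * (3/8) = 0.
  unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
  // Smallest sensible subvector for the inserted lanes: min(4, 8) = 4.
  unsigned InsertVecSz =
      std::min(powerOf2Ceil(OffsetEnd - OffsetBeg + 1),
               (OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz *
                   VecScalarsSz);
  // Not a whole register: the inserts start at 3 and stop at 6.
  bool IsWholeSubvector =
      OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
  // 3 + 4 <= 8, so the subvector fits and no realignment is needed.
  if (OffsetBeg + InsertVecSz > VecSz)
    InsertVecSz = VecSz; // (alignDown of OffsetBeg elided in this sketch)

  // Prints "VecSz=8 Offset=0 InsertVecSz=4 IsWholeSubvector=0": when the
  // lanes need permuting, that is a <4 x i32> single-source shuffle plus an
  // SK_InsertSubvector at index OffsetBeg - Offset = 3 of an <8 x i32>,
  // instead of shuffles over the whole <16 x i32> as before.
  std::printf("VecSz=%u Offset=%u InsertVecSz=%u IsWholeSubvector=%d\n",
              VecSz, Offset, InsertVecSz, (int)IsWholeSubvector);
  return 0;
}
```

When `InsertVecSz` equals `VecSz`, the diff instead rewrites `Mask` in place (identity lanes outside the insert range, `I + VecSz` for defined inserted lanes) and prices one `SK_PermuteTwoSrc` on the register-sized type, so both paths stay register-sized.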
@@ -6659,7 +6700,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
   // No need to vectorize inserts of gathered values.
   if (VectorizableTree.size() == 2 &&
       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
-      VectorizableTree[1]->State == TreeEntry::NeedToGather)
+      VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+      (VectorizableTree[1]->getVectorFactor() <= 2 ||
+       !(isSplat(VectorizableTree[1]->Scalars) ||
+         allConstant(VectorizableTree[1]->Scalars))))
     return true;

   // We can vectorize the tree if its size is greater than or equal to the
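The added conditions narrow the "tiny tree" bail-out: a two-node tree whose gather node is a splat or an all-constant bundle with a vector factor above 2 now stays eligible, since a splat lowers to a single broadcast and constants fold into a constant vector. A standalone sketch of the tightened predicate, with plain booleans standing in for the real `TreeEntry` queries (hypothetical helper, not the BoUpSLP API):

```cpp
#include <cstdio>

// Sketch of the tightened bail-out; returns true when the tree should be
// treated as too tiny to vectorize profitably.
static bool isTinyInsertGatherTree(unsigned TreeSize, bool RootIsInsertElement,
                                   bool SecondIsGather, unsigned GatherVF,
                                   bool GatherIsSplat, bool GatherAllConstant) {
  return TreeSize == 2 && RootIsInsertElement && SecondIsGather &&
         (GatherVF <= 2 || !(GatherIsSplat || GatherAllConstant));
}

int main() {
  // A 4-wide splat gather feeding insertelements is no longer written off:
  // prints 0, so vectorization proceeds.
  std::printf("%d\n", isTinyInsertGatherTree(2, true, true, 4, true, false));
  // A 4-wide gather of unrelated scalars is still considered unprofitable:
  // prints 1, so the tree is rejected as before.
  std::printf("%d\n", isTinyInsertGatherTree(2, true, true, 4, false, false));
  return 0;
}
```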
@@ -7693,6 +7737,11 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   return createBuildVector(VL);
 }
 Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) {
+  assert(any_of(VectorizableTree,
+                [VL](const std::unique_ptr<TreeEntry> &TE) {
+                  return TE->State == TreeEntry::NeedToGather && TE->isSame(VL);
+                }) &&
+         "Non-matching gather node.");
   unsigned VF = VL.size();
   // Exploit possible reuse of values across lanes.
   SmallVector<int> ReuseShuffleIndicies;
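The new assertion makes the contract of `createBuildVector` explicit: it must only be called for bundles that match a `NeedToGather` entry already recorded in `VectorizableTree`. A minimal standalone analogue of the check, with a simplified `Node` standing in for `TreeEntry` and `std::any_of` in place of `llvm::any_of`:

```cpp
#include <algorithm>
#include <cassert>
#include <memory>
#include <vector>

// Simplified stand-in for TreeEntry: a gather flag plus the bundled scalars.
struct Node {
  bool NeedToGather;
  std::vector<int> Scalars;
  bool isSame(const std::vector<int> &VL) const { return Scalars == VL; }
};

int main() {
  std::vector<std::unique_ptr<Node>> Tree;
  Tree.push_back(std::make_unique<Node>(Node{true, {1, 2, 3}}));

  std::vector<int> VL = {1, 2, 3};
  // Same shape as the new assert: any_of over the owning node pointers,
  // looking for a gather entry whose scalars match the requested bundle.
  assert(std::any_of(Tree.begin(), Tree.end(),
                     [&VL](const std::unique_ptr<Node> &TE) {
                       return TE->NeedToGather && TE->isSame(VL);
                     }) &&
         "Non-matching gather node.");
  return 0;
}
```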
0 commit comments