@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
   return ArgTys;
 }
 
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+                                     unsigned ScalarSize,
+                                     TargetTransformInfo *TTI) {
+  SetVector<Value *> ArgRoots;
+  for (unsigned I = 0; I < ScalarSize; I++) {
+    auto *Scalar = Scalars[I];
+    if (!Scalar)
+      continue;
+    auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+    if (!EE)
+      continue;
+
+    auto *Vec = EE->getOperand(0);
+    if (!Vec->getType()->isVectorTy())
+      continue;
+
+    auto F = EE->getFunction();
+    auto FoundIt = find_if(
+        F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+    if (FoundIt == F->arg_end())
+      continue;
+
+    if (!ArgRoots.contains(Vec))
+      ArgRoots.insert(Vec);
+  }
+
+  if (!ArgRoots.size())
+    return 0;
+
+  unsigned Cost = 0;
+  for (auto ArgOp : ArgRoots) {
+    Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+                .getValue()
+                .value_or(0);
+  }
+  return Cost;
+}
+
 InstructionCost
 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                       SmallPtrSetImpl<Value *> &CheckedExtracts) {
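For readers skimming the hunk above: the scan in getCCCostFromScalars reduces to collecting the distinct vector function arguments that feed the gathered scalars through extractelement, since those are the values whose calling-convention cost the helper charges. A minimal standalone sketch of that idea follows; the helper name and structure are illustrative only and not part of the patch.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Illustrative helper (not in the patch): gather the distinct vector function
// arguments reached via extractelement from a bundle of scalars.
static SetVector<Value *> collectArgumentVectorRoots(ArrayRef<Value *> Scalars) {
  SetVector<Value *> ArgRoots;
  for (Value *Scalar : Scalars) {
    auto *EE = dyn_cast_or_null<ExtractElementInst>(Scalar);
    if (!EE)
      continue;
    Value *Vec = EE->getVectorOperand();
    // Only incoming function arguments matter here: the calling convention
    // may already force them apart into scalar registers, so vectorizing the
    // extracts does not remove any real work.
    if (Vec->getType()->isVectorTy() && isa<Argument>(Vec))
      ArgRoots.insert(Vec); // SetVector::insert deduplicates.
  }
  return ArgRoots;
}

Note that isa<Argument>(Vec) is equivalent to the find_if over F->args() used in the patch, and SetVector::insert already skips duplicates, so the separate contains() check is not strictly needed.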
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
 
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+  InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
   if (E->State == TreeEntry::NeedToGather) {
     if (allConstant(VL))
-      return 0;
+      return CommonCost;
     if (isa<InsertElementInst>(VL[0]))
       return InstructionCost::getInvalid();
-    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
-        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+    return CommonCost +
+           processBuildVector<ShuffleCostEstimator, InstructionCost>(
+               E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
   }
-  InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
   bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
   if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
 
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
     Cost += C;
+
+    // Calculate the cost difference of propagating a vector vs. a series of
+    // scalars across blocks. This may be nonzero in the case of illegal vectors.
+    Instruction *VL0 = TE.getMainOp();
+    bool IsAPhi = VL0 && isa<PHINode>(VL0);
+    bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+    bool LiveThru = false;
+    if (HasNextEntry) {
+      Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+      LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+    }
+    if (IsAPhi || LiveThru) {
+      VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+      Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+      if (ScalarTy && isValidElementType(ScalarTy)) {
+        InstructionCost ScalarDFlow =
+            TTI->getDataFlowCost(ScalarTy,
+                                 /*IsCallingConv*/ false) *
+            TE.getVectorFactor();
+        InstructionCost VectorDFlow =
+            TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+        Cost += (VectorDFlow - ScalarDFlow);
+      }
+    }
+
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                       << shortBundleName(TE.Scalars) << ".\n"
                       << "SLP: Current total cost = " << Cost << "\n");
@@ -10257,15 +10328,24 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   for (ExternalUser &EU : ExternalUses) {
     // We only add extract cost once for the same scalar.
     if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
-        !ExtractCostCalculated.insert(EU.Scalar).second)
+        !ExtractCostCalculated.insert(EU.Scalar).second) {
       continue;
+    }
 
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
     // removed as well).
     if (EphValues.count(EU.User))
       continue;
 
+    // Account for any additional costs required by the calling convention for
+    // the type.
+    if (isa_and_nonnull<ReturnInst>(EU.User)) {
+      Cost +=
+          TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+      continue;
+    }
+
     // No extract cost for vector "scalar"
     if (isa<FixedVectorType>(EU.Scalar->getType()))
       continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   if (ViewSLPTree)
     ViewGraph(this, "SLP" + F->getName(), false, Str);
 #endif
-
   return Cost;
 }
 