-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SLP]: Introduce and use getDataFlowCost #112999
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Change-Id: I6a9155f4af3f8ccc943ab9d46c07dab07dc9b5c5
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Jeffrey Byrnes (jrbyrnes) ChangesThis adds getDataFlowCost to the cost model. For certain vector types (e.g. vectors of illegal types), there may be costs which are not currently captured by the cost model. For example, SelectionDAGBuilder will likely scalarize vectors of illegal types that cross basic block boundaries. Similar scalarization may occur when handling illegal vector arguments or return values. This scalarization is ultimately a cost of vectorization, and it should be accounted for. That said, for legal types, this type of legalization scalarization will not occur. Moreover, when it does occur, the scalarization cost is the same as the cost of the scalarized version. However, AMDGPU has code in place to reduce this type of scalarization; thus the target override. Full diff: https://github.com/llvm/llvm-project/pull/112999.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a4..934012b2e53f5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1534,6 +1534,14 @@ class TargetTransformInfo {
Function *F, Type *RetTy, ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
+ /// \returns The cost of propagating Type \p DataType through Basic Block /
+ /// function boundaries. If \p IsCallingConv is specified, then \p DataType is
+ /// associated with either a function argument or return. Otherwise, \p
+ /// DataType is used in either a GEP instruction, or spans across BasicBlocks
+ /// (this is relevant because SelectionDAG builder may, for example, scalarize
+ /// illegal vectors across blocks, which introduces extract/insert code).
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const;
+
/// \returns The number of pieces into which the provided type must be
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
@@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept {
virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
+ virtual InstructionCost getDataFlowCost(Type *DataType,
+ bool IsCallingConv) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
@@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind) override {
return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override {
+ return Impl.getDataFlowCost(DataType, IsCallingConv);
+ }
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43c..5a25a88c3eb460 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,6 +772,10 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const {
+ return 0;
+ }
+
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9f8d3ded9b3c1a..c6a5c38a1b3fd5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return 10;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) {
+ return 0;
+ }
+
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
return LT.first.isValid() ? *LT.first.getValue() : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f0..edef9afa747d62 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
return Cost;
}
+InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) const {
+ InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b60..5d58cc62dbde09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,14 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}
+InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) {
+ if (isTypeLegal(DataType) || IsCallingConv)
+ return BaseT::getDataFlowCost(DataType, IsCallingConv);
+
+ return getNumberOfParts(DataType);
+}
+
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca2..c195c860075eb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -161,6 +161,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv);
+
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ae0819c964bef3..42617eb4cf2095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
return ArgTys;
}
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+ unsigned ScalarSize,
+ TargetTransformInfo *TTI) {
+ SetVector<Value *> ArgRoots;
+ for (unsigned I = 0; I < ScalarSize; I++) {
+ auto *Scalar = Scalars[I];
+ if (!Scalar)
+ continue;
+ auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+ if (!EE)
+ continue;
+
+ auto *Vec = EE->getOperand(0);
+ if (!Vec->getType()->isVectorTy())
+ continue;
+
+ auto F = EE->getFunction();
+ auto FoundIt = find_if(
+ F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+ if (FoundIt == F->arg_end())
+ continue;
+
+ if (!ArgRoots.contains(Vec))
+ ArgRoots.insert(Vec);
+ }
+
+ if (!ArgRoots.size())
+ return 0;
+
+ unsigned Cost = 0;
+ for (auto ArgOp : ArgRoots) {
+ Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+ .getValue()
+ .value_or(0);
+ }
+ return Cost;
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
- return 0;
+ return CommonCost;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+ return CommonCost +
+ processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
- InstructionCost CommonCost = 0;
SmallVector<int> Mask;
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
+
+ // Calculate the cost difference of propagating a vector vs series of scalars
+ // across blocks. This may be nonzero in the case of illegal vectors.
+ Instruction *VL0 = TE.getMainOp();
+ bool IsAPhi = VL0 && isa<PHINode>(VL0);
+ bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+ bool LiveThru = false;
+ if (HasNextEntry) {
+ Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+ LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+ }
+ if (IsAPhi || LiveThru) {
+ VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+ Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ InstructionCost ScalarDFlow =
+ TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv*/ false) *
+ TE.getVectorFactor();
+ InstructionCost VectorDFlow =
+ TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+ Cost += (VectorDFlow - ScalarDFlow);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
@@ -10257,8 +10328,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
- !ExtractCostCalculated.insert(EU.Scalar).second)
+ !ExtractCostCalculated.insert(EU.Scalar).second) {
continue;
+ }
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -10266,6 +10338,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (EphValues.count(EU.User))
continue;
+ // Account for any additional costs required by CallingConvention for the
+ // type.
+ if (isa_and_nonnull<ReturnInst>(EU.User)) {
+ Cost +=
+ TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+ continue;
+ }
+
// No extract cost for vector "scalar"
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
return Cost;
}
|
@llvm/pr-subscribers-llvm-analysis Author: Jeffrey Byrnes (jrbyrnes) ChangesThis adds getDataFlowCost to the cost model. For certain vector types (e.g. vectors of illegal types), there may be costs which are not currently captured by the cost model. For example, SelectionDAGBuilder will likely scalarize vectors of illegal types that cross basic block boundaries. Similar scalarization may occur when handling illegal vector arguments or return values. This scalarization is ultimately a cost of vectorization, and it should be accounted for. That said, for legal types, this type of legalization scalarization will not occur. Moreover, when it does occur, the scalarization cost is the same as the cost of the scalarized version. However, AMDGPU has code in place to reduce this type of scalarization; thus the target override. Full diff: https://github.com/llvm/llvm-project/pull/112999.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a4..934012b2e53f5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1534,6 +1534,14 @@ class TargetTransformInfo {
Function *F, Type *RetTy, ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
+ /// \returns The cost of propagating Type \p DataType through Basic Block /
+ /// function boundaries. If \p IsCallingConv is specified, then \p DataType is
+ /// associated with either a function argument or return. Otherwise, \p
+ /// DataType is used in either a GEP instruction, or spans across BasicBlocks
+ /// (this is relevant because SelectionDAG builder may, for example, scalarize
+ /// illegal vectors across blocks, which introduces extract/insert code).
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const;
+
/// \returns The number of pieces into which the provided type must be
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
@@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept {
virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
+ virtual InstructionCost getDataFlowCost(Type *DataType,
+ bool IsCallingConv) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
@@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind) override {
return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override {
+ return Impl.getDataFlowCost(DataType, IsCallingConv);
+ }
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43c..5a25a88c3eb460 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,6 +772,10 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const {
+ return 0;
+ }
+
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9f8d3ded9b3c1a..c6a5c38a1b3fd5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return 10;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) {
+ return 0;
+ }
+
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
return LT.first.isValid() ? *LT.first.getValue() : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f0..edef9afa747d62 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
return Cost;
}
+InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) const {
+ InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b60..5d58cc62dbde09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,14 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}
+InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) {
+ if (isTypeLegal(DataType) || IsCallingConv)
+ return BaseT::getDataFlowCost(DataType, IsCallingConv);
+
+ return getNumberOfParts(DataType);
+}
+
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca2..c195c860075eb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -161,6 +161,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv);
+
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ae0819c964bef3..42617eb4cf2095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
return ArgTys;
}
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+ unsigned ScalarSize,
+ TargetTransformInfo *TTI) {
+ SetVector<Value *> ArgRoots;
+ for (unsigned I = 0; I < ScalarSize; I++) {
+ auto *Scalar = Scalars[I];
+ if (!Scalar)
+ continue;
+ auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+ if (!EE)
+ continue;
+
+ auto *Vec = EE->getOperand(0);
+ if (!Vec->getType()->isVectorTy())
+ continue;
+
+ auto F = EE->getFunction();
+ auto FoundIt = find_if(
+ F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+ if (FoundIt == F->arg_end())
+ continue;
+
+ if (!ArgRoots.contains(Vec))
+ ArgRoots.insert(Vec);
+ }
+
+ if (!ArgRoots.size())
+ return 0;
+
+ unsigned Cost = 0;
+ for (auto ArgOp : ArgRoots) {
+ Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+ .getValue()
+ .value_or(0);
+ }
+ return Cost;
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
- return 0;
+ return CommonCost;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+ return CommonCost +
+ processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
- InstructionCost CommonCost = 0;
SmallVector<int> Mask;
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
+
+ // Calculate the cost difference of propagating a vector vs series of scalars
+ // across blocks. This may be nonzero in the case of illegal vectors.
+ Instruction *VL0 = TE.getMainOp();
+ bool IsAPhi = VL0 && isa<PHINode>(VL0);
+ bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+ bool LiveThru = false;
+ if (HasNextEntry) {
+ Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+ LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+ }
+ if (IsAPhi || LiveThru) {
+ VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+ Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ InstructionCost ScalarDFlow =
+ TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv*/ false) *
+ TE.getVectorFactor();
+ InstructionCost VectorDFlow =
+ TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+ Cost += (VectorDFlow - ScalarDFlow);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
@@ -10257,8 +10328,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
- !ExtractCostCalculated.insert(EU.Scalar).second)
+ !ExtractCostCalculated.insert(EU.Scalar).second) {
continue;
+ }
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -10266,6 +10338,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (EphValues.count(EU.User))
continue;
+ // Account for any additional costs required by CallingConvention for the
+ // type.
+ if (isa_and_nonnull<ReturnInst>(EU.User)) {
+ Cost +=
+ TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+ continue;
+ }
+
// No extract cost for vector "scalar"
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
return Cost;
}
|
@llvm/pr-subscribers-backend-amdgpu Author: Jeffrey Byrnes (jrbyrnes) ChangesThis adds getDataFlowCost to the cost model. For certain vector types (e.g. vectors of illegal types), there may be costs which are not currently captured by the cost model. For example, SelectionDAGBuilder will likely scalarize vectors of illegal types that cross basic block boundaries. Similar scalarization may occur when handling illegal vector arguments or return values. This scalarization is ultimately a cost of vectorization, and it should be accounted for. That said, for legal types, this type of legalization scalarization will not occur. Moreover, when it does occur, the scalarization cost is the same as the cost of the scalarized version. However, AMDGPU has code in place to reduce this type of scalarization; thus the target override. Full diff: https://github.com/llvm/llvm-project/pull/112999.diff 7 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a4..934012b2e53f5c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1534,6 +1534,14 @@ class TargetTransformInfo {
Function *F, Type *RetTy, ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
+ /// \returns The cost of propagating Type \p DataType through Basic Block /
+ /// function boundaries. If \p IsCallingConv is specified, then \p DataType is
+ /// associated with either a function argument or return. Otherwise, \p
+ /// DataType is used in either a GEP instruction, or spans across BasicBlocks
+ /// (this is relevant because SelectionDAG builder may, for example, scalarize
+ /// illegal vectors across blocks, which introduces extract/insert code).
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const;
+
/// \returns The number of pieces into which the provided type must be
/// split during legalization. Zero is returned when the answer is unknown.
unsigned getNumberOfParts(Type *Tp) const;
@@ -2096,6 +2104,8 @@ class TargetTransformInfo::Concept {
virtual InstructionCost getCallInstrCost(Function *F, Type *RetTy,
ArrayRef<Type *> Tys,
TTI::TargetCostKind CostKind) = 0;
+ virtual InstructionCost getDataFlowCost(Type *DataType,
+ bool IsCallingConv) = 0;
virtual unsigned getNumberOfParts(Type *Tp) = 0;
virtual InstructionCost
getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr) = 0;
@@ -2781,6 +2791,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind) override {
return Impl.getCallInstrCost(F, RetTy, Tys, CostKind);
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) override {
+ return Impl.getDataFlowCost(DataType, IsCallingConv);
+ }
unsigned getNumberOfParts(Type *Tp) override {
return Impl.getNumberOfParts(Tp);
}
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43c..5a25a88c3eb460 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -772,6 +772,10 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) const {
+ return 0;
+ }
+
// Assume that we have a register of the right size for the type.
unsigned getNumberOfParts(Type *Tp) const { return 1; }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 9f8d3ded9b3c1a..c6a5c38a1b3fd5 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2410,6 +2410,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return 10;
}
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv) {
+ return 0;
+ }
+
unsigned getNumberOfParts(Type *Tp) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
return LT.first.isValid() ? *LT.first.getValue() : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f0..edef9afa747d62 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1116,6 +1116,13 @@ TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
return Cost;
}
+InstructionCost TargetTransformInfo::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) const {
+ InstructionCost Cost = TTIImpl->getDataFlowCost(DataType, IsCallingConv);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
return TTIImpl->getNumberOfParts(Tp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 437e01c37c6b60..5d58cc62dbde09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -306,6 +306,14 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
return !F || !ST->isSingleLaneExecution(*F);
}
+InstructionCost GCNTTIImpl::getDataFlowCost(Type *DataType,
+ bool IsCallingConv) {
+ if (isTypeLegal(DataType) || IsCallingConv)
+ return BaseT::getDataFlowCost(DataType, IsCallingConv);
+
+ return getNumberOfParts(DataType);
+}
+
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
// NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
// registers. See getRegisterClassForType for the implementation.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b423df17302ca2..c195c860075eb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -161,6 +161,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ InstructionCost getDataFlowCost(Type *DataType, bool IsCallingConv);
+
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ae0819c964bef3..42617eb4cf2095 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9044,6 +9044,51 @@ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
return ArgTys;
}
+// The cost model may determine that vectorizing and eliminating a series of
+// ExtractElements is beneficial. However, if the input vector is a function
+// argument, the calling convention may require extractions in the generated
+// code. In this scenario, vectorization would then not eliminate the
+// ExtractElement sequence, but would add additional vectorization code.
+// getCCCostFromScalars does the proper accounting for this.
+static unsigned getCCCostFromScalars(ArrayRef<Value *> &Scalars,
+ unsigned ScalarSize,
+ TargetTransformInfo *TTI) {
+ SetVector<Value *> ArgRoots;
+ for (unsigned I = 0; I < ScalarSize; I++) {
+ auto *Scalar = Scalars[I];
+ if (!Scalar)
+ continue;
+ auto *EE = dyn_cast<ExtractElementInst>(Scalar);
+ if (!EE)
+ continue;
+
+ auto *Vec = EE->getOperand(0);
+ if (!Vec->getType()->isVectorTy())
+ continue;
+
+ auto F = EE->getFunction();
+ auto FoundIt = find_if(
+ F->args(), [&Vec](Argument &I) { return Vec == cast<Value>(&I); });
+
+ if (FoundIt == F->arg_end())
+ continue;
+
+ if (!ArgRoots.contains(Vec))
+ ArgRoots.insert(Vec);
+ }
+
+ if (!ArgRoots.size())
+ return 0;
+
+ unsigned Cost = 0;
+ for (auto ArgOp : ArgRoots) {
+ Cost += TTI->getDataFlowCost(ArgOp->getType(), /*IsCallingConv*/ true)
+ .getValue()
+ .value_or(0);
+ }
+ return Cost;
+}
+
InstructionCost
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9075,15 +9120,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ InstructionCost CommonCost = getCCCostFromScalars(VL, VL.size(), TTI);
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
- return 0;
+ return CommonCost;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
- return processBuildVector<ShuffleCostEstimator, InstructionCost>(
- E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
+ return CommonCost +
+ processBuildVector<ShuffleCostEstimator, InstructionCost>(
+ E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
}
- InstructionCost CommonCost = 0;
SmallVector<int> Mask;
bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
if (!E->ReorderIndices.empty() &&
@@ -10241,6 +10287,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
Cost += C;
+
+ // Calculate the cost difference of propagating a vector vs series of scalars
+ // across blocks. This may be nonzero in the case of illegal vectors.
+ Instruction *VL0 = TE.getMainOp();
+ bool IsAPhi = VL0 && isa<PHINode>(VL0);
+ bool HasNextEntry = VL0 && ((I + 1) < VectorizableTree.size());
+ bool LiveThru = false;
+ if (HasNextEntry) {
+ Instruction *VL1 = VectorizableTree[I + 1]->getMainOp();
+ LiveThru = VL1 && (VL0->getParent() != VL1->getParent());
+ }
+ if (IsAPhi || LiveThru) {
+ VectorType *VTy = dyn_cast<VectorType>(VL0->getType());
+ Type *ScalarTy = VTy ? VTy->getElementType() : VL0->getType();
+ if (ScalarTy && isValidElementType(ScalarTy)) {
+ InstructionCost ScalarDFlow =
+ TTI->getDataFlowCost(ScalarTy,
+ /*IsCallingConv*/ false) *
+ TE.getVectorFactor();
+ InstructionCost VectorDFlow =
+ TTI->getDataFlowCost(FixedVectorType::get(ScalarTy, TE.getVectorFactor()), /*IsCallingConv*/ false);
+ Cost += (VectorDFlow - ScalarDFlow);
+ }
+ }
+
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
<< shortBundleName(TE.Scalars) << ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
@@ -10257,8 +10328,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
- !ExtractCostCalculated.insert(EU.Scalar).second)
+ !ExtractCostCalculated.insert(EU.Scalar).second) {
continue;
+ }
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
@@ -10266,6 +10338,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (EphValues.count(EU.User))
continue;
+ // Account for any additional costs required by CallingConvention for the
+ // type.
+ if (isa_and_nonnull<ReturnInst>(EU.User)) {
+ Cost +=
+ TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true);
+ continue;
+ }
+
// No extract cost for vector "scalar"
if (isa<FixedVectorType>(EU.Scalar->getType()))
continue;
@@ -10566,7 +10646,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
return Cost;
}
|
See #113002 for a usecase |
This is not NFC |
Fixed |
// type. | ||
if (isa_and_nonnull<ReturnInst>(EU.User)) { | ||
Cost += | ||
TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true); | |
TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv=*/ true); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to add it here, you can check it in getVectorInstrCost, just need to pass an extra info about the user instruction, if needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The case we care about is vector return types. These will be skipped by the early continue check and won't fall to the getVectorInstrCost
. It seems cleaner to just add getDataFlowCost
rather than removing the continue check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code does not check for vector return type, it is externally used scalar. What exactly do you want to calculate here: the cost of passing value, returned by extractelement instruction, or the cost of vectorized buildvector (replaced by shuffles or just original vectors in case of the identity shuffle)?
// type. | ||
if (isa_and_nonnull<ReturnInst>(EU.User)) { | ||
Cost += | ||
TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to add it here, you can check it in getVectorInstrCost, just need to pass an extra info about the user instruction, if needed
All these changes require tests |
Change-Id: I5c7ee6604012880bd96d137c69d3d8f6fb6ff1f8
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs tests
if (isa<FixedVectorType>(EU.Scalar->getType())) { | ||
// Account for any additional costs required by CallingConvention for the | ||
// type. | ||
if (isa_and_nonnull<ReturnInst>(EU.User)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Outgoing call arguments too. But this is just the type legality cost?
Co-authored-by: Matt Arsenault <[email protected]>
// scalars across blocks. This may be nonzero in the case of illegal | ||
// vectors. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment talks about illegal vector types but the code affects only legal vector types
ScalarCost += TTI->getDataFlowCost(ScalarTy, | ||
/*IsCallingConv=*/false) * |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it account register pressure or something else?
|
||
// Calculate the cost difference of propagating a vector vs series of scalars | ||
// across blocks. This may be nonzero in the case of illegal vectors. | ||
Instruction *VL0 = TE.getMainOp(); | ||
if (VL0 && ((I + 1) < VectorizableTree.size())) { | ||
Instruction *VL1 = VectorizableTree[I + 1]->getMainOp(); | ||
if (VL1 && (VL0->getParent() != VL1->getParent())) { | ||
Type *ScalarTy = VL0->getType()->getScalarType(); | ||
if (ScalarTy && isValidElementType(ScalarTy)) { | ||
InstructionCost ScalarDFlow = | ||
TTI->getDataFlowCost(ScalarTy, | ||
/*IsCallingConv=*/false) * | ||
TE.getVectorFactor(); | ||
InstructionCost VectorDFlow = TTI->getDataFlowCost( | ||
FixedVectorType::get(ScalarTy, TE.getVectorFactor()), | ||
/*IsCallingConv=*/false); | ||
Cost += (VectorDFlow - ScalarDFlow); | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- It should not be here, it should be implemented in getEntryCost.
vectorizableTree[I + 1]
does not always point to the operand of the previous node
// type. | ||
if (isa_and_nonnull<ReturnInst>(EU.User)) { | ||
Cost += | ||
TTI->getDataFlowCost(EU.Scalar->getType(), /*IsCallingConv*/ true); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code does not check for vector return type, it is externally used scalar. What exactly do you want to calculate here: the cost of passing value, returned by extractelement instruction, or the cost of vectorized buildvector (replaced by shuffles or just original vectors in case of the identity shuffle)?
This adds getDataFlowCost to the cost model. For certain vector types (e.g. vectors of illegal types), there may be costs which are not currently captured by the cost model. For example, selectionDAGBuilder will likely scalarize vectors of illegal types that cross basic block boundaries. Similar scalarization may occur when handling illegal vector arguments or return values. This scalarization is ultimately a cost of vectorization, and it should be accounted for. That said, for legal types, this type of legalization scalarization will not occur. Moreover, when it does occur, the scalarization cost is the same as the cost of the scalarized version. However, AMDGPU has code in place to reduce this type of scalarization; thus the target override.