[SLP][TTI][X86]Add addsub pattern cost estimation. #76461
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-x86 Author: Alexey Bataev (alexey-bataev) Changes: SLP/TTI do not know about the cost estimation for the addsub pattern supported by X86. Support for pattern detection was added previously (see TTI::isLegalAltInstr), but the cost was still not estimated properly. Full diff: https://github.com/llvm/llvm-project/pull/76461.diff 8 Files Affected:
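
Before the per-file diff, a minimal sketch of how a cost-model client could query the new hook. Only the getAltInstrCost signature comes from this patch; the helper name queryAddSubCost and the concrete <2 x double> fsub/fadd pattern are illustrative assumptions.

#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// Illustrative helper (not part of the patch): price a <2 x double>
// [fsub, fadd] alternating pattern, which X86 can lower to a single
// ADDSUBPD when legal.
InstructionCost queryAddSubCost(const TargetTransformInfo &TTI,
                                LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getDoubleTy(Ctx), 2);
  // One bit per lane: `0` selects Opcode0 (fsub), `1` selects Opcode1 (fadd).
  SmallBitVector OpcodeMask(2, false);
  OpcodeMask.set(1);
  // An invalid result means the target has no single instruction for this
  // alternating pattern.
  return TTI.getAltInstrCost(VecTy, Instruction::FSub, Instruction::FAdd,
                             OpcodeMask,
                             TargetTransformInfo::TCK_RecipThroughput);
}
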
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 735be3680aea0d..079e848a0ab51c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1243,6 +1243,18 @@ class TargetTransformInfo {
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr) const;
+ /// Returns the cost estimation for an alternating opcode pattern that can be
+ /// lowered to a single instruction on the target. On X86 this is the addsub
+ /// instruction, which corresponds to a Shuffle + FAdd + FSub pattern in IR.
+ /// This function expects two opcodes, \p Opcode0 and \p Opcode1, selected by
+ /// \p OpcodeMask. The mask contains one bit per lane and is a `0` when
+ /// \p Opcode0 is selected and a `1` when \p Opcode1 is selected.
+ /// \p VecTy is the vector type of the instruction to be generated.
+ InstructionCost getAltInstrCost(
+ VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
/// The exact mask may be passed as Mask, or else the array will be empty.
/// The index and subtype parameters are used by the subvector insertion and
@@ -1944,6 +1956,10 @@ class TargetTransformInfo::Concept {
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0;
+ virtual InstructionCost getAltInstrCost(
+ VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0;
virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
@@ -2555,6 +2571,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
Args, CxtI);
}
+ InstructionCost
+ getAltInstrCost(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const override {
+ return Impl.getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
+ }
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 1d8f523e9792ba..7ad3ce512a3552 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -554,6 +554,13 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const {
+ return InstructionCost::getInvalid();
+ }
+
InstructionCost
getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3f76dfdaac317c..67246afa23147a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -862,6 +862,15 @@ InstructionCost TargetTransformInfo::getArithmeticInstrCost(
return Cost;
}
+InstructionCost TargetTransformInfo::getAltInstrCost(
+ VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+ const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const {
+ InstructionCost Cost =
+ TTIImpl->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost TargetTransformInfo::getShuffleCost(
ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8a04987e768a12..e09dc7ff02a070 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1459,6 +1459,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
Args, CxtI);
}
+InstructionCost
+X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1, const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const {
+ if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
+ return TTI::TCC_Basic;
+ return InstructionCost::getInvalid();
+}
+
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 0fa0d240a548b9..07a3fff4f84b3e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -140,6 +140,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
+ InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
+ unsigned Opcode1,
+ const SmallBitVector &OpcodeMask,
+ TTI::TargetCostKind CostKind) const;
+
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5c325ad8a291a2..0213c3d5c65243 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8428,6 +8428,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Mask);
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
FinalVecTy, Mask);
+ // Patterns like [fadd,fsub] can be combined into a single instruction
+ // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
+ // need to take into account their order when looking for the most used
+ // order.
+ unsigned Opcode0 = E->getOpcode();
+ unsigned Opcode1 = E->getAltOpcode();
+ // The opcode mask selects between the two opcodes.
+ SmallBitVector OpcodeMask(E->Scalars.size(), false);
+ for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
+ if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
+ OpcodeMask.set(Lane);
+ // If this pattern is supported by the target then we consider the
+ // order.
+ if (TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
+ InstructionCost AltVecCost =
+ TTI->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
+ return AltVecCost < VecCost ? AltVecCost : VecCost;
+ }
+ // TODO: Check the reverse order too.
return VecCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
index d4c71285a93abf..87063fc3f7a820 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -103,21 +103,23 @@ define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, pt
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT: entry:
; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
-; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
-; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, ptr [[SARRAY:%.*]], i64 1
; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
-; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
-; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
-; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
-; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
-; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
-; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
-; ENABLED-NEXT: store double [[SUB0]], ptr [[SARRAY]], align 8
-; ENABLED-NEXT: store double [[ADD1]], ptr [[IDXS1]], align 8
+; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
+; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
+; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
+; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]]
+; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]]
+; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
+; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1
+; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
+; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
+; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
index aa3c2be7dc9c26..17f9f371ff6ef9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -12,22 +12,24 @@ define void @foo() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ]
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]]
; CHECK: bb4:
; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double
-; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]]
-; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double poison, double poison, double undef, double undef>, double [[SUB1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[ADD1]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float>
-; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float>
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
-; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
+; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
; CHECK-NEXT: br label [[BB2]]
;
entry:
Currently we emit gathers for scalars being vectorized in the tree as a pair of extractelement/insertelement instructions. Instead we can try to find all required vectors and emit shuffle vector instructions directly, improving the code and reducing compile time. Part of non-power-of-2 vectorization. Differential Revision: https://reviews.llvm.org/D110978
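
A rough sketch of that change in IRBuilder terms (an illustration under assumed types, not the patch's actual code; the helper name gatherTwoLanes is hypothetical): gathering lane 0 of one source and lane 1 of another used to take an extractelement/insertelement pair per lane, whereas a single shufflevector covers both.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Hypothetical helper: combine lane 0 of A with lane 1 of B (both vectors of
// the same type). The extract/insert form is built only for comparison; the
// shuffle form is what gets returned.
Value *gatherTwoLanes(IRBuilder<> &Builder, Value *A, Value *B) {
  // Old style: one extractelement/insertelement pair per gathered lane.
  Value *Gather = PoisonValue::get(A->getType());
  Gather = Builder.CreateInsertElement(
      Gather, Builder.CreateExtractElement(A, uint64_t(0)), uint64_t(0));
  Gather = Builder.CreateInsertElement(
      Gather, Builder.CreateExtractElement(B, uint64_t(1)), uint64_t(1));
  (void)Gather;
  // New style: a single shufflevector; mask lanes {0, 3} select A[0] and B[1].
  return Builder.CreateShuffleVector(A, B, {0, 3});
}
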
✅ With the latest revision this PR passed the C/C++ code formatter.
SLP/TTI do not know about the cost estimation for the addsub pattern supported by X86. Support for pattern detection was added previously (see TTI::isLegalAltInstr), but the cost was still not estimated properly.
Force-pushed from f83f6ee to a8c171c.
This change is causing a build failure on Windows bots:
This reverts commit bc8c4bb. Change is failing to build on several bots: - https://lab.llvm.org/buildbot/#/builders/127/builds/60184 - https://lab.llvm.org/buildbot/#/builders/123/builds/23709 - https://lab.llvm.org/buildbot/#/builders/216/builds/32302