
[SLP][TTI][X86]Add addsub pattern cost estimation. #76461


Merged · 1 commit · Dec 27, 2023
22 changes: 22 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1243,6 +1243,18 @@ class TargetTransformInfo {
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr) const;

/// Returns the cost estimation for an alternating opcode pattern that can be
/// lowered to a single instruction on the target. On X86 this is the addsub
/// instruction, which corresponds to a Shuffle + FAdd + FSub pattern in IR.
/// This function expects two opcodes, \p Opcode0 and \p Opcode1, selected by
/// \p OpcodeMask. The mask contains one bit per lane: a `0` bit selects
/// \p Opcode0 and a `1` bit selects \p Opcode1.
/// \p VecTy is the vector type of the instruction to be generated.
InstructionCost getAltInstrCost(
VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

/// \return The cost of a shuffle instruction of kind Kind and of type Tp.
/// The exact mask may be passed as Mask, or else the array will be empty.
/// The index and subtype parameters are used by the subvector insertion and
@@ -1944,6 +1956,10 @@ class TargetTransformInfo::Concept {
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0;
virtual InstructionCost getAltInstrCost(
VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0;

virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
@@ -2555,6 +2571,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
Args, CxtI);
}
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
unsigned Opcode1,
const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind) const override {
return Impl.getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
}

InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
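The doxygen comment above describes the per-lane mask convention. Below is a minimal sketch of how a client might query the new hook; the helper name, the FAdd/FSub opcode choice, and the explicit cost kind are illustrative assumptions, not part of this patch.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: build the per-lane OpcodeMask described above and ask
// the target for the cost of the alternating pattern. A clear bit selects
// Opcode0 (here FAdd), a set bit selects Opcode1 (here FSub).
static InstructionCost queryAddSubCost(const TargetTransformInfo &TTI,
                                       VectorType *VecTy,
                                       ArrayRef<unsigned> LaneOpcodes) {
  const unsigned Opcode0 = Instruction::FAdd;
  const unsigned Opcode1 = Instruction::FSub;
  SmallBitVector OpcodeMask(LaneOpcodes.size(), false);
  for (unsigned Lane = 0; Lane != LaneOpcodes.size(); ++Lane)
    if (LaneOpcodes[Lane] == Opcode1)
      OpcodeMask.set(Lane);
  // Targets that do not override the hook fall back to the base
  // implementation below, which returns InstructionCost::getInvalid().
  return TTI.getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask,
                             TargetTransformInfo::TCK_RecipThroughput);
}
```

The SLP change further down avoids the invalid-cost case by first asking isLegalAltInstr whether the target supports the pattern at all.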
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -554,6 +554,13 @@ class TargetTransformInfoImplBase {
return 1;
}

InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
unsigned Opcode1,
const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind) const {
return InstructionCost::getInvalid();
}

InstructionCost
getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
9 changes: 9 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -862,6 +862,15 @@ InstructionCost TargetTransformInfo::getArithmeticInstrCost(
return Cost;
}

InstructionCost TargetTransformInfo::getAltInstrCost(
VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const {
InstructionCost Cost =
TTIImpl->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}

InstructionCost TargetTransformInfo::getShuffleCost(
ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
9 changes: 9 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1459,6 +1459,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
Args, CxtI);
}

InstructionCost
X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
unsigned Opcode1, const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind) const {
if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
return TTI::TCC_Basic;
return InstructionCost::getInvalid();
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask,
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -140,6 +140,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
unsigned Opcode1,
const SmallBitVector &OpcodeMask,
TTI::TargetCostKind CostKind) const;

InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
19 changes: 19 additions & 0 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8428,6 +8428,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
Mask);
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
FinalVecTy, Mask);
// Patterns like [fadd,fsub] can be combined into a single instruction
// on x86. Reordering them into [fsub,fadd] blocks this pattern, so we
// need to take their order into account when looking for the most used
// order.
unsigned Opcode0 = E->getOpcode();
unsigned Opcode1 = E->getAltOpcode();
// The opcode mask selects between the two opcodes.
SmallBitVector OpcodeMask(E->Scalars.size(), false);
for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
OpcodeMask.set(Lane);
// If this pattern is supported by the target then we consider the
// order.
if (TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
InstructionCost AltVecCost =
TTI->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
return AltVecCost < VecCost ? AltVecCost : VecCost;
}
// TODO: Check the reverse order too.
return VecCost;
};
return GetCostDiff(GetScalarCost, GetVectorCost);
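To summarize the decision the hunk above adds, here is a standalone, illustrative sketch; the function name is hypothetical and the real logic lives inline in BoUpSLP::getEntryCost. When the target reports the alternating pattern as legal, the cheaper of the single-instruction cost and the generic shuffle-based vector cost wins.

```cpp
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Illustrative only: mirror of the new cost comparison. GenericVecCost is the
// cost of the default lowering (two vector ops plus a blending shuffle).
static InstructionCost pickAltVecCost(const TargetTransformInfo &TTI,
                                      VectorType *VecTy, unsigned Opcode0,
                                      unsigned Opcode1,
                                      const SmallBitVector &OpcodeMask,
                                      InstructionCost GenericVecCost) {
  // Only consult the new hook when the target can emit the pattern as a
  // single instruction (e.g. addsub on x86).
  if (!TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
    return GenericVecCost;
  InstructionCost AltVecCost = TTI.getAltInstrCost(
      VecTy, Opcode0, Opcode1, OpcodeMask,
      TargetTransformInfo::TCK_RecipThroughput);
  return AltVecCost < GenericVecCost ? AltVecCost : GenericVecCost;
}
```

As the TODO in the hunk notes, the reverse lane order is not yet checked, so only entries whose existing order already matches the mask benefit from the new cost.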
22 changes: 12 additions & 10 deletions llvm/test/Transforms/SLPVectorizer/X86/supernode.ll
@@ -103,21 +103,23 @@ define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, pt
; ENABLED-LABEL: @test_supernode_addsub_alt(
; ENABLED-NEXT: entry:
; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1
; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1
; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1
; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, ptr [[SARRAY:%.*]], i64 1
; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8
; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8
; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8
; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8
; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8
; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]]
; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]]
; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]]
; ENABLED-NEXT: store double [[SUB0]], ptr [[SARRAY]], align 8
; ENABLED-NEXT: store double [[ADD1]], ptr [[IDXS1]], align 8
; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8
; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0
; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1
; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]]
; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]]
; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1
; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8
; ENABLED-NEXT: ret void
;
entry:
20 changes: 11 additions & 9 deletions llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
@@ -12,22 +12,24 @@ define void @foo() {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ]
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8
; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]]
; CHECK: bb4:
; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double>
; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double
; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]]
; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double poison, double poison, double undef, double undef>, double [[SUB1]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[ADD1]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]]
; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float>
; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]]
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]]
; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]]
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ]
; CHECK-NEXT: br label [[BB2]]
;
entry: