[CostModel] Handle vector struct results and cost llvm.sincos
#123210
File: llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/TargetTransformInfoImpl.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -285,6 +286,64 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return false;
   }

+  /// Several intrinsics that return structs (including llvm.sincos[pi] and
+  /// llvm.modf) can be lowered to a vector library call (for certain VFs). The
+  /// vector library functions correspond to the scalar calls (e.g. sincos or
+  /// modf), which, unlike the intrinsic, return values via output pointers.
+  /// This helper checks if a vector call exists for the given intrinsic, and
+  /// returns the cost, which includes the cost of the mask (if required), and
+  /// the loads for values returned via output pointers. \p LC is the scalar
+  /// libcall and \p CallRetElementIndex (optional) is the struct element which
+  /// is mapped to the call return value. If std::nullopt is returned, then no
+  /// vector library call is available, so the intrinsic should be assigned the
+  /// default cost (e.g. scalarization).
+  std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost(
+      const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind,
+      RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) {
+    Type *RetTy = ICA.getReturnType();
+    // Vector variants of the intrinsic can be mapped to a vector library call.
+    auto const *LibInfo = ICA.getLibInfo();
+    if (!LibInfo || !isa<StructType>(RetTy) ||
+        !isVectorizedStructTy(cast<StructType>(RetTy)))
+      return std::nullopt;
+
+    // Find associated libcall.
+    const char *LCName = getTLI()->getLibcallName(LC);
+    if (!LCName)
+      return std::nullopt;
+
+    // Search for a corresponding vector variant.
+    LLVMContext &Ctx = RetTy->getContext();
+    ElementCount VF = getVectorizedTypeVF(RetTy);
+    VecDesc const *VD = nullptr;
+    for (bool Masked : {false, true}) {
+      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+        break;
+    }
+    if (!VD)
+      return std::nullopt;
+
+    // Cost the call + mask.
+    auto Cost =
+        thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
+    if (VD->isMasked())
+      Cost += thisT()->getShuffleCost(
+          TargetTransformInfo::SK_Broadcast,
+          VectorType::get(IntegerType::getInt1Ty(Ctx), VF), {}, CostKind, 0,
+          nullptr, {});
+
+    // Lowering to a library call (with output pointers) may require us to emit
+    // reloads for the results.

Review thread on the reload-cost lines above:

Reviewer: Always adding on the cost of a load seems pessimistic?

Author: It is slightly pessimistic, but the cost is still reasonably low at 12 or 13 (rather than 10 for a plain libcall), so the vectorizer still chooses to widen the intrinsic. It also means that if libraries add structure-returning variants they'll be preferred, as they'll have a slightly lower cost. (A rough breakdown of the 12/13 figures is given after this hunk.)
+    for (auto [Idx, VectorTy] : enumerate(getContainedTypes(RetTy))) {
+      if (Idx == CallRetElementIndex)
+        continue;
+      Cost += thisT()->getMemoryOpCost(
+          Instruction::Load, VectorTy,
+          thisT()->getDataLayout().getABITypeAlign(VectorTy), 0, CostKind);
+    }
+    return Cost;
+  }
+
 protected:
   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
       : BaseT(DL) {}
@@ -1726,9 +1785,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {

   Type *RetTy = ICA.getReturnType();

-  ElementCount RetVF =
-      (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
-                           : ElementCount::getFixed(1));
+  ElementCount RetVF = isVectorizedTy(RetTy) ? getVectorizedTypeVF(RetTy)
+                                             : ElementCount::getFixed(1);

   const IntrinsicInst *I = ICA.getInst();
   const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
   FastMathFlags FMF = ICA.getFlags();
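For readers unfamiliar with the struct-of-vectors helpers this patch relies on (isVectorizedTy, getVectorizedTypeVF, getContainedTypes, toScalarizedTy), here is a minimal sketch of what they report for the widened sincos return type. The header path is an assumption (the helpers come from the earlier vector-struct patches); the behaviour shown matches how this diff uses them:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/VectorTypeUtils.h" // assumed location of the helpers below

using namespace llvm;

// Sketch: a struct whose elements are all vectors with the same element count
// (e.g. the result of a widened llvm.sincos.v4f32) is treated as "vectorized"
// with a single VF.
static void describeSincosRetTy(LLVMContext &Ctx) {
  Type *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  StructType *RetTy = StructType::get(Ctx, {V4F32, V4F32});

  bool IsWide = isVectorizedTy(RetTy);               // true: all elements share one VF
  ElementCount VF = getVectorizedTypeVF(RetTy);      // ElementCount::getFixed(4)
  ArrayRef<Type *> Elts = getContainedTypes(RetTy);  // { <4 x float>, <4 x float> }
  Type *Scalarized = toScalarizedTy(RetTy);          // { float, float }
  (void)IsWide; (void)VF; (void)Elts; (void)Scalarized;
}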
@@ -1997,6 +2056,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }
   case Intrinsic::experimental_vector_match:
     return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+  case Intrinsic::sincos: {
+    Type *Ty = getContainedTypes(RetTy).front();
+    EVT VT = getTLI()->getValueType(DL, Ty);
+    RTLIB::Libcall LC = RTLIB::getSINCOS(VT.getScalarType());
+    if (auto Cost =
+            getMultipleResultIntrinsicVectorLibCallCost(ICA, CostKind, LC))
+      return *Cost;
+    // Otherwise, fallback to default scalarization cost.
+    break;
+  }
   }

   // Assume that we need to scalarize this intrinsic.
@@ -2005,10 +2074,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
   if (RetVF.isVector() && !RetVF.isScalable()) {
     ScalarizationCost = 0;
-    if (!RetTy->isVoidTy())
-      ScalarizationCost += getScalarizationOverhead(
-          cast<VectorType>(RetTy),
-          /*Insert*/ true, /*Extract*/ false, CostKind);
+    if (!RetTy->isVoidTy()) {
+      for (Type *VectorTy : getContainedTypes(RetTy)) {
+        ScalarizationCost += getScalarizationOverhead(
+            cast<VectorType>(VectorTy),
+            /*Insert=*/true, /*Extract=*/false, CostKind);
+      }
+    }
     ScalarizationCost +=
         getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
   }
@@ -2689,27 +2761,32 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   // Else, assume that we need to scalarize this intrinsic. For math builtins
   // this will emit a costly libcall, adding call overhead and spills. Make it
   // very expensive.
-  if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
+  if (isVectorizedTy(RetTy)) {
+    ArrayRef<Type *> RetVTys = getContainedTypes(RetTy);
+
     // Scalable vectors cannot be scalarized, so return Invalid.
-    if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
-          return isa<ScalableVectorType>(Ty);
-        }))
+    if (any_of(concat<Type *const>(RetVTys, Tys),
+               [](Type *Ty) { return isa<ScalableVectorType>(Ty); }))
       return InstructionCost::getInvalid();

-    InstructionCost ScalarizationCost =
-        SkipScalarizationCost
-            ? ScalarizationCostPassed
-            : getScalarizationOverhead(RetVTy, /*Insert*/ true,
-                                       /*Extract*/ false, CostKind);
+    InstructionCost ScalarizationCost = ScalarizationCostPassed;
+    if (!SkipScalarizationCost) {
+      ScalarizationCost = 0;
+      for (Type *RetVTy : RetVTys) {
+        ScalarizationCost += getScalarizationOverhead(
+            cast<VectorType>(RetVTy), /*Insert=*/true,
+            /*Extract=*/false, CostKind);
+      }
+    }

-    unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
+    unsigned ScalarCalls = getVectorizedTypeVF(RetTy).getFixedValue();
     SmallVector<Type *, 4> ScalarTys;
     for (Type *Ty : Tys) {
       if (Ty->isVectorTy())
         Ty = Ty->getScalarType();
       ScalarTys.push_back(Ty);
     }
-    IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
+    IntrinsicCostAttributes Attrs(IID, toScalarizedTy(RetTy), ScalarTys, FMF);
     InstructionCost ScalarCost =
         thisT()->getIntrinsicInstrCost(Attrs, CostKind);
     for (Type *Ty : Tys) {
File: llvm/lib/Analysis/CostModel.cpp
@@ -17,13 +17,15 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/Analysis/CostModel.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"

 using namespace llvm;

 static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
@@ -42,25 +44,31 @@ static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
     cl::desc("Calculate intrinsics cost based only on argument types"),
     cl::init(false));

+static cl::opt<bool> PreferIntrinsicCost(
+    "prefer-intrinsic-cost",
+    cl::desc("Prefer using getIntrinsicInstrCost over getInstructionCost"),
+    cl::init(false));
+
 #define CM_NAME "cost-model"
 #define DEBUG_TYPE CM_NAME

 PreservedAnalyses CostModelPrinterPass::run(Function &F,
                                             FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   OS << "Printing analysis 'Cost Model Analysis' for function '" << F.getName() << "':\n";
   for (BasicBlock &B : F) {
     for (Instruction &Inst : B) {
       // TODO: Use a pass parameter instead of cl::opt CostKind to determine
       // which cost kind to print.
       InstructionCost Cost;
       auto *II = dyn_cast<IntrinsicInst>(&Inst);
-      if (II && TypeBasedIntrinsicCost) {
-        IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
-                                    InstructionCost::getInvalid(), true);
+      if (II && (PreferIntrinsicCost || TypeBasedIntrinsicCost)) {
+        IntrinsicCostAttributes ICA(
+            II->getIntrinsicID(), *II, InstructionCost::getInvalid(),
+            /*TypeBasedOnly=*/TypeBasedIntrinsicCost, &TLI);
         Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
-      }
-      else {
+      } else {
         Cost = TTI.getInstructionCost(&Inst, CostKind);
       }

Review thread on lines +67 to +69 (the new IntrinsicCostAttributes construction); some inline code references were lost in this view and are marked with "...":

Reviewer: You don't know if this is a libcall or not here. I also don't understand why this is guarded by TypeBasedIntrinsicCost. Can we remove that one too?

Author: I only use the ...

Reviewer: I must admit though it does seem a little odd now because ... This is just a suggestion, but if we had a ... that way you can collapse the two options into one, and both ...

Author: I've renamed the flag ...

Reviewer: We should at least have a firm plan on removing the flag or setting it to true, so users benefit from the new code by default. It may not be an issue for AArch64, but people building for AArch64 also won't get any benefit unless they know to set this flag, and there will be very little coverage of the code on larger projects.

Author: Just to be clear, this flag only affects the cost-model print pass used in tests (so users = LLVM developers), not LLVM more generally. The loop vectorizer in #128035 always passes the TLI.

Author: Changing the default would not be much trouble though -- it just requires adding a flag to a few tests.

Reviewer: The problem I see with changing the default ...

Author: Yep, I'm happy to address this in the follow-up PR (since there will be a little test churn in that patch anyway).

Reviewer: Ok, that's good, thanks. I think ideally we would get rid of the option as a follow-up, as it seems confusing to have the cost-model printer show something that's different to the cost that is actually used in the passes making the queries.
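For context on how the new flag is exercised, the invocation below mirrors the RUN lines in the new test further down ("input.ll" is just a placeholder file name). It prints the veclib-aware intrinsic costs:

    opt -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL \
        -passes="print<cost-model>" -prefer-intrinsic-cost -cost-kind=throughput \
        -disable-output input.ll 2>&1

Without -prefer-intrinsic-cost (or -type-based-intrinsic-cost) the printer keeps using TTI.getInstructionCost, so intrinsics are reported with the default scalarization-based costs.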
New file: AArch64 cost-model test for llvm.sincos
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos"
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -prefer-intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB
+
+define void @sincos() {
+; CHECK-LABEL: 'sincos'
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+;
+; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+;
+; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
+; CHECK-VECLIB-LABEL: 'sincos'
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+;
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+;
+; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+;
+  %f16 = call { half, half } @llvm.sincos.f16(half poison)
+  %f32 = call { float, float } @llvm.sincos.f32(float poison)
+  %f64 = call { double, double } @llvm.sincos.f64(double poison)
+  %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison)
+
+  %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison)
+  %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison)
+  %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison)
+  %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison)
+  %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison)
+
+  %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison)
+  %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison)
+  %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison)
+  %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison)
+  %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison)
+
+  ret void
+}
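If the CHECK lines need regenerating after further cost changes, the NOTE line above records the update_analyze_test_checks.py invocation; something along these lines should reproduce it (the test's path is not shown in this view, so the file name below is a placeholder):

    llvm/utils/update_analyze_test_checks.py --filter "sincos" path/to/sincos-test.ll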