-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AArch64] Improve cost model for legal subvec insert/extract #81135
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a3030ee
b5b9931
37b3bbc
53cac36
69f866f
595faf8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -500,6 +500,11 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { | |
return TTI::PSK_Software; | ||
} | ||
|
||
static bool isUnpackedVectorVT(EVT VecVT) { | ||
return VecVT.isScalableVector() && | ||
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; | ||
} | ||
|
||
InstructionCost | ||
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | ||
TTI::TargetCostKind CostKind) { | ||
|
@@ -568,6 +573,39 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |
} | ||
return Cost; | ||
} | ||
case Intrinsic::vector_extract: | ||
case Intrinsic::vector_insert: { | ||
// If both the vector and subvector types are legal types and the index | ||
// is 0, then this should be a no-op or simple operation; return a | ||
// relatively low cost. | ||
|
||
// If arguments aren't actually supplied, then we cannot determine the | ||
// value of the index. We also want to skip predicate types. | ||
if (ICA.getArgs().size() != ICA.getArgTypes().size() || | ||
ICA.getReturnType()->getScalarType()->isIntegerTy(1)) | ||
break; | ||
|
||
LLVMContext &C = RetTy->getContext(); | ||
EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]); | ||
bool IsExtract = ICA.getID() == Intrinsic::vector_extract; | ||
EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy) | ||
: getTLI()->getValueType(DL, ICA.getArgTypes()[1]); | ||
// Skip this if either the vector or subvector types are unpacked | ||
// SVE types; they may get lowered to stack stores and loads. | ||
if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT)) | ||
break; | ||
|
||
TargetLoweringBase::LegalizeKind SubVecLK = | ||
getTLI()->getTypeConversion(C, SubVecVT); | ||
TargetLoweringBase::LegalizeKind VecLK = | ||
getTLI()->getTypeConversion(C, VecVT); | ||
const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; | ||
const ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this be just |
||
if (SubVecLK.first == TargetLoweringBase::TypeLegal && | ||
VecLK.first == TargetLoweringBase::TypeLegal && CIdx && CIdx->isZero()) | ||
return TTI::TCC_Free; | ||
break; | ||
} | ||
case Intrinsic::bitreverse: { | ||
static const CostTblEntry BitreverseTbl[] = { | ||
{Intrinsic::bitreverse, MVT::i32, 1}, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,11 +32,15 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x | |
|
||
define void @vector_insert_extract_idxzero_128b() #1 { | ||
; CHECK-LABEL: 'vector_insert_extract_idxzero_128b' | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void | ||
; | ||
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_128b' | ||
|
@@ -45,27 +49,43 @@ define void @vector_insert_extract_idxzero_128b() #1 { | |
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps not something to address in this patch, but I think it's wrong for this case to return an invalid cost, even when it cannot deduce anything for the operands. At worst it's a high cost for using spills and fills. But invalid cost basically says that we can't lower the general case, which isn't true. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That comes from the generic cost model code, which might just give up for scalable vectors. I guess we need to override the cost for more variants. But yes, in a later patch. |
||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void | ||
; | ||
%insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0) | ||
%extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0) | ||
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) | ||
%extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0) | ||
%insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0) | ||
%extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0) | ||
%insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0) | ||
%extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0) | ||
ret void | ||
} | ||
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64) | ||
declare <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double>, i64) | ||
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1>, <vscale x 2 x i1>, i64) | ||
declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64) | ||
declare <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float>, <2 x float>, i64) | ||
declare <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half>, i64) | ||
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64) | ||
declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float>, i64) | ||
|
||
define void @vector_insert_extract_idxzero_256b() #2 { | ||
; CHECK-LABEL: 'vector_insert_extract_idxzero_256b' | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0) | ||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void | ||
; | ||
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_256b' | ||
|
@@ -74,13 +94,21 @@ define void @vector_insert_extract_idxzero_256b() #2 { | |
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0) | ||
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void | ||
; | ||
%insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0) | ||
%extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nx4f32(<vscale x 4 x float> undef, i64 0) | ||
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0) | ||
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0) | ||
%extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0) | ||
%insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0) | ||
%extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0) | ||
%insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0) | ||
%extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0) | ||
ret void | ||
} | ||
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64) | ||
|
Uh oh!
There was an error while loading. Please reload this page.