Skip to content

[AArch64] Improve cost model for legal subvec insert/extract #81135

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,11 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}

static bool isUnpackedVectorVT(EVT VecVT) {
return VecVT.isScalableVector() &&
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
Expand Down Expand Up @@ -568,6 +573,39 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
return Cost;
}
case Intrinsic::vector_extract:
case Intrinsic::vector_insert: {
// If both the vector and subvector types are legal types and the index
// is 0, then this should be a no-op or simple operation; return a
// relatively low cost.

// If arguments aren't actually supplied, then we cannot determine the
// value of the index. We also want to skip predicate types.
if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
ICA.getReturnType()->getScalarType()->isIntegerTy(1))
break;

LLVMContext &C = RetTy->getContext();
EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
: getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
// Skip this if either the vector or subvector types are unpacked
// SVE types; they may get lowered to stack stores and loads.
if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
break;

TargetLoweringBase::LegalizeKind SubVecLK =
getTLI()->getTypeConversion(C, SubVecVT);
TargetLoweringBase::LegalizeKind VecLK =
getTLI()->getTypeConversion(C, VecVT);
const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
const ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx);
Copy link
Collaborator

@paulwalker-arm paulwalker-arm Mar 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be just cast<> and the null check removed? I ask because the LangRef says the index operand must be a constant integer.

if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
VecLK.first == TargetLoweringBase::TypeLegal && CIdx && CIdx->isZero())
return TTI::TCC_Free;
break;
}
case Intrinsic::bitreverse: {
static const CostTblEntry BitreverseTbl[] = {
{Intrinsic::bitreverse, MVT::i32, 1},
Expand Down
36 changes: 32 additions & 4 deletions llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,15 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x

define void @vector_insert_extract_idxzero_128b() #1 {
; CHECK-LABEL: 'vector_insert_extract_idxzero_128b'
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_128b'
Expand All @@ -45,27 +49,43 @@ define void @vector_insert_extract_idxzero_128b() #1 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps not something to address in this patch, but I think it's wrong for this case to return an invalid cost, even when it cannot deduce anything for the operands. At worst it's a high cost for using spills and fills. But invalid cost basically says that we can't lower the general case, which isn't true.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That comes from the generic cost model code, which might just give up for scalable vectors. I guess we need to override the cost for more variants. But yes, in a later patch.

; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
%extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
%extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
%insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
%extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
%insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
%extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
ret void
}
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
declare <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double>, i64)
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1>, <vscale x 2 x i1>, i64)
declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
declare <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float>, <2 x float>, i64)
declare <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half>, i64)
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float>, i64)

define void @vector_insert_extract_idxzero_256b() #2 {
; CHECK-LABEL: 'vector_insert_extract_idxzero_256b'
; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_256b'
Expand All @@ -74,13 +94,21 @@ define void @vector_insert_extract_idxzero_256b() #2 {
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
%extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nx4f32(<vscale x 4 x float> undef, i64 0)
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
%extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
%insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
%extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
%insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
%extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
ret void
}
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64)
Expand Down
32 changes: 18 additions & 14 deletions llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,30 @@ define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c
; CHECK-LABEL: define void @test_ins_ext_cost(
; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
; CHECK-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
; CHECK-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
; CHECK-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[A]], align 16
; CHECK-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[B]], align 16
; CHECK-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[C]], align 16
; CHECK-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
; CHECK-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
; CHECK-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
; CHECK-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
; CHECK-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
; CHECK-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
; CHECK-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
; CHECK-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[D]], align 16
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 1
; CHECK-NEXT: [[LOAD_A_1:%.*]] = load <8 x float>, ptr [[GEP_A_1]], align 16
; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 1
; CHECK-NEXT: [[LOAD_B_1:%.*]] = load <8 x float>, ptr [[GEP_B_1]], align 16
; CHECK-NEXT: [[GEP_C_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 1
; CHECK-NEXT: [[LOAD_C_1:%.*]] = load <8 x float>, ptr [[GEP_C_1]], align 16
; CHECK-NEXT: [[CAST_SCALABLE_B_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B_1]], i64 0)
; CHECK-NEXT: [[CAST_SCALABLE_C_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C_1]], i64 0)
; CHECK-NEXT: [[ADD_1:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B_1]], [[CAST_SCALABLE_C_1]]
; CHECK-NEXT: [[CAST_SCALABLE_A_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A_1]], i64 0)
; CHECK-NEXT: [[MUL_1:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A_1]], [[ADD_1]]
; CHECK-NEXT: [[CAST_FIXED_D_1:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL_1]], i64 0)
; CHECK-NEXT: [[GEP_D_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 1
; CHECK-NEXT: store <8 x float> [[CAST_FIXED_D_1]], ptr [[GEP_D_1]], align 16
; CHECK-NEXT: ret void
;
entry:
Expand Down