Skip to content

Commit 03f852f

Browse files
authored
[AArch64] Improve cost model for legal subvec insert/extract (llvm#81135)
Currently we model subvector inserts and extracts as shuffles, potentially going as far as scalarizing. If the types are legal then they can just be simple zip/unzip operations, or possible even no-ops. Change the cost to a relatively small one to ensure that simple loops featuring such operations between fixed and scalable vector types that are effectively the same at a given sve width can be unrolled and further optimized.
1 parent 89e41e2 commit 03f852f

File tree

3 files changed

+88
-18
lines changed

3 files changed

+88
-18
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,11 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
500500
return TTI::PSK_Software;
501501
}
502502

503+
static bool isUnpackedVectorVT(EVT VecVT) {
504+
return VecVT.isScalableVector() &&
505+
VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
506+
}
507+
503508
InstructionCost
504509
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
505510
TTI::TargetCostKind CostKind) {
@@ -568,6 +573,39 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
568573
}
569574
return Cost;
570575
}
576+
case Intrinsic::vector_extract:
577+
case Intrinsic::vector_insert: {
578+
// If both the vector and subvector types are legal types and the index
579+
// is 0, then this should be a no-op or simple operation; return a
580+
// relatively low cost.
581+
582+
// If arguments aren't actually supplied, then we cannot determine the
583+
// value of the index. We also want to skip predicate types.
584+
if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
585+
ICA.getReturnType()->getScalarType()->isIntegerTy(1))
586+
break;
587+
588+
LLVMContext &C = RetTy->getContext();
589+
EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
590+
bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
591+
EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
592+
: getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
593+
// Skip this if either the vector or subvector types are unpacked
594+
// SVE types; they may get lowered to stack stores and loads.
595+
if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
596+
break;
597+
598+
TargetLoweringBase::LegalizeKind SubVecLK =
599+
getTLI()->getTypeConversion(C, SubVecVT);
600+
TargetLoweringBase::LegalizeKind VecLK =
601+
getTLI()->getTypeConversion(C, VecVT);
602+
const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
603+
const ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx);
604+
if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
605+
VecLK.first == TargetLoweringBase::TypeLegal && CIdx && CIdx->isZero())
606+
return TTI::TCC_Free;
607+
break;
608+
}
571609
case Intrinsic::bitreverse: {
572610
static const CostTblEntry BitreverseTbl[] = {
573611
{Intrinsic::bitreverse, MVT::i32, 1},

llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,15 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
3232

3333
define void @vector_insert_extract_idxzero_128b() #1 {
3434
; CHECK-LABEL: 'vector_insert_extract_idxzero_128b'
35-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
36-
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
35+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
36+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
3737
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
3838
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
3939
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
40+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
41+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
42+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
43+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
4044
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
4145
;
4246
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_128b'
@@ -45,27 +49,43 @@ define void @vector_insert_extract_idxzero_128b() #1 {
4549
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
4650
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
4751
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
52+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
53+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
54+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
55+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
4856
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
4957
;
5058
%insert_legal_fixed_into_scalable = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> undef, i64 0)
5159
%extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
5260
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
5361
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
5462
%extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
63+
%insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
64+
%extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
65+
%insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
66+
%extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
5567
ret void
5668
}
5769
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
5870
declare <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double>, i64)
5971
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1>, <vscale x 2 x i1>, i64)
6072
declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
73+
declare <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float>, <2 x float>, i64)
74+
declare <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half>, i64)
75+
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
76+
declare <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float>, i64)
6177

6278
define void @vector_insert_extract_idxzero_256b() #2 {
6379
; CHECK-LABEL: 'vector_insert_extract_idxzero_256b'
64-
; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
65-
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
80+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
81+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
6682
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
6783
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
6884
; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
85+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
86+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
87+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
88+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
6989
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
7090
;
7191
; TYPE_BASED_ONLY-LABEL: 'vector_insert_extract_idxzero_256b'
@@ -74,13 +94,21 @@ define void @vector_insert_extract_idxzero_256b() #2 {
7494
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
7595
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
7696
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
97+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
98+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
99+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
100+
; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
77101
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
78102
;
79103
%insert_legal_fixed_into_scalable = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> undef, i64 0)
80104
%extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nx4f32(<vscale x 4 x float> undef, i64 0)
81105
%insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
82106
%extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
83107
%extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
108+
%insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
109+
%extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
110+
%insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
111+
%extract_nxv4f32_nxv8f32 = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> undef, i64 0)
84112
ret void
85113
}
86114
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16>, <16 x i16>, i64)

llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,26 +12,30 @@ define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c
1212
; CHECK-LABEL: define void @test_ins_ext_cost(
1313
; CHECK-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
1414
; CHECK-NEXT: entry:
15-
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
16-
; CHECK: for.body:
17-
; CHECK-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
18-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
19-
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
20-
; CHECK-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
21-
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
22-
; CHECK-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
23-
; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
24-
; CHECK-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
15+
; CHECK-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[A]], align 16
16+
; CHECK-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[B]], align 16
17+
; CHECK-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[C]], align 16
2518
; CHECK-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
2619
; CHECK-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
2720
; CHECK-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
2821
; CHECK-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
2922
; CHECK-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
3023
; CHECK-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
31-
; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
32-
; CHECK-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
33-
; CHECK-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
34-
; CHECK: exit:
24+
; CHECK-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[D]], align 16
25+
; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 1
26+
; CHECK-NEXT: [[LOAD_A_1:%.*]] = load <8 x float>, ptr [[GEP_A_1]], align 16
27+
; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 1
28+
; CHECK-NEXT: [[LOAD_B_1:%.*]] = load <8 x float>, ptr [[GEP_B_1]], align 16
29+
; CHECK-NEXT: [[GEP_C_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 1
30+
; CHECK-NEXT: [[LOAD_C_1:%.*]] = load <8 x float>, ptr [[GEP_C_1]], align 16
31+
; CHECK-NEXT: [[CAST_SCALABLE_B_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B_1]], i64 0)
32+
; CHECK-NEXT: [[CAST_SCALABLE_C_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C_1]], i64 0)
33+
; CHECK-NEXT: [[ADD_1:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B_1]], [[CAST_SCALABLE_C_1]]
34+
; CHECK-NEXT: [[CAST_SCALABLE_A_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A_1]], i64 0)
35+
; CHECK-NEXT: [[MUL_1:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A_1]], [[ADD_1]]
36+
; CHECK-NEXT: [[CAST_FIXED_D_1:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL_1]], i64 0)
37+
; CHECK-NEXT: [[GEP_D_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 1
38+
; CHECK-NEXT: store <8 x float> [[CAST_FIXED_D_1]], ptr [[GEP_D_1]], align 16
3539
; CHECK-NEXT: ret void
3640
;
3741
entry:

0 commit comments

Comments
 (0)