Skip to content

Commit 27d0cec

Browse files
committed
[LV] Add initial support for vectorizing literal struct return values
This patch adds initial support for vectorizing literal struct return values. Currently, this is limited to the case where the struct is homogeneous (all elements have the same type) and not packed. The users of the call must all be `extractvalue` instructions.

The intended use case for this is vectorizing intrinsics such as:

```
declare { float, float } @llvm.sincos.f32(float %x)
```

Mapping them to structure-returning library calls such as:

```
declare { <4 x float>, <4 x float> } @Sleef_sincosf4_u10advsimd(<4 x float>)
```

Or their widened form (such as `@llvm.sincos.v4f32` in this case).

Implementing this required two main changes:

1. Supporting widening of `extractvalue`
2. Adding support for vectorized struct types in LV
   * This is mostly limited to parts of the cost model and scalarization

Since the supported use case is narrow, the required changes are relatively small.
1 parent 7c72941 commit 27d0cec

File tree

10 files changed

+291
-92
lines changed

10 files changed

+291
-92
lines changed

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -422,10 +422,6 @@ class LoopVectorizationLegality {
422422
/// has a vectorized variant available.
423423
bool hasVectorCallVariants() const { return VecCallVariantsFound; }
424424

425-
/// Returns true if there is at least one function call in the loop which
426-
/// returns a struct type and needs to be vectorized.
427-
bool hasStructVectorCall() const { return StructVecCallFound; }
428-
429425
unsigned getNumStores() const { return LAI->getNumStores(); }
430426
unsigned getNumLoads() const { return LAI->getNumLoads(); }
431427

@@ -648,12 +644,6 @@ class LoopVectorizationLegality {
648644
/// the use of those function variants.
649645
bool VecCallVariantsFound = false;
650646

651-
/// If we find a call (to be vectorized) that returns a struct type, record
652-
/// that so we can bail out until this is supported.
653-
/// TODO: Remove this flag once vectorizing calls with struct returns is
654-
/// supported.
655-
bool StructVecCallFound = false;
656-
657647
/// Indicates whether this loop has an uncountable early exit, i.e. an
658648
/// uncountable exiting block that is not the latch.
659649
bool HasUncountableEarlyExit = false;

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -954,23 +954,16 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
954954
if (CI && !VFDatabase::getMappings(*CI).empty())
955955
VecCallVariantsFound = true;
956956

957-
auto CanWidenInstructionTy = [this](Instruction const &Inst) {
957+
auto CanWidenInstructionTy = [](Instruction const &Inst) {
958958
Type *InstTy = Inst.getType();
959959
if (!isa<StructType>(InstTy))
960960
return canVectorizeTy(InstTy);
961961

962962
// For now, we only recognize struct values returned from calls where
963963
// all users are extractvalue as vectorizable. All element types of the
964964
// struct must be types that can be widened.
965-
if (isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
966-
all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
967-
// TODO: Remove the `StructVecCallFound` flag once vectorizing calls
968-
// with struct returns is supported.
969-
StructVecCallFound = true;
970-
return true;
971-
}
972-
973-
return false;
965+
return isa<CallInst>(Inst) && canWidenCallReturnType(InstTy) &&
966+
all_of(Inst.users(), IsaPred<ExtractValueInst>);
974967
};
975968

976969
// Check that the instruction return type is vectorizable.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2357,7 +2357,9 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23572357
VPReplicateRecipe *RepRecipe,
23582358
const VPLane &Lane,
23592359
VPTransformState &State) {
2360-
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2360+
assert((!Instr->getType()->isAggregateType() ||
2361+
canVectorizeTy(Instr->getType())) &&
2362+
"Expected vectorizable or non-aggregate type.");
23612363

23622364
// Does this instruction return a value ?
23632365
bool IsVoidRetTy = Instr->getType()->isVoidTy();
@@ -2953,10 +2955,10 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
29532955
return ScalarCallCost;
29542956
}
29552957

2956-
static Type *maybeVectorizeType(Type *Elt, ElementCount VF) {
2957-
if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2958-
return Elt;
2959-
return VectorType::get(Elt, VF);
2958+
static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2959+
if (VF.isScalar() || !canVectorizeTy(Ty))
2960+
return Ty;
2961+
return toVectorizedTy(Ty, VF);
29602962
}
29612963

29622964
InstructionCost
@@ -3723,9 +3725,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
37233725

37243726
// ExtractValue instructions must be uniform, because the operands are
37253727
// known to be loop-invariant.
3726-
if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
3727-
assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3728-
"Expected aggregate value to be loop invariant");
3728+
auto *EVI = dyn_cast<ExtractValueInst>(&I);
3729+
if (EVI && IsOutOfScope(EVI->getAggregateOperand())) {
37293730
AddToWorklistIfAllowed(EVI);
37303731
continue;
37313732
}
@@ -4608,8 +4609,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
46084609
llvm_unreachable("unhandled recipe");
46094610
}
46104611

4611-
auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4612-
Type *VectorTy = toVectorTy(ScalarTy, VF);
4612+
auto WillWiden = [&TTI, VF](Type *VectorTy) {
46134613
unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
46144614
if (!NumLegalParts)
46154615
return false;
@@ -4640,7 +4640,8 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
46404640
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
46414641
if (!Visited.insert({ScalarTy}).second)
46424642
continue;
4643-
if (WillWiden(ScalarTy))
4643+
Type *WideTy = toVectorizedTy(ScalarTy, VF);
4644+
if (any_of(getContainedTypes(WideTy), WillWiden))
46444645
return true;
46454646
}
46464647
}
@@ -5597,10 +5598,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
55975598
// Compute the scalarization overhead of needed insertelement instructions
55985599
// and phi nodes.
55995600
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5600-
ScalarCost += TTI.getScalarizationOverhead(
5601-
cast<VectorType>(toVectorTy(I->getType(), VF)),
5602-
APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
5603-
/*Extract*/ false, CostKind);
5601+
Type *WideTy = toVectorizedTy(I->getType(), VF);
5602+
for (Type *VectorTy : getContainedTypes(WideTy)) {
5603+
ScalarCost += TTI.getScalarizationOverhead(
5604+
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
5605+
/*Insert=*/true,
5606+
/*Extract=*/false, CostKind);
5607+
}
56045608
ScalarCost +=
56055609
VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
56065610
}
@@ -6098,13 +6102,17 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
60986102
return 0;
60996103

61006104
InstructionCost Cost = 0;
6101-
Type *RetTy = toVectorTy(I->getType(), VF);
6105+
Type *RetTy = toVectorizedTy(I->getType(), VF);
61026106
if (!RetTy->isVoidTy() &&
6103-
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6104-
Cost += TTI.getScalarizationOverhead(
6105-
cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6106-
/*Insert*/ true,
6107-
/*Extract*/ false, CostKind);
6107+
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore())) {
6108+
6109+
for (Type *VectorTy : getContainedTypes(RetTy)) {
6110+
Cost += TTI.getScalarizationOverhead(
6111+
cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
6112+
/*Insert=*/true,
6113+
/*Extract=*/false, CostKind);
6114+
}
6115+
}
61086116

61096117
// Some targets keep addresses scalar.
61106118
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6362,9 +6370,9 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
63626370

63636371
bool MaskRequired = Legal->isMaskRequired(CI);
63646372
// Compute corresponding vector type for return value and arguments.
6365-
Type *RetTy = toVectorTy(ScalarRetTy, VF);
6373+
Type *RetTy = toVectorizedTy(ScalarRetTy, VF);
63666374
for (Type *ScalarTy : ScalarTys)
6367-
Tys.push_back(toVectorTy(ScalarTy, VF));
6375+
Tys.push_back(toVectorizedTy(ScalarTy, VF));
63686376

63696377
// An in-loop reduction using an fmuladd intrinsic is a special case;
63706378
// we don't want the normal cost for that intrinsic.
@@ -6554,7 +6562,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
65546562
HasSingleCopyAfterVectorization(I, VF));
65556563
VectorTy = RetTy;
65566564
} else
6557-
VectorTy = toVectorTy(RetTy, VF);
6565+
VectorTy = toVectorizedTy(RetTy, VF);
65586566

65596567
if (VF.isVector() && VectorTy->isVectorTy() &&
65606568
!TTI.getNumberOfParts(VectorTy))
@@ -8674,7 +8682,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
86748682
case Instruction::Shl:
86758683
case Instruction::Sub:
86768684
case Instruction::Xor:
8677-
case Instruction::Freeze:
8685+
case Instruction::Freeze: {
86788686
SmallVector<VPValue *> NewOps(Operands);
86798687
if (Instruction::isBinaryOp(I->getOpcode())) {
86808688
// The legacy cost model uses SCEV to check if some of the operands are
@@ -8697,6 +8705,15 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
86978705
NewOps[1] = GetConstantViaSCEV(NewOps[1]);
86988706
}
86998707
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8708+
}
8709+
case Instruction::ExtractValue: {
8710+
SmallVector<VPValue *> NewOps(Operands);
8711+
Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
8712+
for (unsigned Idx : cast<ExtractValueInst>(I)->getIndices())
8713+
NewOps.push_back(
8714+
Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
8715+
return new VPWidenRecipe(*I, make_range(NewOps.begin(), NewOps.end()));
8716+
}
87008717
};
87018718
}
87028719

@@ -10036,7 +10053,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
1003610053
VectorType::get(UI->getType(), State.VF));
1003710054
State.set(this, Poison);
1003810055
}
10039-
State.packScalarIntoVectorValue(this, *State.Lane);
10056+
State.packScalarIntoVectorizedValue(this, *State.Lane);
1004010057
}
1004110058
return;
1004210059
}
@@ -10553,13 +10570,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1055310570
return false;
1055410571
}
1055510572

10556-
if (LVL.hasStructVectorCall()) {
10557-
reportVectorizationFailure("Auto-vectorization of calls that return struct "
10558-
"types is not yet supported",
10559-
"StructCallVectorizationUnsupported", ORE, L);
10560-
return false;
10561-
}
10562-
1056310573
// Entrance to the VPlan-native vectorization path. Outer loops are processed
1056410574
// here. They may require CFG and instruction level transformations before
1056510575
// even evaluating whether vectorization is profitable. Since we cannot modify

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -335,10 +335,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
335335
} else {
336336
// Initialize packing with insertelements to start from undef.
337337
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
338-
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
338+
Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
339339
set(Def, Undef);
340340
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
341-
packScalarIntoVectorValue(Def, Lane);
341+
packScalarIntoVectorizedValue(Def, Lane);
342342
VectorValue = get(Def);
343343
}
344344
Builder.restoreIP(OldIP);
@@ -391,13 +391,24 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
391391
Builder.SetCurrentDebugLocation(DIL);
392392
}
393393

394-
void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
395-
const VPLane &Lane) {
394+
void VPTransformState::packScalarIntoVectorizedValue(VPValue *Def,
395+
const VPLane &Lane) {
396396
Value *ScalarInst = get(Def, Lane);
397-
Value *VectorValue = get(Def);
398-
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
399-
Lane.getAsRuntimeExpr(Builder, VF));
400-
set(Def, VectorValue);
397+
Value *WideValue = get(Def);
398+
Value *LaneExpr = Lane.getAsRuntimeExpr(Builder, VF);
399+
if (auto *StructTy = dyn_cast<StructType>(WideValue->getType())) {
400+
// We must handle each element of a vectorized struct type.
401+
for (unsigned I = 0, E = StructTy->getNumElements(); I != E; I++) {
402+
Value *ScalarValue = Builder.CreateExtractValue(ScalarInst, I);
403+
Value *VectorValue = Builder.CreateExtractValue(WideValue, I);
404+
VectorValue =
405+
Builder.CreateInsertElement(VectorValue, ScalarValue, LaneExpr);
406+
WideValue = Builder.CreateInsertValue(WideValue, VectorValue, I);
407+
}
408+
} else {
409+
WideValue = Builder.CreateInsertElement(WideValue, ScalarInst, LaneExpr);
410+
}
411+
set(Def, WideValue);
401412
}
402413

403414
BasicBlock *

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ struct VPTransformState {
281281
set(Def, V, VPLane(0));
282282
return;
283283
}
284-
assert((VF.isScalar() || V->getType()->isVectorTy()) &&
284+
assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
285285
"scalar values must be stored as (0, 0)");
286286
Data.VPV2Vector[Def] = V;
287287
}
@@ -330,8 +330,9 @@ struct VPTransformState {
330330
/// Set the debug location in the builder using the debug location \p DL.
331331
void setDebugLocFrom(DebugLoc DL);
332332

333-
/// Construct the vector value of a scalarized value \p V one lane at a time.
334-
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane);
333+
/// Construct the vectorized value of a scalarized value \p V one lane at a
334+
/// time.
335+
void packScalarIntoVectorizedValue(VPValue *Def, const VPLane &Lane);
335336

336337
/// Hold state information used when constructing the CFG of the output IR,
337338
/// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,13 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenRecipe *R) {
124124
case Instruction::FNeg:
125125
case Instruction::Freeze:
126126
return inferScalarType(R->getOperand(0));
127+
case Instruction::ExtractValue: {
128+
assert(R->getNumOperands() == 2 && "expected single level extractvalue");
129+
auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
130+
auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
131+
unsigned Idx = CI->getZExtValue();
132+
return StructTy->getTypeAtIndex(Idx);
133+
}
127134
default:
128135
break;
129136
}

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1099,7 +1099,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
10991099
Arguments.push_back(V);
11001100
}
11011101

1102-
Type *RetTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
1102+
Type *RetTy = toVectorizedTy(Ctx.Types.inferScalarType(this), VF);
11031103
SmallVector<Type *> ParamTys;
11041104
for (unsigned I = 0; I != getNumOperands(); ++I)
11051105
ParamTys.push_back(
@@ -1405,6 +1405,15 @@ void VPWidenRecipe::execute(VPTransformState &State) {
14051405
State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
14061406
break;
14071407
}
1408+
case Instruction::ExtractValue: {
1409+
assert(getNumOperands() == 2 && "expected single level extractvalue");
1410+
Value *Op = State.get(getOperand(0));
1411+
auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
1412+
unsigned Idx = CI->getZExtValue();
1413+
Value *Extract = Builder.CreateExtractValue(Op, Idx);
1414+
State.set(this, Extract);
1415+
break;
1416+
}
14081417
case Instruction::Freeze: {
14091418
Value *Op = State.get(getOperand(0));
14101419

@@ -1506,6 +1515,9 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
15061515
return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
15071516
Ctx.CostKind);
15081517
}
1518+
case Instruction::ExtractValue:
1519+
return Ctx.TTI.getInstructionCost(cast<Instruction>(getUnderlyingValue()),
1520+
TTI::TCK_RecipThroughput);
15091521
case Instruction::ICmp:
15101522
case Instruction::FCmp: {
15111523
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());

llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
1-
; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s
2-
; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s
1+
; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
32

43
target triple = "aarch64-unknown-linux-gnu"
54

65
; Tests basic vectorization of scalable homogeneous struct literal returns.
76

8-
; TODO: Support vectorization in this case.
9-
; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
107
define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
118
; CHECK-LABEL: define void @struct_return_f32_widen
12-
; CHECK-NOT: vector.body:
9+
; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
10+
; CHECK: vector.body:
11+
; CHECK: [[WIDE_CALL:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
12+
; CHECK: [[WIDE_A:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 0
13+
; CHECK: [[WIDE_B:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[WIDE_CALL]], 1
14+
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_A]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
15+
; CHECK: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[WIDE_B]], ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
1316
entry:
1417
br label %for.body
1518

@@ -32,11 +35,15 @@ exit:
3235
ret void
3336
}
3437

35-
; TODO: Support vectorization in this case.
36-
; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
3738
define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) {
3839
; CHECK-LABEL: define void @struct_return_f64_widen
39-
; CHECK-NOT: vector.body:
40+
; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]])
41+
; CHECK: vector.body:
42+
; CHECK: [[WIDE_CALL:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
43+
; CHECK: [[WIDE_A:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 0
44+
; CHECK: [[WIDE_B:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[WIDE_CALL]], 1
45+
; CHECK: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_A]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
46+
; CHECK: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[WIDE_B]], ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
4047
entry:
4148
br label %for.body
4249

@@ -59,11 +66,16 @@ exit:
5966
ret void
6067
}
6168

62-
; TODO: Support vectorization in this case.
63-
; CHECK-REMARKS: remark: {{.*}} loop not vectorized: Auto-vectorization of calls that return struct types is not yet supported
6469
define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) {
6570
; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks
66-
; CHECK-NOT: vector.body:
71+
; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]])
72+
; CHECK: entry:
73+
; CHECK: br i1 false, label %scalar.ph, label %vector.memcheck
74+
; CHECK: vector.memcheck:
75+
; CHECK: vector.body:
76+
; CHECK: call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
77+
; CHECK: for.body:
78+
; CHECK: call { float, float } @foo(float [[LOAD:%.*]])
6779
entry:
6880
br label %for.body
6981

0 commit comments

Comments
 (0)