Skip to content

Commit 6108d50

Browse files
authored
[VPlan] Add ReductionStartVector VPInstruction. (#142290)
Add a new VPInstruction::ReductionStartVector opcode to create the start values for wide reductions. This more accurately models the start value creation in VPlan and simplifies VPReductionPHIRecipe::execute. Down the line it also allows removing VPReductionPHIRecipe::RdxDesc. PR: #142290
1 parent 6cbd91e commit 6108d50

20 files changed

+177
-114
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 39 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7235,8 +7235,14 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
72357235
cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
72367236
const RecurrenceDescriptor &RdxDesc =
72377237
EpiRedHeaderPhi->getRecurrenceDescriptor();
7238-
Value *MainResumeValue =
7239-
EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7238+
Value *MainResumeValue;
7239+
if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
7240+
assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7241+
VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7242+
"unexpected start recipe");
7243+
MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
7244+
} else
7245+
MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
72407246
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
72417247
RdxDesc.getRecurrenceKind())) {
72427248
Value *StartV = EpiRedResult->getOperand(1)->getLiveInIRValue();
@@ -9173,7 +9179,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
91739179
continue;
91749180

91759181
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9176-
Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
9182+
Type *PhiTy = PhiR->getUnderlyingValue()->getType();
91779183
// If tail is folded by masking, introduce selects between the phi
91789184
// and the users outside the vector region of each reduction, at the
91799185
// beginning of the dedicated latch block.
@@ -9311,6 +9317,27 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93119317
// start value.
93129318
PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
93139319
}
9320+
RecurKind RK = RdxDesc.getRecurrenceKind();
9321+
if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
9322+
!RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
9323+
!RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
9324+
VPBuilder PHBuilder(Plan->getVectorPreheader());
9325+
VPValue *Iden = Plan->getOrAddLiveIn(
9326+
getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
9327+
// If the PHI is used by a partial reduction, set the scale factor.
9328+
unsigned ScaleFactor =
9329+
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
9330+
.value_or(1);
9331+
Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
9332+
auto *ScaleFactorVPV =
9333+
Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
9334+
VPValue *StartV = PHBuilder.createNaryOp(
9335+
VPInstruction::ReductionStartVector,
9336+
{PhiR->getStartValue(), Iden, ScaleFactorVPV},
9337+
PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
9338+
: FastMathFlags());
9339+
PhiR->setOperand(0, StartV);
9340+
}
93149341
}
93159342
for (VPRecipeBase *R : ToDelete)
93169343
R->eraseFromParent();
@@ -9816,6 +9843,15 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
98169843
Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
98179844
ResumeV =
98189845
Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
9846+
} else {
9847+
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
9848+
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9849+
if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
9850+
assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9851+
"unexpected start value");
9852+
VPI->setOperand(0, StartVal);
9853+
continue;
9854+
}
98199855
}
98209856
} else {
98219857
// Retrieve the induction resume values for wide inductions from

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
934934
/// Scale the first operand (vector step) by the second operand
935935
/// (scalar-step). Casts both operands to the result type if needed.
936936
WideIVStep,
937+
/// Start vector for reductions with 3 operands: the original start value,
938+
/// the identity value for the reduction and an integer indicating the
939+
/// scaling factor.
940+
ReductionStartVector,
937941
// Creates a step vector starting from 0 to VF with a step of 1.
938942
StepVector,
939943

@@ -2231,7 +2235,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
22312235
bool onlyFirstLaneUsed(const VPValue *Op) const override {
22322236
assert(is_contained(operands(), Op) &&
22332237
"Op must be an operand of the recipe");
2234-
return Op == getStartValue();
2238+
return isOrdered() || isInLoop();
22352239
}
22362240
};
22372241

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
7474
switch (Opcode) {
7575
case Instruction::ExtractElement:
7676
case Instruction::Freeze:
77+
case VPInstruction::ReductionStartVector:
7778
return inferScalarType(R->getOperand(0));
7879
case Instruction::Select: {
7980
Type *ResTy = inferScalarType(R->getOperand(1));
@@ -395,6 +396,10 @@ static unsigned getVFScaleFactor(VPRecipeBase *R) {
395396
return RR->getVFScaleFactor();
396397
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
397398
return RR->getVFScaleFactor();
399+
assert(
400+
(!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
401+
VPInstruction::ReductionStartVector) &&
402+
"getting scaling factor of reduction-start-vector not implemented yet");
398403
return 1;
399404
}
400405

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 28 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
604604
return Builder.CreateVectorSplat(
605605
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
606606
}
607+
case VPInstruction::ReductionStartVector: {
608+
if (State.VF.isScalar())
609+
return State.get(getOperand(0), true);
610+
IRBuilderBase::FastMathFlagGuard FMFG(Builder);
611+
Builder.setFastMathFlags(getFastMathFlags());
612+
// If this start vector is scaled then it should produce a vector with fewer
613+
// elements than the VF.
614+
ElementCount VF = State.VF.divideCoefficientBy(
615+
cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
616+
auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
617+
Constant *Zero = Builder.getInt32(0);
618+
return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
619+
Zero);
620+
}
607621
case VPInstruction::ComputeAnyOfResult: {
608622
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
609623
// and will be removed by breaking up the recipe further.
@@ -899,6 +913,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
899913
case VPInstruction::PtrAdd:
900914
case VPInstruction::WideIVStep:
901915
case VPInstruction::StepVector:
916+
case VPInstruction::ReductionStartVector:
902917
return false;
903918
default:
904919
return true;
@@ -929,6 +944,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
929944
case VPInstruction::CanonicalIVIncrementForPart:
930945
case VPInstruction::BranchOnCount:
931946
case VPInstruction::BranchOnCond:
947+
case VPInstruction::ReductionStartVector:
932948
return true;
933949
case VPInstruction::PtrAdd:
934950
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
@@ -1034,6 +1050,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
10341050
case VPInstruction::FirstActiveLane:
10351051
O << "first-active-lane";
10361052
break;
1053+
case VPInstruction::ReductionStartVector:
1054+
O << "reduction-start-vector";
1055+
break;
10371056
default:
10381057
O << Instruction::getOpcodeName(getOpcode());
10391058
}
@@ -1617,6 +1636,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
16171636
Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
16181637
Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
16191638
Opcode == VPInstruction::WideIVStep ||
1639+
Opcode == VPInstruction::ReductionStartVector ||
16201640
Opcode == VPInstruction::ComputeReductionResult;
16211641
case OperationType::NonNegOp:
16221642
return Opcode == Instruction::ZExt;
@@ -3847,17 +3867,19 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
38473867
#endif
38483868

38493869
void VPReductionPHIRecipe::execute(VPTransformState &State) {
3850-
// If this phi is fed by a scaled reduction then it should output a
3851-
// vector with fewer elements than the VF.
3852-
ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);
3870+
// Reductions do not have to start at zero. They can start with
3871+
// any loop invariant values.
3872+
VPValue *StartVPV = getStartValue();
38533873

38543874
// In order to support recurrences we need to be able to vectorize Phi nodes.
38553875
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
38563876
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
38573877
// this value when we vectorize all of the instructions that use the PHI.
3858-
auto *ScalarTy = State.TypeAnalysis.inferScalarType(this);
3878+
BasicBlock *VectorPH =
3879+
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
38593880
bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3860-
Type *VecTy = ScalarPHI ? ScalarTy : VectorType::get(ScalarTy, VF);
3881+
Value *StartV = State.get(StartVPV, ScalarPHI);
3882+
Type *VecTy = StartV->getType();
38613883

38623884
BasicBlock *HeaderBB = State.CFG.PrevBB;
38633885
assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
@@ -3866,49 +3888,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
38663888
Phi->insertBefore(HeaderBB->getFirstInsertionPt());
38673889
State.set(this, Phi, IsInLoop);
38683890

3869-
BasicBlock *VectorPH =
3870-
State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
3871-
// Create start and identity vector values for the reduction in the preheader.
3872-
// TODO: Introduce recipes in VPlan preheader to create initial values.
3873-
IRBuilderBase::InsertPointGuard IPBuilder(State.Builder);
3874-
State.Builder.SetInsertPoint(VectorPH->getTerminator());
3875-
3876-
// Reductions do not have to start at zero. They can start with
3877-
// any loop invariant values.
3878-
VPValue *StartVPV = getStartValue();
3879-
RecurKind RK = RdxDesc.getRecurrenceKind();
3880-
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
3881-
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
3882-
RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
3883-
// [I|F]FindLastIV will use a sentinel value to initialize the reduction
3884-
// phi or the resume value from the main vector loop when vectorizing the
3885-
// epilogue loop. In the exit block, ComputeReductionResult will generate
3886-
// checks to verify if the reduction result is the sentinel value. If the
3887-
// result is the sentinel value, it will be corrected back to the start
3888-
// value.
3889-
// TODO: The sentinel value is not always necessary. When the start value is
3890-
// a constant, and smaller than the start value of the induction variable,
3891-
// the start value can be directly used to initialize the reduction phi.
3892-
Phi->addIncoming(State.get(StartVPV, ScalarPHI), VectorPH);
3893-
return;
3894-
}
3895-
3896-
Value *Iden = getRecurrenceIdentity(RK, VecTy->getScalarType(),
3897-
RdxDesc.getFastMathFlags());
3898-
unsigned CurrentPart = getUnrollPart(*this);
3899-
Value *StartV = StartVPV->getLiveInIRValue();
3900-
if (!ScalarPHI) {
3901-
if (CurrentPart == 0) {
3902-
Iden = State.Builder.CreateVectorSplat(VF, Iden);
3903-
Constant *Zero = State.Builder.getInt32(0);
3904-
StartV = State.Builder.CreateInsertElement(Iden, StartV, Zero);
3905-
} else {
3906-
Iden = State.Builder.CreateVectorSplat(VF, Iden);
3907-
}
3908-
}
3909-
3910-
Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
3911-
Phi->addIncoming(StartVal, VectorPH);
3891+
Phi->addIncoming(StartV, VectorPH);
39123892
}
39133893

39143894
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,16 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
11531153
return;
11541154
}
11551155
}
1156+
// Simplify redundant ReductionStartVector recipes after unrolling.
1157+
VPValue *StartV;
1158+
if (match(Def, m_VPInstruction<VPInstruction::ReductionStartVector>(
1159+
m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1160+
Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1161+
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1162+
return PhiR && PhiR->isInLoop();
1163+
});
1164+
return;
1165+
}
11561166
}
11571167

11581168
void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {

llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "llvm/IR/Intrinsics.h"
2626

2727
using namespace llvm;
28+
using namespace llvm::VPlanPatternMatch;
2829

2930
namespace {
3031

@@ -223,6 +224,22 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
223224
Copy->addOperand(R);
224225
Copy->addOperand(getConstantVPV(Part));
225226
} else if (RdxPhi) {
227+
// If the start value is a ReductionStartVector, use the identity value
228+
// (second operand) for unrolled parts. If the scaling factor is > 1,
229+
// create a new ReductionStartVector with the scale factor and both
230+
// operands set to the identity value.
231+
if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
232+
assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
233+
"unexpected start VPInstruction");
234+
if (match(VPI->getOperand(2), m_SpecificInt(1))) {
235+
Copy->setOperand(0, VPI->getOperand(1));
236+
} else if (Part == 1) {
237+
auto *C = VPI->clone();
238+
C->setOperand(0, C->getOperand(1));
239+
C->insertAfter(VPI);
240+
addUniformForAllParts(C);
241+
}
242+
}
226243
Copy->addOperand(getConstantVPV(Part));
227244
} else {
228245
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
@@ -233,7 +250,6 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
233250

234251
/// Handle non-header-phi recipes.
235252
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
236-
using namespace llvm::VPlanPatternMatch;
237253
if (match(&R, m_BranchOnCond(m_VPValue())) ||
238254
match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
239255
return;
@@ -301,7 +317,6 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
301317
}
302318
}
303319

304-
using namespace llvm::VPlanPatternMatch;
305320
void UnrollState::unrollBlock(VPBlockBase *VPB) {
306321
auto *VPR = dyn_cast<VPRegionBlock>(VPB);
307322
if (VPR) {

llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,16 +60,16 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
6060
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8
6161
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]]
6262
; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[N_VEC5]] to i8
63-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
63+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[TMP15]], i64 0
6464
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
65-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
66-
; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[TMP15]], i64 0
65+
; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0
6766
; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer
67+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT11]], <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
6868
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
6969
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
7070
; CHECK-NEXT: [[INDEX6:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
7171
; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
72-
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT11]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
72+
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
7373
; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8
7474
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]]
7575
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i32 0
@@ -87,12 +87,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
8787
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
8888
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
8989
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
90-
; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
91-
; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
90+
; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
91+
; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
9292
; CHECK-NEXT: br label %[[LOOP:.*]]
9393
; CHECK: [[LOOP]]:
94-
; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
95-
; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
94+
; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
95+
; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
9696
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]]
9797
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP1]], align 8
9898
; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,8 +161,8 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
161161
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
162162
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
163163
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
164-
; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
165164
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
165+
; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
166166
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
167167
; CHECK: vec.epilog.vector.body:
168168
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]

llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
1717
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
1818
; CHECK-EMPTY:
1919
; CHECK-NEXT: vector.ph:
20+
; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4>
2021
; CHECK-NEXT: Successor(s): vector loop
2122
; CHECK-EMPTY:
2223
; CHECK-NEXT: <x1> vector loop: {
2324
; CHECK-NEXT: vector.body:
2425
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
25-
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi ir<0>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
26+
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
2627
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
2728
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
2829
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
@@ -83,11 +84,12 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
8384
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
8485
; CHECK-EMPTY:
8586
; CHECK-NEXT: ir-bb<vector.ph>:
87+
; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4>
8688
; CHECK-NEXT: Successor(s): vector.body
8789
; CHECK-EMPTY:
8890
; CHECK-NEXT: vector.body:
8991
; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
90-
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> (VF scaled by 1/4)
92+
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
9193
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
9294
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
9395
; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>

0 commit comments

Comments
 (0)