Skip to content

[LV][VPlan] Implement VPlan-based cost for exit condition. #125640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
60 changes: 16 additions & 44 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6343,6 +6343,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
InstructionCost Cost = 0;

if (canTruncateToMinimalBitwidth(I, VF)) {
Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
Expand All @@ -6354,11 +6355,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
}

// If the Cmp instruction has multiple uses in the loop, it will generate
// a scalar Cmp for the latch and a vector Cmp for the other uses.
if (I == TheLoop->getLatchCmpInst() && !I->hasOneUse())
Cost += TTI.getCmpSelInstrCost(I->getOpcode(), ValTy,
CmpInst::makeCmpResultType(ValTy),
cast<CmpInst>(I)->getPredicate(), CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, I);

VectorTy = toVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(
I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
cast<CmpInst>(I)->getPredicate(), CostKind,
{TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
return Cost + TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy,
CmpInst::makeCmpResultType(VectorTy),
cast<CmpInst>(I)->getPredicate(),
CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, I);
}
case Instruction::Store:
case Instruction::Load: {
Expand Down Expand Up @@ -6901,46 +6913,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
}
}

/// Compute the cost of all exiting conditions of the loop using the legacy
/// cost model. This is to match the legacy behavior, which adds the cost of
/// all exit conditions. Note that this over-estimates the cost, as there will
/// be a single condition to control the vector loop.
SmallVector<BasicBlock *> Exiting;
CM.TheLoop->getExitingBlocks(Exiting);
SetVector<Instruction *> ExitInstrs;
// Collect all exit conditions.
for (BasicBlock *EB : Exiting) {
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
continue;
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
ExitInstrs.insert(CondI);
}
}
// Compute the cost of all instructions only feeding the exit conditions.
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
Instruction *CondI = ExitInstrs[I];
if (!OrigLoop->contains(CondI) ||
!CostCtx.SkipCostComputation.insert(CondI).second)
continue;
InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
LLVM_DEBUG({
dbgs() << "Cost of " << CondICost << " for VF " << VF
<< ": exit condition instruction " << *CondI << "\n";
});
Cost += CondICost;
for (Value *Op : CondI->operands()) {
auto *OpI = dyn_cast<Instruction>(Op);
if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
any_of(OpI->users(), [&ExitInstrs, this](User *U) {
return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
!ExitInstrs.contains(cast<Instruction>(U));
}))
continue;
ExitInstrs.insert(OpI);
}
}

// Pre-compute the costs for branches except for the backedge, as the number
// of replicate regions in a VPlan may not directly match the number of
// branches, which would lead to different decisions.
Expand Down
31 changes: 31 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -807,6 +807,37 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::BranchOnCount: {
Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));

// If the vector loop only executes once (VF == original trip count), ignore
// the cost of the cmp.
// TODO: This can be removed once `unrollByUF` and `optimizeForVFandUF` are
// hoisted, as they will optimize the BranchOnCount away.
auto TC = dyn_cast_if_present<ConstantInt>(
getParent()->getPlan()->getTripCount()->getUnderlyingValue());
if (TC && VF.isFixed() && TC->getSExtValue() == VF.getFixedValue())
return 0;

// BranchOnCount will generate icmp_eq + br instructions and the
// cost of branch will be calculated in VPRegionBlock.
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
CmpInst::ICMP_EQ, Ctx.CostKind);
}
case VPInstruction::BranchOnCond: {
// BranchOnCond is free since the branch cost is already
// calculated by VPBB.
if (vputils::onlyFirstLaneUsed(getOperand(0)))
return 0;

// Otherwise, BranchOnCond will generate `extractelement` to extract the
// condition from vector type.
return Ctx.TTI.getVectorInstrCost(
Instruction::ExtractElement,
cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
Ctx.CostKind, 0, nullptr, nullptr);
}
case VPInstruction::FirstActiveLane: {
// Calculate the cost of determining the lane index.
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -661,29 +661,59 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-LABEL: define void @multiple_exit_conditions(
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
; DEFAULT-NEXT: [[ENTRY:.*]]:
; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], 8
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP9]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; DEFAULT: [[VECTOR_PH]]:
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP10]], 8
; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP1]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP12:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP14:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp <vscale x 2 x i16> [[TMP11]] to <vscale x 2 x double>
; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp <vscale x 2 x i16> [[TMP12]] to <vscale x 2 x double>
; DEFAULT-NEXT: [[TMP17:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
; DEFAULT-NEXT: [[TMP18:%.*]] = uitofp <vscale x 2 x i16> [[TMP14]] to <vscale x 2 x double>
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2
; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP21]]
; DEFAULT-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 4
; DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP24]]
; DEFAULT-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 6
; DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP27]]
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP15]], ptr [[TMP4]], align 8
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP16]], ptr [[TMP22]], align 8
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP17]], ptr [[TMP25]], align 8
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP18]], ptr [[TMP28]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; DEFAULT-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; DEFAULT: [[SCALAR_PH]]:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[ENTRY]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; DEFAULT-NEXT: br label %[[LOOP:.*]]
; DEFAULT: [[LOOP]]:
; DEFAULT-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,11 @@ loop.end:

define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %src) {
; CHECK-LABEL: LV: Checking a loop in 'vectorization_not_profitable_due_to_trunc'
; CHECK: LV: Selecting VF: 1.
; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
; CHECK: Calculating cost of work in exit block vector.early.exit:
; CHECK-NEXT: Cost of 6 for VF 2: EMIT vp<{{.*}}> = first-active-lane ir<{{.*}}>
; CHECK-NEXT: Cost of 2 for VF 2: EMIT vp<{{.*}}> = extractelement ir<{{.*}}>, vp<{{.*}}>
; CHECK: LV: Minimum required TC for runtime checks to be profitable:28
; CHECK: LV: Found a vectorizable loop (2)
entry:
br label %loop.header

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'test'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 56
; CHECK: LV: Selecting VF: 16
entry:
Expand Down Expand Up @@ -43,12 +44,13 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<{{.+}}>, vp<{{.+}}>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<{{.+}}>, vp<{{.+}}>
; CHECK: Cost for VF 16: 57
; CHECK: LV: Selecting VF: vscale x 2
entry:
Expand Down Expand Up @@ -80,12 +82,13 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 27
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 48
; CHECK: LV: Selecting VF: 16
entry:
Expand Down Expand Up @@ -116,11 +119,12 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
; CHECK-LABEL: LV: Checking a loop in 'test_extra_cmp_user'
; CHECK: Cost of 4 for VF 8: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 12
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 13
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 4
; CHECK: LV: Selecting VF: 16
entry:
Expand Down
Loading