Skip to content

Commit 809f857

Browse files
authored
[VPlan] Support early-exit loops in optimizeForVFAndUF. (#131539)
Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV. PR: #131539
1 parent d63cc4c commit 809f857

File tree

2 files changed

+75
-51
lines changed

2 files changed

+75
-51
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,35 +1163,75 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
11631163
return MadeChange;
11641164
}
11651165

1166-
/// Try to simplify the branch condition of \p Plan. This may restrict the
1167-
/// resulting plan to \p BestVF and \p BestUF.
1168-
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1169-
unsigned BestUF,
1170-
PredicatedScalarEvolution &PSE) {
1171-
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1172-
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1173-
auto *Term = &ExitingVPBB->back();
1174-
// Try to simplify the branch condition if TC <= VF * UF when preparing to
1175-
// execute the plan for the main vector loop. We only do this if the
1176-
// terminator is:
1177-
// 1. BranchOnCount, or
1178-
// 2. BranchOnCond where the input is Not(ActiveLaneMask).
1166+
/// Return true if \p Cond is known to be true for the given \p BestVF and \p
1167+
/// BestUF.
1168+
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1169+
ElementCount BestVF, unsigned BestUF,
1170+
ScalarEvolution &SE) {
11791171
using namespace llvm::VPlanPatternMatch;
1180-
if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
1181-
!match(Term,
1182-
m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
1172+
if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
1173+
return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1174+
&SE](VPValue *C) {
1175+
return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE);
1176+
});
1177+
1178+
auto *CanIV = Plan.getCanonicalIV();
1179+
if (!match(Cond, m_Binary<Instruction::ICmp>(
1180+
m_Specific(CanIV->getBackedgeValue()),
1181+
m_Specific(&Plan.getVectorTripCount()))) ||
1182+
cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
1183+
CmpInst::ICMP_EQ)
11831184
return false;
11841185

1185-
ScalarEvolution &SE = *PSE.getSE();
1186+
// The compare checks CanIV + VFxUF == vector trip count. The vector trip
1187+
// count is not conveniently available as SCEV so far, so we compare directly
1188+
// against the original trip count. This is stricter than necessary, as we
1189+
// will only return true if the trip count == vector trip count.
1190+
// TODO: Use SCEV for vector trip count once available, to cover cases where
1191+
// vector trip count == UF * VF, but original trip count != UF * VF.
11861192
const SCEV *TripCount =
11871193
vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
11881194
assert(!isa<SCEVCouldNotCompute>(TripCount) &&
11891195
"Trip count SCEV must be computable");
11901196
ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
11911197
const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1192-
if (TripCount->isZero() ||
1193-
!SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1198+
return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
1199+
}
1200+
1201+
/// Try to simplify the branch condition of \p Plan. This may restrict the
1202+
/// resulting plan to \p BestVF and \p BestUF.
1203+
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1204+
unsigned BestUF,
1205+
PredicatedScalarEvolution &PSE) {
1206+
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1207+
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1208+
auto *Term = &ExitingVPBB->back();
1209+
VPValue *Cond;
1210+
ScalarEvolution &SE = *PSE.getSE();
1211+
using namespace llvm::VPlanPatternMatch;
1212+
if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
1213+
match(Term, m_BranchOnCond(
1214+
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
1215+
// Try to simplify the branch condition if TC <= VF * UF when the latch
1216+
// terminator is BranchOnCount or BranchOnCond where the input is
1217+
// Not(ActiveLaneMask).
1218+
const SCEV *TripCount =
1219+
vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE);
1220+
assert(!isa<SCEVCouldNotCompute>(TripCount) &&
1221+
"Trip count SCEV must be computable");
1222+
ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1223+
const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
1224+
if (TripCount->isZero() ||
1225+
!SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1226+
return false;
1227+
} else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) {
1228+
// For BranchOnCond, check if we can prove the condition to be true using VF
1229+
// and UF.
1230+
if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE))
1231+
return false;
1232+
} else {
11941233
return false;
1234+
}
11951235

11961236
// The vector loop region only executes once. If possible, completely remove
11971237
// the region, otherwise replace the terminator controlling the latch with

llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
5555
; VF8UF2: [[VECTOR_PH]]:
5656
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
5757
; VF8UF2: [[VECTOR_BODY]]:
58-
; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
59-
; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
58+
; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
6059
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
6160
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
6261
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
63-
; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
6462
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
65-
; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
66-
; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
67-
; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
63+
; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
6864
; VF8UF2: [[MIDDLE_SPLIT]]:
6965
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
7066
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -83,7 +79,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
8379
; VF8UF2: [[LOOP_LATCH]]:
8480
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
8581
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
86-
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
82+
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
8783
; VF8UF2: [[EXIT]]:
8884
; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
8985
; VF8UF2-NEXT: ret i8 [[RES]]
@@ -95,16 +91,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
9591
; VF16UF1: [[VECTOR_PH]]:
9692
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
9793
; VF16UF1: [[VECTOR_BODY]]:
98-
; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
99-
; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
94+
; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
10095
; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
10196
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
10297
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
103-
; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
10498
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
105-
; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
106-
; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
107-
; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
99+
; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
108100
; VF16UF1: [[MIDDLE_SPLIT]]:
109101
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
110102
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -123,7 +115,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn
123115
; VF16UF1: [[LOOP_LATCH]]:
124116
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
125117
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
126-
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
118+
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
127119
; VF16UF1: [[EXIT]]:
128120
; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
129121
; VF16UF1-NEXT: ret i8 [[RES]]
@@ -198,23 +190,19 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
198190
; VF8UF2: [[VECTOR_PH]]:
199191
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
200192
; VF8UF2: [[VECTOR_BODY]]:
201-
; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
202-
; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
193+
; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
203194
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
204195
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
205196
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
206-
; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
207197
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
208-
; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
209-
; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
210-
; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
198+
; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
211199
; VF8UF2: [[MIDDLE_SPLIT]]:
212200
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
213201
; VF8UF2: [[MIDDLE_BLOCK]]:
214202
; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
215203
; VF8UF2: [[VECTOR_EARLY_EXIT]]:
216204
; VF8UF2-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true)
217-
; VF8UF2-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]]
205+
; VF8UF2-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]]
218206
; VF8UF2-NEXT: br label %[[EXIT]]
219207
; VF8UF2: [[SCALAR_PH]]:
220208
; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -228,9 +216,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
228216
; VF8UF2: [[LOOP_LATCH]]:
229217
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
230218
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
231-
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
219+
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
232220
; VF8UF2: [[EXIT]]:
233-
; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ]
221+
; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ]
234222
; VF8UF2-NEXT: ret i64 [[RES]]
235223
;
236224
; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(
@@ -240,23 +228,19 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
240228
; VF16UF1: [[VECTOR_PH]]:
241229
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
242230
; VF16UF1: [[VECTOR_BODY]]:
243-
; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
244-
; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
231+
; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
245232
; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
246233
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
247234
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
248-
; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
249235
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
250-
; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
251-
; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
252-
; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
236+
; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
253237
; VF16UF1: [[MIDDLE_SPLIT]]:
254238
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
255239
; VF16UF1: [[MIDDLE_BLOCK]]:
256240
; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
257241
; VF16UF1: [[VECTOR_EARLY_EXIT]]:
258242
; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 true)
259-
; VF16UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]]
243+
; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]]
260244
; VF16UF1-NEXT: br label %[[EXIT]]
261245
; VF16UF1: [[SCALAR_PH]]:
262246
; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
@@ -270,9 +254,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer
270254
; VF16UF1: [[LOOP_LATCH]]:
271255
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
272256
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
273-
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
257+
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
274258
; VF16UF1: [[EXIT]]:
275-
; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ]
259+
; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ]
276260
; VF16UF1-NEXT: ret i64 [[RES]]
277261
;
278262
entry:

0 commit comments

Comments
 (0)