Skip to content

Commit c836775

Browse files
committed
[VPlan] Support early-exit loops in optimizeForVFAndUF.
Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV.
1 parent 40b7034 commit c836775

File tree

2 files changed

+44
-28
lines changed

2 files changed

+44
-28
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,6 +1005,33 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
10051005
}
10061006
}
10071007

1008+
/// Return true if \p Cond is known to be true for given \p BestVF and \p
1009+
/// BestUF.
1010+
static bool isConditionKnown(VPValue *Cond, VPlan &Plan, ElementCount BestVF,
1011+
unsigned BestUF, ScalarEvolution &SE) {
1012+
using namespace llvm::VPlanPatternMatch;
1013+
if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
1014+
return any_of(Cond->getDefiningRecipe()->operands(),
1015+
[&Plan, BestVF, BestUF, &SE](VPValue *C) {
1016+
return isConditionKnown(C, Plan, BestVF, BestUF, SE);
1017+
});
1018+
1019+
VPValue *TripCount = Plan.getTripCount();
1020+
auto *CanIV = Plan.getCanonicalIV();
1021+
if (!match(Cond, m_Binary<Instruction::ICmp>(m_Specific(CanIV),
1022+
m_VPValue(TripCount))) ||
1023+
cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
1024+
CmpInst::ICMP_EQ)
1025+
return false;
1026+
1027+
const SCEV *TripCountSCEV = vputils::getSCEVExprForVPValue(TripCount, SE);
1028+
assert(!isa<SCEVCouldNotCompute>(TripCountSCEV) &&
1029+
"Trip count SCEV must be computable");
1030+
ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1031+
const SCEV *C = SE.getElementCount(TripCountSCEV->getType(), NumElements);
1032+
return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCountSCEV, C);
1033+
}
1034+
10081035
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
10091036
unsigned BestUF,
10101037
PredicatedScalarEvolution &PSE) {
@@ -1019,9 +1046,12 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
10191046
// 1. BranchOnCount, or
10201047
// 2. BranchOnCond where the input is Not(ActiveLaneMask).
10211048
using namespace llvm::VPlanPatternMatch;
1049+
VPValue *Cond;
10221050
if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
1023-
!match(Term,
1024-
m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
1051+
!match(Term, m_BranchOnCond(
1052+
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))) &&
1053+
(!match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
1054+
isConditionKnown(Cond, Plan, BestVF, BestUF, *PSE.getSE())))
10251055
return;
10261056

10271057
ScalarEvolution &SE = *PSE.getSE();

llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
5858
; VF8UF2: [[VECTOR_PH]]:
5959
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
6060
; VF8UF2: [[VECTOR_BODY]]:
61-
; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
62-
; VF8UF2-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
63-
; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
61+
; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
6462
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
6563
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
6664
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
67-
; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
6865
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
69-
; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
70-
; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
71-
; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
66+
; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
7267
; VF8UF2: [[MIDDLE_SPLIT]]:
7368
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
7469
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -87,7 +82,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
8782
; VF8UF2: [[LOOP_LATCH]]:
8883
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
8984
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
90-
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
85+
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
9186
; VF8UF2: [[EXIT]]:
9287
; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
9388
; VF8UF2-NEXT: ret i8 [[RES]]
@@ -100,17 +95,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
10095
; VF16UF1: [[VECTOR_PH]]:
10196
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
10297
; VF16UF1: [[VECTOR_BODY]]:
103-
; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
104-
; VF16UF1-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
105-
; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
98+
; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
10699
; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
107100
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
108101
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
109-
; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
110102
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
111-
; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
112-
; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
113-
; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
103+
; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
114104
; VF16UF1: [[MIDDLE_SPLIT]]:
115105
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
116106
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -129,7 +119,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
129119
; VF16UF1: [[LOOP_LATCH]]:
130120
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
131121
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
132-
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
122+
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
133123
; VF16UF1: [[EXIT]]:
134124
; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
135125
; VF16UF1-NEXT: ret i8 [[RES]]
@@ -219,11 +209,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
219209
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
220210
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
221211
; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
222-
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
223-
; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
224212
; VF8UF2-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], splat (i64 8)
225-
; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
226-
; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
213+
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
214+
; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
227215
; VF8UF2: [[MIDDLE_SPLIT]]:
228216
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
229217
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -244,7 +232,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
244232
; VF8UF2: [[LOOP_LATCH]]:
245233
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
246234
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
247-
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
235+
; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
248236
; VF8UF2: [[EXIT]]:
249237
; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], %[[VECTOR_EARLY_EXIT]] ]
250238
; VF8UF2-NEXT: ret i64 [[RES]]
@@ -265,11 +253,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
265253
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
266254
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
267255
; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
268-
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
269-
; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
270256
; VF16UF1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
271-
; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
272-
; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
257+
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
258+
; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
273259
; VF16UF1: [[MIDDLE_SPLIT]]:
274260
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
275261
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -290,7 +276,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
290276
; VF16UF1: [[LOOP_LATCH]]:
291277
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
292278
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
293-
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
279+
; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
294280
; VF16UF1: [[EXIT]]:
295281
; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], %[[VECTOR_EARLY_EXIT]] ]
296282
; VF16UF1-NEXT: ret i64 [[RES]]

0 commit comments

Comments
 (0)