-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VPlan] Support early-exit loops in optimizeForVFAndUF. #131539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) ChangesUpdate optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV. Full diff: https://github.com/llvm/llvm-project/pull/131539.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9aae383d35d91..097d68e3e20c7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1005,6 +1005,33 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
}
}
+/// Return true if \p Cond is known to be true for given \p BestVF and \p
+/// BestUF.
+static bool isConditionKnown(VPValue *Cond, VPlan &Plan, ElementCount BestVF,
+ unsigned BestUF, ScalarEvolution &SE) {
+ using namespace llvm::VPlanPatternMatch;
+ if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
+ return any_of(Cond->getDefiningRecipe()->operands(),
+ [&Plan, BestVF, BestUF, &SE](VPValue *C) {
+ return isConditionKnown(C, Plan, BestVF, BestUF, SE);
+ });
+
+ VPValue *TripCount = Plan.getTripCount();
+ auto *CanIV = Plan.getCanonicalIV();
+ if (!match(Cond, m_Binary<Instruction::ICmp>(m_Specific(CanIV),
+ m_VPValue(TripCount))) ||
+ cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
+ CmpInst::ICMP_EQ)
+ return false;
+
+ const SCEV *TripCountSCEV = vputils::getSCEVExprForVPValue(TripCount, SE);
+ assert(!isa<SCEVCouldNotCompute>(TripCountSCEV) &&
+ "Trip count SCEV must be computable");
+ ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
+ const SCEV *C = SE.getElementCount(TripCountSCEV->getType(), NumElements);
+ return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCountSCEV, C);
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -1019,9 +1046,12 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
// 1. BranchOnCount, or
// 2. BranchOnCond where the input is Not(ActiveLaneMask).
using namespace llvm::VPlanPatternMatch;
+ VPValue *Cond;
if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
- !match(Term,
- m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
+ !match(Term, m_BranchOnCond(
+ m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))) &&
+ (!match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
+ isConditionKnown(Cond, Plan, BestVF, BestUF, *PSE.getSE())))
return;
ScalarEvolution &SE = *PSE.getSE();
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index 51458a7bb80b6..5db6752ed7159 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -58,17 +58,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF8UF2: [[VECTOR_PH]]:
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
-; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
-; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF8UF2: [[MIDDLE_SPLIT]]:
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -87,7 +82,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF8UF2: [[LOOP_LATCH]]:
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
; VF8UF2-NEXT: ret i8 [[RES]]
@@ -100,17 +95,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF16UF1: [[VECTOR_PH]]:
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF16UF1-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
-; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
-; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
-; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF16UF1: [[MIDDLE_SPLIT]]:
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -129,7 +119,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF16UF1: [[LOOP_LATCH]]:
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
; VF16UF1: [[EXIT]]:
; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
; VF16UF1-NEXT: ret i8 [[RES]]
@@ -219,11 +209,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
-; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; VF8UF2-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], splat (i64 8)
-; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF8UF2: [[MIDDLE_SPLIT]]:
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -244,7 +232,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF8UF2: [[LOOP_LATCH]]:
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], %[[VECTOR_EARLY_EXIT]] ]
; VF8UF2-NEXT: ret i64 [[RES]]
@@ -265,11 +253,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
-; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; VF16UF1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
-; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
+; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF16UF1: [[MIDDLE_SPLIT]]:
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -290,7 +276,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF16UF1: [[LOOP_LATCH]]:
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
; VF16UF1: [[EXIT]]:
; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], %[[VECTOR_EARLY_EXIT]] ]
; VF16UF1-NEXT: ret i64 [[RES]]
|
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesUpdate optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV. Full diff: https://github.com/llvm/llvm-project/pull/131539.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9aae383d35d91..097d68e3e20c7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1005,6 +1005,33 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
}
}
+/// Return true if \p Cond is known to be true for given \p BestVF and \p
+/// BestUF.
+static bool isConditionKnown(VPValue *Cond, VPlan &Plan, ElementCount BestVF,
+ unsigned BestUF, ScalarEvolution &SE) {
+ using namespace llvm::VPlanPatternMatch;
+ if (match(Cond, m_Binary<Instruction::Or>(m_VPValue(), m_VPValue())))
+ return any_of(Cond->getDefiningRecipe()->operands(),
+ [&Plan, BestVF, BestUF, &SE](VPValue *C) {
+ return isConditionKnown(C, Plan, BestVF, BestUF, SE);
+ });
+
+ VPValue *TripCount = Plan.getTripCount();
+ auto *CanIV = Plan.getCanonicalIV();
+ if (!match(Cond, m_Binary<Instruction::ICmp>(m_Specific(CanIV),
+ m_VPValue(TripCount))) ||
+ cast<VPRecipeWithIRFlags>(Cond->getDefiningRecipe())->getPredicate() !=
+ CmpInst::ICMP_EQ)
+ return false;
+
+ const SCEV *TripCountSCEV = vputils::getSCEVExprForVPValue(TripCount, SE);
+ assert(!isa<SCEVCouldNotCompute>(TripCountSCEV) &&
+ "Trip count SCEV must be computable");
+ ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
+ const SCEV *C = SE.getElementCount(TripCountSCEV->getType(), NumElements);
+ return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCountSCEV, C);
+}
+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
unsigned BestUF,
PredicatedScalarEvolution &PSE) {
@@ -1019,9 +1046,12 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
// 1. BranchOnCount, or
// 2. BranchOnCond where the input is Not(ActiveLaneMask).
using namespace llvm::VPlanPatternMatch;
+ VPValue *Cond;
if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
- !match(Term,
- m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
+ !match(Term, m_BranchOnCond(
+ m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))) &&
+ (!match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
+ isConditionKnown(Cond, Plan, BestVF, BestUF, *PSE.getSE())))
return;
ScalarEvolution &SE = *PSE.getSE();
diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
index 51458a7bb80b6..5db6752ed7159 100644
--- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
@@ -58,17 +58,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF8UF2: [[VECTOR_PH]]:
; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]]
; VF8UF2: [[VECTOR_BODY]]:
-; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF8UF2-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
-; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
-; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
-; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF8UF2: [[MIDDLE_SPLIT]]:
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -87,7 +82,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF8UF2: [[LOOP_LATCH]]:
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
; VF8UF2-NEXT: ret i8 [[RES]]
@@ -100,17 +95,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF16UF1: [[VECTOR_PH]]:
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
; VF16UF1: [[VECTOR_BODY]]:
-; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; VF16UF1-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
-; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
+; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0
; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
-; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
-; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]]
; VF16UF1: [[MIDDLE_SPLIT]]:
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -129,7 +119,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr %A, i64 %N) nosync nofree {
; VF16UF1: [[LOOP_LATCH]]:
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
; VF16UF1: [[EXIT]]:
; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ]
; VF16UF1-NEXT: ret i8 [[RES]]
@@ -219,11 +209,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
-; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; VF8UF2-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], splat (i64 8)
-; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]])
+; VF8UF2-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF8UF2: [[MIDDLE_SPLIT]]:
; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
@@ -244,7 +232,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF8UF2: [[LOOP_LATCH]]:
; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
; VF8UF2: [[EXIT]]:
; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], %[[VECTOR_EARLY_EXIT]] ]
; VF8UF2-NEXT: ret i64 [[RES]]
@@ -265,11 +253,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer
; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
-; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; VF16UF1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
-; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
+; VF16UF1-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF16UF1: [[MIDDLE_SPLIT]]:
; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
@@ -290,7 +276,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr %A, i64
; VF16UF1: [[LOOP_LATCH]]:
; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1
; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
; VF16UF1: [[EXIT]]:
; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], %[[VECTOR_EARLY_EXIT]] ]
; VF16UF1-NEXT: ret i64 [[RES]]
|
@@ -1005,6 +1005,33 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { | |||
} | |||
} | |||
|
|||
/// Return true if \p Cond is known to be true for given \p BestVF and \p | |||
/// BestUF. | |||
static bool isConditionKnown(VPValue *Cond, VPlan &Plan, ElementCount BestVF, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The condition could be known to be either false or true, so to avoid ambiguity perhaps this should be renamed to isBranchConditionKnownTrue
? I also think adding the word Branch
indicates the condition relates to the terminator branch condition, as opposed to the condition for a select, etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I updated it to isConditionTrueViaVFAndUF
. I've not included Branch
for now, as I think the logic is independent of whether the condition is used in a branch or otherwise.
!match(Term, m_BranchOnCond( | ||
m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))) && | ||
(!match(Term, m_BranchOnCond(m_VPValue(Cond))) || | ||
isConditionKnown(Cond, Plan, BestVF, BestUF, *PSE.getSE()))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be honest I don't know how this even works. If the terminator is a branch-on-cond recipe and the condition is known to be true then bail out and don't do any optimisations. Yet surely the case you're optimising (as shown by the tests) is precisely when you know the condition is true?!
Unless I'm missing something, it suggests that isConditionKnown
is actually returning false for your tests, doesn't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes, I should have broken up the checks for BranchOnCount/BranchOnCond(activelanemask)
and BranchOnCond
to make this clearer and also skip the checks for BranchOnCount
if we have BranchOnCond
.
Should hopefully be clearer now.
Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV.
c836775
to
8a3587b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! Thanks, that's a lot clearer. :)
…131539) Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV PR: llvm/llvm-project#131539
Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV PR: llvm#131539
Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV.