[AArch64] Enable unrolling for small multi-exit loops #131998
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example of this would be something that uses std::find to find a value in libc++.

This patch adds a new flag -small-multi-exit-loop-unroll-factor that forces unrolling of small multi-exit loops with a given unroll factor.

Patch is 40.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131998.diff

2 Files Affected:
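For illustration, here is a hedged sketch (not part of the patch; names are hypothetical) of the kind of two-block search loop this targets:

// A std::find-style byte search. The loop header exits early when the
// value is found and the latch exits when the end pointer is reached,
// giving exactly the two exiting blocks the patch looks for.
static const char *find_byte(const char *first, const char *last, char tgt) {
  for (const char *p = first; p != last; ++p)
    if (*p == tgt)
      return p;
  return last;
}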
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7cec8a17dfaaa..d3850217f1358 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
cl::init(true), cl::Hidden);
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+ "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+ cl::desc(
+ "Force unrolling of small multi-exit loops with given unroll factor"));
+
// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4237,6 +4242,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+ AArch64TTIImpl &TTI) {
+ // Don't unroll vectorised loop.
+ if (I.getType()->isVectorTy())
+ return false;
+
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction())
+ if (!TTI.isLoweredToCall(F))
+ return true;
+ return false;
+ }
+
+ return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+ AArch64TTIImpl &TTI) {
+ // Small search loops with multiple exits can be highly beneficial to unroll.
+ // We only care about loops with exactly two exiting blocks, although each
+ // block could jump to the same exit block.
+ SmallVector<BasicBlock *> Blocks(L->getBlocks());
+ if (Blocks.size() != 2 || L->getExitingBlock())
+ return false;
+
+ if (any_of(Blocks, [](BasicBlock *BB) {
+ return !isa<BranchInst>(BB->getTerminator());
+ }))
+ return false;
+
+ // Only consider loops with unknown trip counts for which we can determine
+ // a symbolic expression. Multi-exit loops with small known trip counts will
+ // likely be unrolled anyway.
+ const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+ if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+ return false;
+
+ // It might not be worth unrolling loops with low max trip counts. Restrict
+ // this to max trip counts > 32 for now.
+ unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+ if (MaxTC > 0 && MaxTC <= 32)
+ return false;
+
+ // Estimate the size of the loop.
+ int64_t Size = 0;
+ for (auto *BB : L->getBlocks()) {
+ for (auto &I : *BB) {
+ if (!shouldUnrollLoopWithInstruction(I, TTI))
+ return false;
+
+ SmallVector<const Value *, 4> Operands(I.operand_values());
+ InstructionCost Cost =
+ TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+ // This can happen with intrinsics that don't currently have a cost model
+ // or for some operations that require SVE.
+ if (!Cost.isValid())
+ return false;
+ Size += *Cost.getValue();
+ }
+ }
+
+ return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
/// OOO engine's wide instruction window and various predictors.
static void
@@ -4412,22 +4481,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
break;
}
+ if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+ UP.RuntimeUnrollMultiExit = true;
+ UP.Runtime = true;
+ // Limit unroll count.
+ UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+ // Allow slightly more costly trip-count expansion to catch search loops
+ // with pointer inductions.
+ UP.SCEVExpansionBudget = 5;
+ }
+
// Scan the loop: don't unroll loops with calls as this could prevent
// inlining. Don't unroll vector loops either, as they don't benefit much from
// unrolling.
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- // Don't unroll vectorised loop.
- if (I.getType()->isVectorTy())
+ if (!shouldUnrollLoopWithInstruction(I, *this))
return;
-
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
- if (!isLoweredToCall(F))
- continue;
- }
- return;
- }
}
}
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
new file mode 100644
index 0000000000000..b799b4328400a
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
@@ -0,0 +1,713 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mcpu=generic -small-multi-exit-loop-unroll-factor=2 -S %s | FileCheck --check-prefixes=COMMON,UNROLL2 %s
+; RUN: opt -p loop-unroll -mcpu=generic -S %s | FileCheck --check-prefixes=COMMON,GENERIC %s
+
+target triple = "aarch64-linux-gnu"
+
+define i1 @multi_2_exiting_find_i8_loop_same_exit(ptr %vec, i8 %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_i8_loop_same_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2: [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT: br label %[[EXIT]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]]
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_i8_loop_same_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; GENERIC-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT: br label %[[LOOP_HEADER:.*]]
+; GENERIC: [[LOOP_HEADER]]:
+; GENERIC-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; GENERIC-NEXT: br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC: [[LOOP_LATCH]]:
+; GENERIC-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; GENERIC-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT: br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; GENERIC: [[EXIT]]:
+; GENERIC-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; GENERIC-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; GENERIC-NEXT: ret i1 [[C_3]]
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %c.1 = icmp eq i8 %l, %tgt
+ br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+ %c.3 = icmp eq ptr %res, %end
+ ret i1 %c.3
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_diff_exit(ptr %vec, i8 %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_i8_loop_diff_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT: br i1 [[TMP3]], label %[[LATCH_EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EARLY_EXIT_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EARLY_EXIT_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[LATCH_EXIT_UNR_LCSSA:.*]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EARLY_EXIT_LOOPEXIT]]:
+; UNROLL2-NEXT: br label %[[EARLY_EXIT]]
+; UNROLL2: [[EARLY_EXIT]]:
+; UNROLL2-NEXT: ret i1 true
+; UNROLL2: [[LATCH_EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: br label %[[LATCH_EXIT]]
+; UNROLL2: [[LATCH_EXIT]]:
+; UNROLL2-NEXT: ret i1 false
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_i8_loop_diff_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; GENERIC-NEXT: [[ENTRY:.*]]:
+; GENERIC-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; GENERIC-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT: br label %[[LOOP_HEADER:.*]]
+; GENERIC: [[LOOP_HEADER]]:
+; GENERIC-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; GENERIC-NEXT: br i1 [[C_1]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC: [[LOOP_LATCH]]:
+; GENERIC-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; GENERIC-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT: br i1 [[C_2]], label %[[LATCH_EXIT:.*]], label %[[LOOP_HEADER]]
+; GENERIC: [[EARLY_EXIT]]:
+; GENERIC-NEXT: ret i1 true
+; GENERIC: [[LATCH_EXIT]]:
+; GENERIC-NEXT: ret i1 false
+;
+entry:
+ %start = load ptr, ptr %vec, align 8
+ %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+ %end = load ptr, ptr %gep.end, align 8
+ br label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+ %l = load i8, ptr %ptr.iv, align 8
+ %c.1 = icmp eq i8 %l, %tgt
+ br i1 %c.1, label %early.exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+ %c.2 = icmp eq ptr %ptr.iv.next, %end
+ br i1 %c.2, label %latch.exit, label %loop.header
+
+early.exit:
+ ret i1 1
+
+latch.exit:
+ ret i1 0
+}
+
+
+define i1 @multi_2_exiting_find_ptr_loop_same_exit(ptr %vec, ptr %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_ptr_loop_same_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT: [[ENTRY:.*]]:
+; UNROLL2-NEXT: [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; UNROLL2-NEXT: [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; UNROLL2-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8
+; UNROLL2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; UNROLL2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; UNROLL2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; UNROLL2-NEXT: [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; UNROLL2-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], -1
+; UNROLL2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1
+; UNROLL2-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2: [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT: [[L_PROL:%.*]] = load ptr, ptr [[START]], align 8
+; UNROLL2-NEXT: [[C_1_PROL:%.*]] = icmp eq ptr [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2: [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 8
+; UNROLL2-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2: [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 1
+; UNROLL2-NEXT: br i1 [[TMP6]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2: [[ENTRY_NEW]]:
+; UNROLL2-NEXT: br label %[[LOOP_HEADER:.*]]
+; UNROLL2: [[LOOP_HEADER]]:
+; UNROLL2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2: [[LOOP_LATCH]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; UNROLL2-NEXT: [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT: [[C_1_1:%.*]] = icmp eq ptr [[L_1]], [[TGT]]
+; UNROLL2-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2: [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT: [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 8
+; UNROLL2-NEXT: [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT: br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT: br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2: [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT: [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT: br label %[[EXIT]]
+; UNROLL2: [[EXIT]]:
+; UNROLL2-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT: ret i1 [[C_3]...
[truncated]
  return true;
}

static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE, |
This looks like a subset of getAppleRuntimeUnrollPreferences, can we unify it?
Sure, I could do that, although it's actually a superset because getAppleRuntimeUnrollPreferences seems to disallow loops with multiple exit blocks, i.e. both exiting blocks must have the same destination.
Hmm, sorry, maybe that's not right. I'm just looking at this code:
// Small search loops with multiple exits can be highly beneficial to unroll.
if (!L->getExitBlock()) {
Perhaps I've misunderstood what this is doing?
I thought at first this is saying if the loop doesn't have a single exit block then unroll. However, looking at the implementation of getExitBlock a bit more, I can see that it doesn't permit repeats. So I guess that means it should always return false for any early-exit loop.
The code block should handle small multi-exit loops as advertised (tests are in llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll). getExitBlock() should return the single exit block if the loop has a single exit & exiting block.
If there are multiple exiting blocks to the same exit, the exit block will be included multiple times in getExitBlocks(). This is different from getUniqueExitBlock, which would return the common exit block, even if there are multiple exiting blocks branching to it.
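For reference, a minimal sketch of the distinction (assuming the standard LoopInfo API; this helper is illustrative, not from the patch):

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// For a loop with two exiting blocks that both branch to the same exit
// block, getExitBlock() returns nullptr (the exit appears twice in
// getExitBlocks()), while getUniqueExitBlock() still returns the
// shared exit block.
static bool hasMultipleExitingEdgesToOneExit(const Loop *L) {
  return !L->getExitBlock() && L->getUniqueExitBlock() != nullptr;
}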
Hi, I'm not sure exactly what you meant by 'unify' and how far I should attempt to commonise code. I've tried my best - see what you think! The main difference is that now getAppleRuntimeUnrollPreferences bails out on multi-exit loops and defers to common code. The common code bails out for slightly more cases, for example if an intrinsic gets lowered to a call, if the loop already contains a vector type, or if a cost cannot be calculated. I assume that these are sensible things to do for all targets anyway. One side-effect of these changes is that it's now possible to override the default unroll factor of 4 - I've added a test to Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll for forcing unroll=2.
Hi @fhahn, I've tried to address your comments around trying to unify the code. Is it what you were expecting? Thanks!
  // Small search loops with multiple exits can be highly beneficial to unroll.
  // We only care about loops with exactly two exiting blocks, although each
  // block could jump to the same exit block.
SmallVector<BasicBlock *> Blocks(L->getBlocks()); |
I think we can avoid copying the blocks here by using ArrayRef or auto&:
ArrayRef<BasicBlock*> Blocks = L->getBlocks();
Done
  if (I.getType()->isVectorTy())
    return false;

if (isa<CallInst>(I) || isa<InvokeInst>(I)) { |
Since we are only working with CallBase below, we can rewrite it as:

if (auto *CB = dyn_cast<CallBase>(&I)) {
  if (const Function *F = CB->getCalledFunction())
    ....
}
Hmm, it's a good suggestion but I checked and it seems CallBrInst also derives from CallBase. I'm not sure if I should implicitly add support for that by permitting all instances of CallBase. If you don't mind I'll leave this as it is for now?
I didn't know about that. :) But it looks like, with the current approach, shouldUnrollLoopWithInstruction will return true for callbr. Do we want to unroll that kind of loop?
Yep, you're right! I've fixed it now.
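For reference, a minimal sketch of the shape of that fix (assumed; the exact patch may differ slightly):

  // callbr also derives from CallBase but shouldn't make the loop
  // eligible for unrolling, so reject it before the call/invoke handling.
  if (isa<CallBrInst>(I))
    return false;
  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
    if (const Function *F = cast<CallBase>(I).getCalledFunction())
      if (!TTI.isLoweredToCall(F))
        return true;
    return false;
  }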
✅ With the latest revision this PR passed the C/C++ code formatter.
I am not sure about adding a very specialized option for this. Is there a reason we couldn't take the right decision per uarch to start with? It sounds like that is planned down the road already?
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

static cl::opt<unsigned> SmallMultiExitLoopUF(
"small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden, |
The name makes it sound like a generally applicable flag, but it is something specific to the AArch64 cost model. I think ideally we would set things correctly per sub-target directly, which sounds like you are planning anyways, instead of adding this option first.
  return true;
}

static InstructionCost getSizeOfLoop(Loop *L, AArch64TTIImpl &TTI) { |
Can this also be shared with getAppleRuntimeUnrollPreferences? Although there we explicitly don't bail out on vector instructions. We only bail out on loops LV already vectorized.
Done!
  InstructionCost Size = getSizeOfLoop(L, TTI);
  if (!Size.isValid())
    return false;

  // Small search loops with multiple exits can be highly beneficial to unroll.
  // We only care about loops with exactly two exiting blocks, although each
  // block could jump to the same exit block.
  ArrayRef<BasicBlock *> Blocks = L->getBlocks();
  if (Blocks.size() != 2)
    return false;

  if (any_of(Blocks, [](BasicBlock *BB) {
        return !isa<BranchInst>(BB->getTerminator());
      }))
    return false;

return *Size.getValue() < 6; |
I guess that it would make most sense to share this with getAppleRuntimeUnrollPreferences, as at the moment the checks above are duplicated.
Well I was thinking that someone might be targeting a generic CPU with a particular architecture (perhaps this happens when targeting Android?), but still want a way to get the benefit of unrolling multi-exit loops such as std::find. Having said that, if the consensus is to avoid introducing a flag I can remove it?
I was thinking that someone might be targeting a generic CPU with a particular architecture (perhaps this happens when targeting Android?), but still want a way to get the benefit of unrolling multi-exit loops such as std::find. Having said that, if the consensus is to avoid introducing a flag I can remove it?
Having the option sounds useful for debugging - we can use it to disable the unrolling if we find issues. Perhaps rename it to -aarch64-small-multi-exit-loop-unroll-factor?
I gave this a try and it seemed OK. The code size, which was the thing I was worried about, did not increase by too much.
This looks OK to me if @fhahn is OK with the new code.
  return true;
}

static unsigned getLoopSize(Loop *L, AArch64TTIImpl &TTI, |
The two uses make this look like it should return a bool, and maybe be named something like isLoopSizeOverBudget. If the Size is needed, it could be returned in an unsigned& arg, or return an optional?
Would also be good to document.
I've renamed this to isLoopSizeWithinBudget so that it returns a boolean instead, and passes the final size in an optional argument. I've also added some comments to the function. Hope this is better!
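For reference, a hedged sketch of the resulting interface within AArch64TargetTransformInfo.cpp (assumed names and signature, mirroring the cost walk quoted earlier):

// Returns true if the code-size cost of the loop stays within Budget;
// on success, optionally reports the computed size via FinalSize.
static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
                                   InstructionCost Budget,
                                   unsigned *FinalSize) {
  InstructionCost LoopCost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      SmallVector<const Value *, 4> Operands(I.operand_values());
      InstructionCost Cost = TTI.getInstructionCost(
          &I, Operands, TargetTransformInfo::TCK_CodeSize);
      // Bail out on instructions without a valid cost, e.g. intrinsics
      // that don't currently have a cost model.
      if (!Cost.isValid())
        return false;
      LoopCost += Cost;
      if (LoopCost > Budget)
        return false;
    }
  }
  if (FinalSize)
    *FinalSize = *LoopCost.getValue();
  return true;
}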
I am not sure about adding a very specialized option for this. Is there a reason we couldn't take the right decision per uarch to start with? It sounds like that is planned down the road already?
Well I was thinking that someone might be targeting a generic CPU with a particular architecture (perhaps this happens when targeting Android?), but still want a way to get the benefit of unrolling multi-exit loops such as std::find. Having said that, if the consensus is to avoid introducing a flag I can remove it?
I don't think we generally encourage users to use internal LLVM options; they are mostly for LLVM developers, unless exposed explicitly via Clang or other drivers. And those flags are also not guaranteed to be stable.
Another alternative, as a middle ground, would be to introduce a target feature instead of a flag; this can be specified via the command line and it will be easy to add it for CPUs where it's profitable.
But I think it would still be good to enable it for the CPUs that are known to benefit straight away.
Having the option sounds useful for debugging - we can use it to disable the unrolling if we find issues. Perhaps rename it to -aarch64-small-multi-exit-loop-unroll-factor?
I gave this a try and it seemed OK. The code size, which was the thing I was worried about, did not increase by too much.
If it is just about disabling unrolling for such loops, it might be good to check if the existing unroll-runtime-multi-exit option is explicitly set to false? This is something the current implementation for Apple CPUs doesn't do, but I think it would be reasonable to expect this flag to override the target-specific decisions?
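For illustration, a minimal sketch of that idea (assumed names; the option itself is defined in LoopUnrollRuntime.cpp):

  // An explicitly supplied -unroll-runtime-multi-exit always wins, so
  // =false can reliably disable the transform; otherwise fall back to
  // the target's preference.
  if (UnrollRuntimeMultiExit.getNumOccurrences())
    return UnrollRuntimeMultiExit;
  return ULO.RuntimeUnrollMultiExit;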
  return true;
}

static unsigned getLoopSize(Loop *L, AArch64TTIImpl &TTI, |
Would also be good to document.
I don't believe we want or should be adding a new "public" way to control this optimisation and so we shouldn't add anything binding like a target feature flag. The intent is really about having finer control for environments where it would not be easy to inject a new toolchain but a user is happy to try "secret" options using their existing toolchain, be it for functional or performance investigations. I do agree we should be more aggressive in enabling this optimisation by default for the platforms we know it will benefit. @fhahn: What is the down side to adding a dedicated option for this? If there's a problem with adding such flags then perhaps we should think about adding something akin to gcc's
Following reviewer suggestions I've just gone ahead and enabled this for neoverse-v1 and neoverse-v2. The testing I've done doesn't suggest any downsides in terms of performance for SPEC2017 or any significant increase in code size. I saw less than 0.1% increase in code size for SPEC2017 and I observed a 0.3% increase for the LLVM test suite. However, when building SPEC2017 with libc++ I saw about 6% and 9% performance improvements for xalancbmk on neoverse-v1 and neoverse-v2, respectively.
The option is just for debugging; it is not intended to be customer facing. Just in case we need to disable it again. If -unroll-runtime-multi-exit will reliably turn the option off then that would be good. It doesn't look like that is how the option works though, as it will not override ULO.RuntimeUnrollMultiExit.
We should enable this for all CPUs unless we have a good reason not to. We know this has a relatively small impact on code size and helps with big cores and little cores, like unrolling (providing it is generally fewer instructions). Unless we have some reason not to, we should just enable it universally. If it does for some reason need to be per-CPU then it should be a subtarget feature.
If the intention is to enable this for all AArch64 targets then yes, I agree this means we only need to be able to disable the option. To avoid confusion I will clarify my "we don't want a target feature" comment. Here I'm referring to public flags and specifically mean we don't want something at the
Update UnrollRuntimeLoopRemainder to always give priority to the UnrollRuntimeMultiExit option, if provided. After ad9da92, we would ignore the option if the backend indicates multi-exit is profitable. This means it cannot be used to disable runtime unrolling. To be consistent with canProfitablyRuntimeUnrollMultiExitLoop, always respect the option. This surfaced while discussing llvm#131998.
I think this was something that was missed when adding
Update UnrollRuntimeLoopRemainder to always give priority to the UnrollRuntimeMultiExit option, if provided. After ad9da92 (#124462), we would ignore the option if the backend indicates multi-exit is profitable. This means it cannot be used to disable runtime unrolling. To be consistent with canProfitablyRuntimeUnrollMultiExitLoop, always respect the option. This surfaced while discussing #131998. PR: #134259
…TTI. (#134259) Update UnrollRuntimeLoopRemainder to always give priority to the UnrollRuntimeMultiExit option, if provided. After ad9da92 (llvm/llvm-project#124462), we would ignore the option if the backend indicates multi-exit is profitable. This means it cannot be used to disable runtime unrolling. To be consistent with canProfitablyRuntimeUnrollMultiExitLoop, always respect the option. This surfaced while discussing llvm/llvm-project#131998. PR: llvm/llvm-project#134259
I've now enabled unrolling of small, multi-exit loops by default for all cores and removed the extra flag I added previously, in favour of the existing unroll-runtime-multi-exit option.
LGTM
  InstructionCost LoopCost = 0;

  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
    return 0;
return false;
Same in a couple of places below.
Yep, good spot. Fixed now, thanks!
@@ -3,7 +3,7 @@
 ; RUN: opt -p loop-unroll -mcpu=apple-m2 -S %s | FileCheck --check-prefix=APPLE %s
 ; RUN: opt -p loop-unroll -mcpu=apple-m3 -S %s | FileCheck --check-prefix=APPLE %s
 ; RUN: opt -p loop-unroll -mcpu=apple-m4 -S %s | FileCheck --check-prefix=APPLE %s
-; RUN: opt -p loop-unroll -mcpu=apple-m1 -unroll-runtime-multi-exit=false -S %s | FileCheck --check-prefix=OTHER %s
+; RUN: opt -p loop-unroll -mcpu=apple-m1 -unroll-runtime-multi-exit=false -S %s | FileCheck --check-prefix=NOUNROLL %s
; RUN: opt -p loop-unroll -mcpu=cortex-a57 -S %s | FileCheck --check-prefix=OTHER %s |
I think that line can be dropped now, as other CPU behavior is now covered by other tests
Assuming I've understood which line you meant, I've deleted the -mcpu=cortex-a57 line.
  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
    return false;

  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (!shouldUnrollLoopWithInstruction(I, TTI))
Would it make more sense to check this separately? If not, would be good to update the name to make it clearer that this does not only check if the loop size is within a budget?
I've kept it as part of the loop, because even the previous loop checking the size for the Apple preferences bailed out on call instructions. It seems a bit inefficient to walk the loop twice, so I've renamed it to canUnrollLoopWithinBudget. I've also moved the findStringMetadataForLoop call out into shouldUnrollMultiExitLoop, since there's no need to check it here.
Ok that's fine, although now we potentially check shouldUnrollLoopWithInstruction 3 times. Would it be better and more efficient to just check this once, before // Apply subtarget-specific unrolling preferences?
I plan to land this patch this morning if nobody has any objections.
@@ -4542,43 +4631,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   if (!L->isInnermost() || L->getNumBlocks() > 8)
     return;

// This is handled by common code. |
- // This is handled by common code.
+ // Loops with multiple exits are handled by common code.
Done
  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
    return false;
I think this is probably dead code, as the current version of canUnrollLoopWithinBudget forbids loops with vector instructions. This is a change to the current behavior for the Apple unrolling preferences, which allowed loops with vector code (e.g. intrinsics), but not loops vectorized by LV, which should already be unrolled by LV.
If there are enough vector execution units, I don't think there's a need to disallow vector instructions from unrolling, but that may require more target info, so keeping the bail-out in shouldUnrollLoopWithInstruction is fine for now.
Is it dead code? Doesn't the loop vectoriser mark this as vectorised when using something like VF=1,IC=2? Having said that, I'm happy to remove it if you don't think it adds any value.
I've removed it anyway
LV should disable runtime unrolling explicitly via metadata, and after running LV only runtime unrolling should run. I guess in theory there may be edge cases where we fail to fully unroll initially and maybe fully unroll after vectorization, but that should probably be fine.
LGTM, thanks
  // unrolling.
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      if (!shouldUnrollLoopWithInstruction(I, *this))
nit: now that this has a single user, could be inlined again
Done!
@@ -4528,6 +4528,88 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }

static bool shouldUnrollLoopWithInstruction(Instruction &I, |
If not inlined back, would be good to document.
It can be highly beneficial to unroll small, two-block search loops that look for a value in an array. An example of this would be something that uses std::find to find a value in libc++. Older versions of std::find in the libstdc++ headers are manually unrolled in the source code, but this might change in newer releases where the compiler is expected to either vectorise or unroll itself.
Rebased and ran "make check-all" locally. Seems ok.