
[AArch64] Enable unrolling for small multi-exit loops #131998


Merged: 5 commits into llvm:main on Apr 9, 2025

Conversation

@david-arm (Contributor) commented Mar 19, 2025

It can be highly beneficial to unroll small, two-block search loops
that look for a value in an array, for example code that uses libc++'s
std::find to locate a value. Older versions of std::find in the
libstdc++ headers are manually unrolled in the source code, but this
may change in newer releases, where the compiler is expected to either
vectorise or unroll the loop itself.
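
For illustration, here is a minimal sketch (an invented example, not code from the patch) of the kind of two-block search loop being targeted:

// A find-style loop: two blocks, two exits (value found mid-iteration, or
// the end of the range reached), and a trip count unknown at compile time.
const char *find_byte(const char *p, const char *end, char tgt) {
  for (; p != end; ++p) // latch exit: reached the end of the range
    if (*p == tgt)      // early exit: found the value
      return p;
  return end;
}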

@llvmbot (Member) commented Mar 19, 2025

@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

This patch adds a new flag -small-multi-exit-loop-unroll-factor
that controls the amount of unrolling for such loops. This is
currently off by default, but in a future patch I plan to enable
this for some targets along with details of any performance
improvements.


Patch is 40.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131998.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+80-10)
  • (added) llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll (+713)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7cec8a17dfaaa..d3850217f1358 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
 static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                       cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned> SmallMultiExitLoopUF(
+    "small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
+    cl::desc(
+        "Force unrolling of small multi-exit loops with given unroll factor"));
+
 // A complete guess as to a reasonable cost.
 static cl::opt<unsigned>
     BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
@@ -4237,6 +4242,70 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+static bool shouldUnrollLoopWithInstruction(Instruction &I,
+                                            AArch64TTIImpl &TTI) {
+  // Don't unroll vectorised loop.
+  if (I.getType()->isVectorTy())
+    return false;
+
+  if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+    if (const Function *F = cast<CallBase>(I).getCalledFunction())
+      if (!TTI.isLoweredToCall(F))
+        return true;
+    return false;
+  }
+
+  return true;
+}
+
+static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                           AArch64TTIImpl &TTI) {
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  SmallVector<BasicBlock *> Blocks(L->getBlocks());
+  if (Blocks.size() != 2 || L->getExitingBlock())
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Estimate the size of the loop.
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!shouldUnrollLoopWithInstruction(I, TTI))
+        return false;
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+      Size += *Cost.getValue();
+    }
+  }
+
+  return Size < 6;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
 /// OOO engine's wide instruction window and various predictors.
 static void
@@ -4412,22 +4481,23 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
+  if (SmallMultiExitLoopUF && shouldUnrollSmallMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = SmallMultiExitLoopUF;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+  }
+
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
   // unrolling.
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
-      // Don't unroll vectorised loop.
-      if (I.getType()->isVectorTy())
+      if (!shouldUnrollLoopWithInstruction(I, *this))
         return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
     }
   }
 
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
new file mode 100644
index 0000000000000..b799b4328400a
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
@@ -0,0 +1,713 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -mcpu=generic -small-multi-exit-loop-unroll-factor=2 -S %s | FileCheck --check-prefixes=COMMON,UNROLL2 %s
+; RUN: opt -p loop-unroll -mcpu=generic -S %s | FileCheck --check-prefixes=COMMON,GENERIC %s
+
+target triple = "aarch64-linux-gnu"
+
+define i1 @multi_2_exiting_find_i8_loop_same_exit(ptr %vec, i8 %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_i8_loop_same_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL2-NEXT:  [[ENTRY:.*]]:
+; UNROLL2-NEXT:    [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT:    [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT:    [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT:    [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2:       [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2:       [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT:    [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT:    [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2:       [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2:       [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT:    [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT:    [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2:       [[ENTRY_NEW]]:
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; UNROLL2:       [[LOOP_HEADER]]:
+; UNROLL2-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2:       [[LOOP_LATCH]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT:    [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT:    [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2:       [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT:    [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT:    br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT:    [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2:       [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT:    [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT:    br label %[[EXIT]]
+; UNROLL2:       [[EXIT]]:
+; UNROLL2-NEXT:    [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT:    [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT:    ret i1 [[C_3]]
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_i8_loop_same_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] {
+; GENERIC-NEXT:  [[ENTRY:.*]]:
+; GENERIC-NEXT:    [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT:    [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; GENERIC-NEXT:    [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT:    br label %[[LOOP_HEADER:.*]]
+; GENERIC:       [[LOOP_HEADER]]:
+; GENERIC-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; GENERIC-NEXT:    br i1 [[C_1]], label %[[EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC:       [[LOOP_LATCH]]:
+; GENERIC-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; GENERIC-NEXT:    [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT:    br i1 [[C_2]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; GENERIC:       [[EXIT]]:
+; GENERIC-NEXT:    [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[END]], %[[LOOP_LATCH]] ]
+; GENERIC-NEXT:    [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; GENERIC-NEXT:    ret i1 [[C_3]]
+;
+entry:
+  %start = load ptr, ptr %vec, align 8
+  %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+  %end = load ptr, ptr %gep.end, align 8
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+  %l = load i8, ptr %ptr.iv, align 8
+  %c.1 = icmp eq i8 %l, %tgt
+  br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+  %c.2 = icmp eq ptr %ptr.iv.next, %end
+  br i1 %c.2, label %exit, label %loop.header
+
+exit:
+  %res = phi ptr [ %ptr.iv, %loop.header ], [ %end, %loop.latch ]
+  %c.3 = icmp eq ptr %res, %end
+  ret i1 %c.3
+}
+
+
+define i1 @multi_2_exiting_find_i8_loop_diff_exit(ptr %vec, i8 %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_i8_loop_diff_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT:  [[ENTRY:.*]]:
+; UNROLL2-NEXT:    [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT:    [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; UNROLL2-NEXT:    [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT:    [[TMP0:%.*]] = sub i64 [[END1]], [[START2]]
+; UNROLL2-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
+; UNROLL2-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], -1
+; UNROLL2-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP1]], 1
+; UNROLL2-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2:       [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2:       [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT:    [[L_PROL:%.*]] = load i8, ptr [[START]], align 8
+; UNROLL2-NEXT:    [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1_PROL]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2:       [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 1
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2:       [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT:    [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; UNROLL2-NEXT:    br i1 [[TMP3]], label %[[LATCH_EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2:       [[ENTRY_NEW]]:
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; UNROLL2:       [[LOOP_HEADER]]:
+; UNROLL2-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1]], label %[[EARLY_EXIT_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2:       [[LOOP_LATCH]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; UNROLL2-NEXT:    [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT:    [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1_1]], label %[[EARLY_EXIT_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2:       [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1
+; UNROLL2-NEXT:    [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT:    br i1 [[C_2_1]], label %[[LATCH_EXIT_UNR_LCSSA:.*]], label %[[LOOP_HEADER]]
+; UNROLL2:       [[EARLY_EXIT_LOOPEXIT]]:
+; UNROLL2-NEXT:    br label %[[EARLY_EXIT]]
+; UNROLL2:       [[EARLY_EXIT]]:
+; UNROLL2-NEXT:    ret i1 true
+; UNROLL2:       [[LATCH_EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT:    br label %[[LATCH_EXIT]]
+; UNROLL2:       [[LATCH_EXIT]]:
+; UNROLL2-NEXT:    ret i1 false
+;
+; GENERIC-LABEL: define i1 @multi_2_exiting_find_i8_loop_diff_exit(
+; GENERIC-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0]] {
+; GENERIC-NEXT:  [[ENTRY:.*]]:
+; GENERIC-NEXT:    [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; GENERIC-NEXT:    [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 1
+; GENERIC-NEXT:    [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; GENERIC-NEXT:    br label %[[LOOP_HEADER:.*]]
+; GENERIC:       [[LOOP_HEADER]]:
+; GENERIC-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[START]], %[[ENTRY]] ]
+; GENERIC-NEXT:    [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8
+; GENERIC-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]]
+; GENERIC-NEXT:    br i1 [[C_1]], label %[[EARLY_EXIT:.*]], label %[[LOOP_LATCH]]
+; GENERIC:       [[LOOP_LATCH]]:
+; GENERIC-NEXT:    [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1
+; GENERIC-NEXT:    [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]]
+; GENERIC-NEXT:    br i1 [[C_2]], label %[[LATCH_EXIT:.*]], label %[[LOOP_HEADER]]
+; GENERIC:       [[EARLY_EXIT]]:
+; GENERIC-NEXT:    ret i1 true
+; GENERIC:       [[LATCH_EXIT]]:
+; GENERIC-NEXT:    ret i1 false
+;
+entry:
+  %start = load ptr, ptr %vec, align 8
+  %gep.end = getelementptr inbounds nuw i8, ptr %vec, i64 1
+  %end = load ptr, ptr %gep.end, align 8
+  br label %loop.header
+
+loop.header:
+  %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %start, %entry ]
+  %l = load i8, ptr %ptr.iv, align 8
+  %c.1 = icmp eq i8 %l, %tgt
+  br i1 %c.1, label %early.exit, label %loop.latch
+
+loop.latch:
+  %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 1
+  %c.2 = icmp eq ptr %ptr.iv.next, %end
+  br i1 %c.2, label %latch.exit, label %loop.header
+
+early.exit:
+  ret i1 1
+
+latch.exit:
+  ret i1 0
+}
+
+
+define i1 @multi_2_exiting_find_ptr_loop_same_exit(ptr %vec, ptr %tgt) {
+; UNROLL2-LABEL: define i1 @multi_2_exiting_find_ptr_loop_same_exit(
+; UNROLL2-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]]) #[[ATTR0]] {
+; UNROLL2-NEXT:  [[ENTRY:.*]]:
+; UNROLL2-NEXT:    [[START:%.*]] = load ptr, ptr [[VEC]], align 8
+; UNROLL2-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; UNROLL2-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[START]], i64 8) ]
+; UNROLL2-NEXT:    [[GEP_END:%.*]] = getelementptr inbounds nuw i8, ptr [[VEC]], i64 8
+; UNROLL2-NEXT:    [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8
+; UNROLL2-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; UNROLL2-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], -8
+; UNROLL2-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; UNROLL2-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
+; UNROLL2-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; UNROLL2-NEXT:    [[TMP4:%.*]] = freeze i64 [[TMP3]]
+; UNROLL2-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], -1
+; UNROLL2-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP4]], 1
+; UNROLL2-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; UNROLL2-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_HEADER_PROL_PREHEADER:.*]], label %[[LOOP_HEADER_PROL_LOOPEXIT:.*]]
+; UNROLL2:       [[LOOP_HEADER_PROL_PREHEADER]]:
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER_PROL:.*]]
+; UNROLL2:       [[LOOP_HEADER_PROL]]:
+; UNROLL2-NEXT:    [[L_PROL:%.*]] = load ptr, ptr [[START]], align 8
+; UNROLL2-NEXT:    [[C_1_PROL:%.*]] = icmp eq ptr [[L_PROL]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP_LATCH_PROL:.*]]
+; UNROLL2:       [[LOOP_LATCH_PROL]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[START]], i64 8
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER_PROL_LOOPEXIT]]
+; UNROLL2:       [[LOOP_HEADER_PROL_LOOPEXIT]]:
+; UNROLL2-NEXT:    [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[END]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT:    [[PTR_IV_UNR:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ]
+; UNROLL2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 1
+; UNROLL2-NEXT:    br i1 [[TMP6]], label %[[EXIT:.*]], label %[[ENTRY_NEW:.*]]
+; UNROLL2:       [[ENTRY_NEW]]:
+; UNROLL2-NEXT:    br label %[[LOOP_HEADER:.*]]
+; UNROLL2:       [[LOOP_HEADER]]:
+; UNROLL2-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_1:%.*]], %[[LOOP_LATCH_1:.*]] ]
+; UNROLL2-NEXT:    [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8
+; UNROLL2-NEXT:    [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]]
+; UNROLL2:       [[LOOP_LATCH]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8
+; UNROLL2-NEXT:    [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT]], align 8
+; UNROLL2-NEXT:    [[C_1_1:%.*]] = icmp eq ptr [[L_1]], [[TGT]]
+; UNROLL2-NEXT:    br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1]]
+; UNROLL2:       [[LOOP_LATCH_1]]:
+; UNROLL2-NEXT:    [[PTR_IV_NEXT_1]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 8
+; UNROLL2-NEXT:    [[C_2_1:%.*]] = icmp eq ptr [[PTR_IV_NEXT_1]], [[END]]
+; UNROLL2-NEXT:    br i1 [[C_2_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]]
+; UNROLL2:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; UNROLL2-NEXT:    [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[END]], %[[LOOP_LATCH_1]] ]
+; UNROLL2-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; UNROLL2:       [[EXIT_UNR_LCSSA]]:
+; UNROLL2-NEXT:    [[RES_PH:%.*]] = phi ptr [ [[START]], %[[LOOP_HEADER_PROL]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; UNROLL2-NEXT:    br label %[[EXIT]]
+; UNROLL2:       [[EXIT]]:
+; UNROLL2-NEXT:    [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ]
+; UNROLL2-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[END]], i64 8) ]
+; UNROLL2-NEXT:    [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]]
+; UNROLL2-NEXT:    ret i1 [[C_3]...
[truncated]

return true;
}

static bool shouldUnrollSmallMultiExitLoop(Loop *L, ScalarEvolution &SE,
Contributor:

This looks like a subset of getAppleRuntimeUnrollPreferences; can we unify it?

Contributor Author:

Sure, I could do that, although it's actually a superset: getAppleRuntimeUnrollPreferences seems to disallow loops with multiple exit blocks, i.e. both exiting blocks must have the same destination.

Contributor Author:

Hmm, sorry, maybe that's not right. I'm just looking at this code:

  // Small search loops with multiple exits can be highly beneficial to unroll.
  if (!L->getExitBlock()) {

Perhaps I've misunderstood what this is doing?

Contributor Author:

I thought at first this was saying "if the loop doesn't have a single exit block then unroll"; however, looking at the implementation of getExitBlock a bit more, I can see that it doesn't permit repeats. So I guess that means getExitBlock should always return null for any early-exit loop.

Contributor:

The code block should handle small multi-exit loops as advertised (tests are in llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll). getExitBlock() should return the single exit block if the loop has a single exit and a single exiting block.

If there are multiple exiting blocks branching to the same exit, the exit block will be included multiple times in getExitBlocks(); this is different from getUniqueExitBlock(), which would return the common exit block even if there are multiple exiting blocks branching to it.
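
To illustrate the distinction, a minimal sketch (my own, assuming the two-exiting-block find loop discussed here, where both exits branch to the same block):

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// getExitBlocks() records one entry per exiting edge, so for a find loop
// whose header and latch both branch to %exit, %exit appears twice and
// getExitBlock() returns nullptr. getUniqueExitBlock() deduplicates first,
// so it returns %exit.
static bool hasSingleExitEdge(const Loop *L) {
  return L->getExitBlock() != nullptr;       // false for the find loop
}

static bool hasCommonExitBlock(const Loop *L) {
  return L->getUniqueExitBlock() != nullptr; // true for the find loop
}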

Contributor Author:

Hi, I'm not sure exactly what you meant by "unify" and how far I should attempt to commonise the code. I've tried my best - see what you think! The main difference is that getAppleRuntimeUnrollPreferences now bails out on multi-exit loops and defers to common code. The common code bails out in slightly more cases, for example if an intrinsic gets lowered to a call, if the loop already contains a vector type, or if a cost cannot be calculated. I assume these are sensible things to do for all targets anyway. One side effect of these changes is that it's now possible to override the default unroll factor of 4 - I've added a test to Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll forcing unroll=2.

@david-arm (Contributor Author):

Hi @fhahn, I've tried to address your comments around trying to unify the code. Is it what you were expecting? Thanks!

// Small search loops with multiple exits can be highly beneficial to unroll.
// We only care about loops with exactly two exiting blocks, although each
// block could jump to the same exit block.
SmallVector<BasicBlock *> Blocks(L->getBlocks());
Contributor:

I think we can avoid copying the blocks here by using ArrayRef or auto&:

ArrayRef<BasicBlock*> Blocks = L->getBlocks();

Contributor Author:

Done

if (I.getType()->isVectorTy())
  return false;

if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
Contributor:

Since we are only working with CallBase below, we can rewrite it as:

if (auto *CB = dyn_cast<CallBase>(&I)) {
  if (const Function *F = CB->getCalledFunction())
     ....
}

Contributor Author:

Hmm, it's a good suggestion, but I checked and it seems CallBrInst also derives from CallBase. I'm not sure I should implicitly add support for that by permitting all instances of CallBase. If you don't mind, I'll leave this as it is for now.

Contributor:

I didn't know about that. :) But it looks like, with the current approach, shouldUnrollLoopWithInstruction will return true for callbr. Do we want to unroll that kind of loop?

Contributor Author:

Yep, you're right! I've fixed it now.
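
Presumably the fix amounts to bailing out on callbr before the call/invoke handling, along these lines (a sketch; the committed code may differ):

// CallBrInst derives from CallBase but is neither a CallInst nor an
// InvokeInst, so the original checks fell through to `return true`.
if (isa<CallBrInst>(I))
  return false;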

github-actions bot commented Mar 27, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@fhahn (Contributor) left a comment:

I am not sure about adding a very specialized option for this. Is there a reason we couldn't make the right decision per uarch to start with? It sounds like that is planned down the road already?

@@ -68,6 +68,11 @@ static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
cl::init(true), cl::Hidden);

static cl::opt<unsigned> SmallMultiExitLoopUF(
"small-multi-exit-loop-unroll-factor", cl::init(0), cl::Hidden,
Contributor:

The name makes it sound like a generally applicable flag, but it is something specific to the AArch64 cost model. I think ideally we would set things correctly per sub-target directly, which sounds like what you are planning anyway, instead of adding this option first.

return true;
}

static InstructionCost getSizeOfLoop(Loop *L, AArch64TTIImpl &TTI) {
Contributor:

Can this also be shared with getAppleRuntimeUnrollPreferences? Although there we explicitly don't bail out on vector instructions; we only bail out on loops LV has already vectorized.

Contributor Author:

Done!

Comment on lines 4434 to 4450
InstructionCost Size = getSizeOfLoop(L, TTI);
if (!Size.isValid())
  return false;

// Small search loops with multiple exits can be highly beneficial to unroll.
// We only care about loops with exactly two exiting blocks, although each
// block could jump to the same exit block.
ArrayRef<BasicBlock *> Blocks = L->getBlocks();
if (Blocks.size() != 2)
  return false;

if (any_of(Blocks, [](BasicBlock *BB) {
      return !isa<BranchInst>(BB->getTerminator());
    }))
  return false;

return *Size.getValue() < 6;
Contributor:

I guess it would make most sense to share this with getAppleRuntimeUnrollPreferences, as the checks above are currently duplicated.

@david-arm (Contributor Author):

I am not sure about adding a very specialized option for this. Is there a reason we couldn't make the right decision per uarch to start with? It sounds like that is planned down the road already?

Well, I was thinking that someone might be targeting a generic CPU for a particular architecture (perhaps this happens when targeting Android?), but still want a way to get the benefit of unrolling multi-exit loops such as the one in std::find. Having said that, if the consensus is to avoid introducing a flag, I can remove it.

@davemgreen (Collaborator) left a comment:

I was thinking that someone might be targeting a generic CPU for a particular architecture (perhaps this happens when targeting Android?), but still want a way to get the benefit of unrolling multi-exit loops such as the one in std::find. Having said that, if the consensus is to avoid introducing a flag, I can remove it?

Having the option sounds useful for debugging - we can use it to disable the unrolling if we find issues. Perhaps rename it to -aarch64-small-multi-exit-loop-unroll-factor?

I gave this a try and it seemed OK. The code size, which was the thing I was worried about, did not increase by too much.

@davemgreen (Collaborator) left a comment:

This looks OK to me if @fhahn is OK with the new code.

return true;
}

static unsigned getLoopSize(Loop *L, AArch64TTIImpl &TTI,
Collaborator:

The two uses make this look like it should return a bool, and maybe be named something like isLoopSizeOverBudget. If the Size is needed, it could be returned in an unsigned& arg, or return an optional?

Contributor:

Would also be good to document.

Contributor Author:

I've renamed this to isLoopSizeWithinBudget so that it returns a boolean instead and passes the final size via an optional argument. I've also added some comments to the function. Hope this is better!
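
For reference, a sketch of the helper shape described above, using the names from this thread (the exact signature and budget handling in the committed patch may differ):

// Returns true if the loop's code-size cost stays within Budget. If
// FinalSize is non-null, it receives the computed size on success.
static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
                                   InstructionCost Budget,
                                   unsigned *FinalSize) {
  InstructionCost LoopCost = 0;
  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {
      SmallVector<const Value *, 4> Operands(I.operand_values());
      InstructionCost Cost =
          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
      // Bail out on instructions without a valid cost model.
      if (!Cost.isValid())
        return false;
      LoopCost += Cost;
      if (LoopCost > Budget)
        return false;
    }
  }
  if (FinalSize)
    *FinalSize = *LoopCost.getValue();
  return true;
}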

@fhahn (Contributor) left a comment:

I am not sure about adding a very specialized option for this. Is there a reason we couldn't make the right decision per uarch to start with? It sounds like that is planned down the road already?

Well, I was thinking that someone might be targeting a generic CPU for a particular architecture (perhaps this happens when targeting Android?), but still want a way to get the benefit of unrolling multi-exit loops such as the one in std::find. Having said that, if the consensus is to avoid introducing a flag, I can remove it?

I don't think we generally encourage users to use internal LLVM options; they're mostly for LLVM developers, unless exposed explicitly via Clang or other drivers. And those flags are also not guaranteed to be stable.

Another alternative, as a middle ground, would be to introduce a target feature instead of a flag; this can be specified via the command line and would be easy to add for CPUs where it's profitable.

But I think it would still be good to enable it for the CPUs that are known to benefit straight away.

Having the option sounds useful for debugging - we can use it to disable the unrolling if we find issues. Perhaps rename it to -aarch64-small-multi-exit-loop-unroll-factor?

I gave this a try and it seemed OK. The code size, which was the thing I was worried about, did not increase by too much.

If it is just about disabling unrolling for such loops, it might be good to check if the existing unroll-runtime-multi-exit option is explicitly set to false. This is something the current implementation for Apple CPUs doesn't do, but I think it would be reasonable to expect this flag to override the target-specific decisions.


@paulwalker-arm (Collaborator):

I don't believe we want or should be adding a new "public" way to control this optimisation, and so we shouldn't add anything binding like a target feature flag. The intent is really about having finer control in environments where it would not be easy to inject a new toolchain, but where a user is happy to try "secret" options with their existing toolchain, be it for functional or performance investigations.

I do agree we should be more aggressive in enabling this optimisation by default for the platforms we know it will benefit.

@fhahn: What is the downside to adding a dedicated option for this? If there's a problem with adding such flags, then perhaps we should think about adding something akin to GCC's --param flag? Would there be a benefit to having one flag with many properties versus having many flags?

@david-arm (Contributor Author):

Following reviewer suggestions, I've just gone ahead and enabled this for neoverse-v1 and neoverse-v2. The testing I've done doesn't suggest any performance downsides for SPEC2017 or any significant increase in code size: I saw less than a 0.1% code size increase for SPEC2017 and a 0.3% increase for the LLVM test suite. However, when building SPEC2017 with libc++ I saw about 6% and 9% performance improvements for xalancbmk on neoverse-v1 and neoverse-v2, respectively.

@davemgreen (Collaborator):

The option is just for debugging; it is not intended to be customer-facing, just in case we need to disable it again. If -unroll-runtime-multi-exit will reliably turn the option off then that would be good. It doesn't look like that is how the option works, though, as it will not override ULO.RuntimeUnrollMultiExit.

Following reviewer suggestions, I've just gone ahead and enabled this for neoverse-v1 and neoverse-v2. The testing I've done doesn't suggest any performance downsides for SPEC2017 or any significant increase in code size: I saw less than a 0.1% code size increase for SPEC2017 and a 0.3% increase for the LLVM test suite. However, when building SPEC2017 with libc++ I saw about 6% and 9% performance improvements for xalancbmk on neoverse-v1 and neoverse-v2, respectively.

We should enable this for all CPUs unless we have a good reason not to. We know it has a relatively small impact on code size and, like unrolling in general, helps both big and little cores (providing it generally results in fewer instructions). If it does for some reason need to be per-CPU, then it should be a subtarget feature.

@paulwalker-arm (Collaborator) commented Apr 3, 2025:

If the intention is to enable this for all AArch64 targets then yes, I agree this means we only need to be able to disable the option. To avoid confusion, I will clarify my "we don't want a target feature" comment: here I'm referring to public flags, and specifically mean we don't want something at the -march level. -target-feature is effectively a secret flag, so I'm not against that.

fhahn added a commit to fhahn/llvm-project that referenced this pull request Apr 3, 2025
Update UnrollRuntimeLoopRemainder to always give priority to the
UnrollRuntimeMultiExit option, if provided.

After ad9da92, we would ignore the option if the backend
indicates multi-exit is profitable. This means it cannot be used to
disable runtime unrolling.

To be consistent with canProfitablyRuntimeUnrollMultiExitLoop, always
respect the option.

This surfaced while discussing llvm#131998.
@fhahn (Contributor) commented Apr 3, 2025:

The option is just for debugging; it is not intended to be customer-facing, just in case we need to disable it again. If -unroll-runtime-multi-exit will reliably turn the option off then that would be good. It doesn't look like that is how the option works, though, as it will not override ULO.RuntimeUnrollMultiExit.

I think this was something that was missed when adding RuntimeUnrollMultiExit. I think it makes sense for the option to override the profitability heuristic from TTI, the same as it overrides the generic profitability heuristics: #134259
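
A sketch of the proposed priority logic (assumed shape; see PR #134259 for the actual change):

// If -unroll-runtime-multi-exit was passed explicitly on the command line,
// let it win in either direction; otherwise fall back to the target's
// preference communicated via ULO.RuntimeUnrollMultiExit.
static bool allowMultiExitRuntimeUnroll(const UnrollLoopOptions &ULO) {
  if (UnrollRuntimeMultiExit.getNumOccurrences())
    return UnrollRuntimeMultiExit;
  return ULO.RuntimeUnrollMultiExit;
}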

fhahn added a commit that referenced this pull request Apr 4, 2025
Update UnrollRuntimeLoopRemainder to always give priority to the
UnrollRuntimeMultiExit option, if provided.

After ad9da92 (#124462),
we would ignore the option if the backend indicates multi-exit is profitable.
This means it cannot be used to disable runtime unrolling.

To be consistent with canProfitablyRuntimeUnrollMultiExitLoop, always
respect the option.

This surfaced while discussing #131998.

PR: #134259
@david-arm (Contributor Author):

I've now enabled unrolling of small multi-exit loops by default for all cores and removed the extra flag I added previously, in favour of the existing -unroll-runtime-multi-exit.
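
For example, mirroring the test RUN lines (a hypothetical invocation; the input file name is illustrative), the unrolling can be disabled again with:

opt -p loop-unroll -mcpu=neoverse-v2 -unroll-runtime-multi-exit=false -S input.ll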

@david-arm david-arm changed the title [AArch64] Add flag to control unrolling for small multi-exit loops [AArch64] Enable unrolling for small multi-exit loops Apr 4, 2025
@davemgreen (Collaborator) left a comment:

LGTM

InstructionCost LoopCost = 0;

if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
  return 0;
Collaborator:

Suggested change
return 0;
return false;

Same in a couple of places below.

Contributor Author:

Yep, good spot. Fixed now, thanks!

@@ -3,7 +3,7 @@
; RUN: opt -p loop-unroll -mcpu=apple-m2 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m3 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m4 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mcpu=apple-m1 -unroll-runtime-multi-exit=false -S %s | FileCheck --check-prefix=OTHER %s
; RUN: opt -p loop-unroll -mcpu=apple-m1 -unroll-runtime-multi-exit=false -S %s | FileCheck --check-prefix=NOUNROLL %s
; RUN: opt -p loop-unroll -mcpu=cortex-a57 -S %s | FileCheck --check-prefix=OTHER %s
Contributor:

I think that line can be dropped now, as the behavior for other CPUs is covered by other tests.

Contributor Author:

Assuming I've understood which line you meant, I've deleted the -mcpu=cortex-a57 line.

Comment on lines 4559 to 4564
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
  return false;

for (auto *BB : L->getBlocks()) {
  for (auto &I : *BB) {
    if (!shouldUnrollLoopWithInstruction(I, TTI))
Contributor:

Would it make more sense to check this separately? If not, it would be good to update the name to make it clearer that this does not only check whether the loop size is within a budget.

Contributor Author:

I've kept it as part of the loop, because even the previous loop checking the size for the Apple preferences bailed out on call instructions. It seems a bit inefficient to walk the loop twice, so I've renamed it to canUnrollLoopWithinBudget. I've also moved the findStringMetadataForLoop call out into shouldUnrollMultiExitLoop, since there's no need to check it here.

Contributor:

Ok, that's fine, although now we potentially check shouldUnrollLoopWithInstruction 3 times. Would it be better and more efficient to just check this once, before the "// Apply subtarget-specific unrolling preferences" comment?

@david-arm (Contributor Author):

I plan to land this patch this morning if nobody has any objections?

@@ -4542,43 +4631,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
if (!L->isInnermost() || L->getNumBlocks() > 8)
  return;

// This is handled by common code.
Contributor:

Suggested change
// This is handled by common code.
// Loops with multiple exits are handled by common code.

Contributor Author:

Done

Comment on lines 4598 to 4599
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
  return false;
Contributor:

I think this is probably dead code, as the current version of canUnrollLoopWithinBudget forbids loops with vector instructions. This is a change from the current behavior of the Apple unrolling preferences, which allowed loops with vector code (e.g. intrinsics) but not loops vectorized by LV, which should already be unrolled by LV.

If there are enough vector execution units, I don't think there's a need to disallow vector instructions from unrolling, but that may require more target info, so keeping the bail-out in shouldUnrollLoopWithInstruction is fine for now.

Contributor Author:

Is it dead code? Doesn't the loop vectoriser mark this as vectorised when using something like VF=1,IC=2? Having said that, I'm happy to remove it if you don't think it adds any value.

Contributor Author:

I've removed it anyway.

Contributor:

LV should disable runtime unrolling explicitly via metadata, and after running LV only runtime unrolling should run. I guess in theory there may be edge cases where we fail to fully unroll initially and maybe fully unroll after vectorization, but that should probably be fine.


@fhahn (Contributor) left a comment:

LGTM, thanks

// unrolling.
for (auto *BB : L->getBlocks()) {
  for (auto &I : *BB) {
    if (!shouldUnrollLoopWithInstruction(I, *this))
Contributor:

Nit: now that this has a single user, it could be inlined again.

Contributor Author:

Done!


@@ -4528,6 +4528,88 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}

static bool shouldUnrollLoopWithInstruction(Instruction &I,
Contributor:

If it's not inlined back, it would be good to document.

@david-arm (Contributor Author):

Rebased and ran "make check-all" locally. Seems ok.

@david-arm david-arm merged commit 712c213 into llvm:main Apr 9, 2025
7 of 9 checks passed
AllinLeeYL pushed a commit to AllinLeeYL/llvm-project that referenced this pull request Apr 10, 2025
var-const pushed a commit to ldionne/llvm-project that referenced this pull request Apr 17, 2025