Commit 96f9ad3
[AArch64] Runtime-unroll small load/store loops for Apple Silicon CPUs. (llvm#118317)
Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs.

To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2].

We also explicitly avoid runtime unrolling for loop structures that may limit the expected gains: loops with complex control flow (not innermost, multiple exits, or a large number of blocks), loops whose trip-count expansion is expensive, and loops expected to execute only a small number of iterations. Note that the heuristics here may be overly conservative, and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement.

Across a large set of workloads, this increases the total number of unrolled loops by 2.9%.

[1] 4.6.10 in Apple Silicon CPU Optimization Guide
[2] 4.4.4 in Apple Silicon CPU Optimization Guide

Depends on llvm#118316 for TTI changes.

PR: llvm#118317
(cherry picked from commit 0bb7bd4)
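For illustration only (not part of the commit message), the kind of loop these heuristics target is a small, single-block loop with a non-constant trip count whose store consumes a value loaded in the same iteration. A hypothetical C++ analogue of the @small_load_store_loop test updated below:

// Hypothetical example mirroring the IR test; the function name is illustrative.
void scaled_copy(const float *src, float *dst, long long n, long long scale) {
  for (long long i = 0; i < n; ++i)
    dst[i] = src[i * scale]; // load feeds the store: a load/store dependency
}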
1 parent 1b0e7b1 commit 96f9ad3

File tree: 2 files changed (+179, -7 lines)

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 101 additions & 3 deletions
@@ -3611,6 +3611,92 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
+/// OOO engine's wide instruction window and various predictors.
+static void
+getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
+                                 TargetTransformInfo::UnrollingPreferences &UP,
+                                 AArch64TTIImpl &TTI) {
+  // Limit loops with structure that is highly likely to benefit from runtime
+  // unrolling; that is we exclude outer loops, loops with multiple exits and
+  // many blocks (i.e. likely with complex control flow). Note that the
+  // heuristics here may be overly conservative and we err on the side of
+  // avoiding runtime unrolling rather than unroll excessively. They are all
+  // subject to further refinement.
+  if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
+    return;
+
+  const SCEV *BTC = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
+      (SE.getSmallConstantMaxTripCount(L) > 0 &&
+       SE.getSmallConstantMaxTripCount(L) <= 32))
+    return;
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+    return;
+
+  int64_t Size = 0;
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
+        return;
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      Size +=
+          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
+    }
+  }
+
+  // Limit to loops with trip counts that are cheap to expand.
+  UP.SCEVExpansionBudget = 1;
+
+  // Try to unroll small, single block loops, if they have load/store
+  // dependencies, to expose more parallel memory access streams.
+  if (L->getHeader() != L->getLoopLatch() || Size > 8)
+    return;
+
+  SmallPtrSet<Value *, 8> LoadedValues;
+  SmallVector<StoreInst *> Stores;
+  for (auto *BB : L->blocks()) {
+    for (auto &I : *BB) {
+      Value *Ptr = getLoadStorePointerOperand(&I);
+      if (!Ptr)
+        continue;
+      const SCEV *PtrSCEV = SE.getSCEV(Ptr);
+      if (SE.isLoopInvariant(PtrSCEV, L))
+        continue;
+      if (isa<LoadInst>(&I))
+        LoadedValues.insert(&I);
+      else
+        Stores.push_back(cast<StoreInst>(&I));
+    }
+  }
+
+  // Try to find an unroll count that maximizes the use of the instruction
+  // window, i.e. trying to fetch as many instructions per cycle as possible.
+  unsigned MaxInstsPerLine = 16;
+  unsigned UC = 1;
+  unsigned BestUC = 1;
+  unsigned SizeWithBestUC = BestUC * Size;
+  while (UC <= 8) {
+    unsigned SizeWithUC = UC * Size;
+    if (SizeWithUC > 48)
+      break;
+    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
+        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
+      BestUC = UC;
+      SizeWithBestUC = BestUC * Size;
+    }
+    UC++;
+  }
+
+  if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
+        return LoadedValues.contains(SI->getOperand(0));
+      }))
+    return;
+
+  UP.Runtime = true;
+  UP.DefaultUnrollRuntimeCount = BestUC;
+}
+
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP,
                                              OptimizationRemarkEmitter *ORE) {
@@ -3628,9 +3714,21 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
-  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
-      EnableFalkorHWPFUnrollFix)
-    getFalkorUnrollingPreferences(L, SE, UP);
+  // Apply subtarget-specific unrolling preferences.
+  switch (ST->getProcFamily()) {
+  case AArch64Subtarget::AppleA14:
+  case AArch64Subtarget::AppleA15:
+  case AArch64Subtarget::AppleA16:
+  case AArch64Subtarget::AppleM4:
+    getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
+    break;
+  case AArch64Subtarget::Falkor:
+    if (EnableFalkorHWPFUnrollFix)
+      getFalkorUnrollingPreferences(L, SE, UP);
+    break;
+  default:
+    break;
+  }
 
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
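As a sanity check of the unroll-count search above, the following self-contained sketch (not part of the patch) reproduces the BestUC selection loop so it can be experimented with in isolation. The Size value of 6 used for the test loop below is an assumption, but it is consistent with the x8 unroll the test expects, since 8 * 6 = 48 is a multiple of the assumed 16-instruction fetch granule.

#include <cstdio>

// Stand-alone re-implementation of the BestUC search from
// getAppleRuntimeUnrollPreferences, for trying out different loop body sizes.
static unsigned pickUnrollCount(unsigned Size) {
  const unsigned MaxInstsPerLine = 16; // assumed per-cycle fetch width
  unsigned BestUC = 1;
  unsigned SizeWithBestUC = BestUC * Size;
  for (unsigned UC = 1; UC <= 8; ++UC) {
    unsigned SizeWithUC = UC * Size;
    if (SizeWithUC > 48) // cap the size of the unrolled body
      break;
    // Prefer counts whose unrolled size fills fetch lines as evenly as possible.
    if ((SizeWithUC % MaxInstsPerLine) == 0 ||
        (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
      BestUC = UC;
      SizeWithBestUC = BestUC * Size;
    }
  }
  return BestUC;
}

int main() {
  printf("Size 6 -> unroll count %u\n", pickUnrollCount(6)); // 8 (48 fills 3 fetch lines)
  printf("Size 5 -> unroll count %u\n", pickUnrollCount(5)); // 3
  printf("Size 7 -> unroll count %u\n", pickUnrollCount(7)); // 2
  return 0;
}

In the actual hook, the chosen count is only applied when BestUC is greater than 1 and at least one store in the loop stores a previously loaded value; otherwise the unrolling preferences are left untouched.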

llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll

Lines changed: 78 additions & 4 deletions
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
 ; APPLE-LABEL: define void @small_load_store_loop(
 ; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; APPLE-NEXT:  [[ENTRY:.*]]:
+; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[N]], 7
+; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
+; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
+; APPLE:       [[ENTRY_NEW]]:
+; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
 ; APPLE-NEXT:    br label %[[LOOP:.*]]
 ; APPLE:       [[LOOP]]:
-; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
+; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
 ; APPLE-NEXT:    [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
 ; APPLE-NEXT:    [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
 ; APPLE-NEXT:    [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
 ; APPLE-NEXT:    [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
 ; APPLE-NEXT:    store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
-; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
-; APPLE-NEXT:    br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
+; APPLE-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT:    [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
+; APPLE-NEXT:    [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
+; APPLE-NEXT:    [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
+; APPLE-NEXT:    store float [[L_1]], ptr [[GEP_DST_1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
+; APPLE-NEXT:    [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
+; APPLE-NEXT:    [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
+; APPLE-NEXT:    [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
+; APPLE-NEXT:    store float [[L_2]], ptr [[GEP_DST_2]], align 4
+; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
+; APPLE-NEXT:    [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
+; APPLE-NEXT:    [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
+; APPLE-NEXT:    [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
+; APPLE-NEXT:    store float [[L_3]], ptr [[GEP_DST_3]], align 4
+; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
+; APPLE-NEXT:    [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
+; APPLE-NEXT:    [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
+; APPLE-NEXT:    [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
+; APPLE-NEXT:    store float [[L_4]], ptr [[GEP_DST_4]], align 4
+; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
+; APPLE-NEXT:    [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
+; APPLE-NEXT:    [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
+; APPLE-NEXT:    [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
+; APPLE-NEXT:    store float [[L_5]], ptr [[GEP_DST_5]], align 4
+; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
+; APPLE-NEXT:    [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
+; APPLE-NEXT:    [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
+; APPLE-NEXT:    [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
+; APPLE-NEXT:    store float [[L_6]], ptr [[GEP_DST_6]], align 4
+; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
+; APPLE-NEXT:    [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
+; APPLE-NEXT:    [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
+; APPLE-NEXT:    [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
+; APPLE-NEXT:    store float [[L_7]], ptr [[GEP_DST_7]], align 4
+; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
+; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
+; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
+; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
+; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
+; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
+; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
+; APPLE:       [[EXIT_UNR_LCSSA]]:
+; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
+; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
+; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
+; APPLE:       [[LOOP_EPIL_PREHEADER]]:
+; APPLE-NEXT:    br label %[[LOOP_EPIL:.*]]
+; APPLE:       [[LOOP_EPIL]]:
+; APPLE-NEXT:    [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
+; APPLE-NEXT:    [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
+; APPLE-NEXT:    [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
+; APPLE-NEXT:    [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
+; APPLE-NEXT:    [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
+; APPLE-NEXT:    store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
+; APPLE-NEXT:    [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
+; APPLE-NEXT:    [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
+; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
+; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
+; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; APPLE:       [[EXIT_EPILOG_LCSSA]]:
+; APPLE-NEXT:    br label %[[EXIT]]
 ; APPLE:       [[EXIT]]:
 ; APPLE-NEXT:    ret void
 ;
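As a brief worked example (not part of the test), the iteration bookkeeping in the checked IR above, for the unroll factor of 8, can be traced with the FileCheck variable names:

#include <cassert>

// Sketch of the runtime-unroll bookkeeping in the IR above; variable names
// mirror the FileCheck variables, and N = 21 is an arbitrary example value.
int main() {
  long long N = 21;
  long long XTRAITER = N & 7;           // 5: iterations left for the epilogue loop
  long long UNROLL_ITER = N - XTRAITER; // 16: two trips through the x8 unrolled body
  bool SkipUnrolledBody = (N - 1) < 7;  // `icmp ult` in the IR; true only for N < 8
  assert(XTRAITER == 5 && UNROLL_ITER == 16 && !SkipUnrolledBody);
  return 0;
}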

0 commit comments