Skip to content

Commit 0bb7bd4

Browse files
authored
[AArch64] Runtime-unroll small load/store loops for Apple Silicon CPUs. (#118317)
Add initial heuristics to selectively enable runtime unrolling for loops where doing so is expected to be highly beneficial on Apple Silicon CPUs. To start with, we try to runtime-unroll small, single-block loops, if they have load/store dependencies, to expose more parallel memory access streams [1] and to improve instruction delivery [2]. We also explicitly avoid runtime unrolling for loop structures that may limit the expected gains from runtime unrolling. Such loops include loops with complex control flow (loops that aren't innermost, have multiple exits, or have a large number of blocks), loops whose trip count expansion is expensive, and loops that are expected to execute a small number of iterations. Note that the heuristics here may be overly conservative and we err on the side of avoiding runtime unrolling rather than unrolling excessively. They are all subject to further refinement. Across a large set of workloads, this increases the total number of unrolled loops by 2.9%. [1] 4.6.10 in Apple Silicon CPU Optimization Guide [2] 4.4.4 in Apple Silicon CPU Optimization Guide Depends on #118316 for TTI changes. PR: #118317
1 parent f22ecdd commit 0bb7bd4

File tree

2 files changed

+179
-7
lines changed

2 files changed

+179
-7
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 101 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3989,6 +3989,92 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
39893989
}
39903990
}
39913991

3992+
/// For Apple CPUs, we want to runtime-unroll loops to make better use of the
3993+
/// OOO engine's wide instruction window and various predictors.
3994+
static void
3995+
getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
3996+
TargetTransformInfo::UnrollingPreferences &UP,
3997+
AArch64TTIImpl &TTI) {
3998+
// Limit to loops with structure that is highly likely to benefit from runtime
3999+
// unrolling; that is we exclude outer loops, loops with multiple exits and
4000+
// many blocks (i.e. likely with complex control flow). Note that the
4001+
// heuristics here may be overly conservative and we err on the side of
4002+
// avoiding runtime unrolling rather than unroll excessively. They are all
4003+
// subject to further refinement.
4004+
if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
4005+
return;
4006+
4007+
const SCEV *BTC = SE.getBackedgeTakenCount(L);
4008+
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
4009+
(SE.getSmallConstantMaxTripCount(L) > 0 &&
4010+
SE.getSmallConstantMaxTripCount(L) <= 32))
4011+
return;
4012+
if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
4013+
return;
4014+
4015+
int64_t Size = 0;
4016+
for (auto *BB : L->getBlocks()) {
4017+
for (auto &I : *BB) {
4018+
if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
4019+
return;
4020+
SmallVector<const Value *, 4> Operands(I.operand_values());
4021+
Size +=
4022+
*TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
4023+
}
4024+
}
4025+
4026+
// Limit to loops with trip counts that are cheap to expand.
4027+
UP.SCEVExpansionBudget = 1;
4028+
4029+
// Try to unroll small, single block loops, if they have load/store
4030+
// dependencies, to expose more parallel memory access streams.
4031+
if (L->getHeader() != L->getLoopLatch() || Size > 8)
4032+
return;
4033+
4034+
SmallPtrSet<Value *, 8> LoadedValues;
4035+
SmallVector<StoreInst *> Stores;
4036+
for (auto *BB : L->blocks()) {
4037+
for (auto &I : *BB) {
4038+
Value *Ptr = getLoadStorePointerOperand(&I);
4039+
if (!Ptr)
4040+
continue;
4041+
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4042+
if (SE.isLoopInvariant(PtrSCEV, L))
4043+
continue;
4044+
if (isa<LoadInst>(&I))
4045+
LoadedValues.insert(&I);
4046+
else
4047+
Stores.push_back(cast<StoreInst>(&I));
4048+
}
4049+
}
4050+
4051+
// Try to find an unroll count that maximizes the use of the instruction
4052+
// window, i.e. trying to fetch as many instructions per cycle as possible.
4053+
unsigned MaxInstsPerLine = 16;
4054+
unsigned UC = 1;
4055+
unsigned BestUC = 1;
4056+
unsigned SizeWithBestUC = BestUC * Size;
4057+
while (UC <= 8) {
4058+
unsigned SizeWithUC = UC * Size;
4059+
if (SizeWithUC > 48)
4060+
break;
4061+
if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4062+
(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4063+
BestUC = UC;
4064+
SizeWithBestUC = BestUC * Size;
4065+
}
4066+
UC++;
4067+
}
4068+
4069+
if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4070+
return LoadedValues.contains(SI->getOperand(0));
4071+
}))
4072+
return;
4073+
4074+
UP.Runtime = true;
4075+
UP.DefaultUnrollRuntimeCount = BestUC;
4076+
}
4077+
39924078
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
39934079
TTI::UnrollingPreferences &UP,
39944080
OptimizationRemarkEmitter *ORE) {
@@ -4006,9 +4092,21 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
40064092
// Disable partial & runtime unrolling on -Os.
40074093
UP.PartialOptSizeThreshold = 0;
40084094

4009-
if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
4010-
EnableFalkorHWPFUnrollFix)
4011-
getFalkorUnrollingPreferences(L, SE, UP);
4095+
// Apply subtarget-specific unrolling preferences.
4096+
switch (ST->getProcFamily()) {
4097+
case AArch64Subtarget::AppleA14:
4098+
case AArch64Subtarget::AppleA15:
4099+
case AArch64Subtarget::AppleA16:
4100+
case AArch64Subtarget::AppleM4:
4101+
getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
4102+
break;
4103+
case AArch64Subtarget::Falkor:
4104+
if (EnableFalkorHWPFUnrollFix)
4105+
getFalkorUnrollingPreferences(L, SE, UP);
4106+
break;
4107+
default:
4108+
break;
4109+
}
40124110

40134111
// Scan the loop: don't unroll loops with calls as this could prevent
40144112
// inlining. Don't unroll vector loops either, as they don't benefit much from

llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,91 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
1212
; APPLE-LABEL: define void @small_load_store_loop(
1313
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
1414
; APPLE-NEXT: [[ENTRY:.*]]:
15+
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
16+
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
17+
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
18+
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
19+
; APPLE: [[ENTRY_NEW]]:
20+
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
1521
; APPLE-NEXT: br label %[[LOOP:.*]]
1622
; APPLE: [[LOOP]]:
17-
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
23+
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
24+
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
1825
; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
1926
; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
2027
; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
2128
; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
2229
; APPLE-NEXT: store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
23-
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
24-
; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
25-
; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
30+
; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
31+
; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
32+
; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
33+
; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
34+
; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
35+
; APPLE-NEXT: store float [[L_1]], ptr [[GEP_DST_1]], align 4
36+
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
37+
; APPLE-NEXT: [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
38+
; APPLE-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
39+
; APPLE-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
40+
; APPLE-NEXT: [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
41+
; APPLE-NEXT: store float [[L_2]], ptr [[GEP_DST_2]], align 4
42+
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
43+
; APPLE-NEXT: [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
44+
; APPLE-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
45+
; APPLE-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
46+
; APPLE-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
47+
; APPLE-NEXT: store float [[L_3]], ptr [[GEP_DST_3]], align 4
48+
; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
49+
; APPLE-NEXT: [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
50+
; APPLE-NEXT: [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
51+
; APPLE-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
52+
; APPLE-NEXT: [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
53+
; APPLE-NEXT: store float [[L_4]], ptr [[GEP_DST_4]], align 4
54+
; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
55+
; APPLE-NEXT: [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
56+
; APPLE-NEXT: [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
57+
; APPLE-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
58+
; APPLE-NEXT: [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
59+
; APPLE-NEXT: store float [[L_5]], ptr [[GEP_DST_5]], align 4
60+
; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
61+
; APPLE-NEXT: [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
62+
; APPLE-NEXT: [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
63+
; APPLE-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
64+
; APPLE-NEXT: [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
65+
; APPLE-NEXT: store float [[L_6]], ptr [[GEP_DST_6]], align 4
66+
; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
67+
; APPLE-NEXT: [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
68+
; APPLE-NEXT: [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
69+
; APPLE-NEXT: [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
70+
; APPLE-NEXT: [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
71+
; APPLE-NEXT: store float [[L_7]], ptr [[GEP_DST_7]], align 4
72+
; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
73+
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
74+
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
75+
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
76+
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
77+
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
78+
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
79+
; APPLE: [[EXIT_UNR_LCSSA]]:
80+
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
81+
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
82+
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
83+
; APPLE: [[LOOP_EPIL_PREHEADER]]:
84+
; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
85+
; APPLE: [[LOOP_EPIL]]:
86+
; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
87+
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
88+
; APPLE-NEXT: [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
89+
; APPLE-NEXT: [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
90+
; APPLE-NEXT: [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
91+
; APPLE-NEXT: [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
92+
; APPLE-NEXT: store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
93+
; APPLE-NEXT: [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
94+
; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
95+
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
96+
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
97+
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
98+
; APPLE: [[EXIT_EPILOG_LCSSA]]:
99+
; APPLE-NEXT: br label %[[EXIT]]
26100
; APPLE: [[EXIT]]:
27101
; APPLE-NEXT: ret void
28102
;

0 commit comments

Comments
 (0)