Commit d24df56

Florian Hahn (fhahn) authored and committed
Revert "[AArch64] Runtime-unroll small load/store loops for Apple Silicon CPUs. (llvm#118317)"
This reverts commit 0bb7bd4.
1 parent edd11b8 commit d24df56

2 files changed: +7, -214 lines changed


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 3 additions & 136 deletions
@@ -4100,127 +4100,6 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
-/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
-/// OOO engine's wide instruction window and various predictors.
-static void
-getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
-                                 TargetTransformInfo::UnrollingPreferences &UP,
-                                 AArch64TTIImpl &TTI) {
-  // Limit loops with structure that is highly likely to benefit from runtime
-  // unrolling; that is we exclude outer loops, loops with multiple exits and
-  // many blocks (i.e. likely with complex control flow). Note that the
-  // heuristics here may be overly conservative and we err on the side of
-  // avoiding runtime unrolling rather than unroll excessively. They are all
-  // subject to further refinement.
-  if (!L->isInnermost() || !L->getExitBlock() || L->getNumBlocks() > 8)
-    return;
-
-  const SCEV *BTC = SE.getBackedgeTakenCount(L);
-  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
-      (SE.getSmallConstantMaxTripCount(L) > 0 &&
-       SE.getSmallConstantMaxTripCount(L) <= 32))
-    return;
-  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
-    return;
-
-  int64_t Size = 0;
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
-        return;
-      SmallVector<const Value *, 4> Operands(I.operand_values());
-      Size +=
-          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
-    }
-  }
-
-  // Limit to loops with trip counts that are cheap to expand.
-  UP.SCEVExpansionBudget = 1;
-
-  // Try to unroll small, single block loops, if they have load/store
-  // dependencies, to expose more parallel memory access streams.
-  BasicBlock *Header = L->getHeader();
-  if (Header == L->getLoopLatch()) {
-    if (Size > 8)
-      return;
-
-    SmallPtrSet<Value *, 8> LoadedValues;
-    SmallVector<StoreInst *> Stores;
-    for (auto *BB : L->blocks()) {
-      for (auto &I : *BB) {
-        Value *Ptr = getLoadStorePointerOperand(&I);
-        if (!Ptr)
-          continue;
-        const SCEV *PtrSCEV = SE.getSCEV(Ptr);
-        if (SE.isLoopInvariant(PtrSCEV, L))
-          continue;
-        if (isa<LoadInst>(&I))
-          LoadedValues.insert(&I);
-        else
-          Stores.push_back(cast<StoreInst>(&I));
-      }
-    }
-
-    // Try to find an unroll count that maximizes the use of the instruction
-    // window, i.e. trying to fetch as many instructions per cycle as possible.
-    unsigned MaxInstsPerLine = 16;
-    unsigned UC = 1;
-    unsigned BestUC = 1;
-    unsigned SizeWithBestUC = BestUC * Size;
-    while (UC <= 8) {
-      unsigned SizeWithUC = UC * Size;
-      if (SizeWithUC > 48)
-        break;
-      if ((SizeWithUC % MaxInstsPerLine) == 0 ||
-          (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
-        BestUC = UC;
-        SizeWithBestUC = BestUC * Size;
-      }
-      UC++;
-    }
-
-    if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
-          return LoadedValues.contains(SI->getOperand(0));
-        }))
-      return;
-
-    UP.Runtime = true;
-    UP.DefaultUnrollRuntimeCount = BestUC;
-    return;
-  }
-
-  // Try to runtime-unroll loops with early-continues depending on loop-varying
-  // loads; this helps with branch-prediction for the early-continues.
-  auto *Term = dyn_cast<BranchInst>(Header->getTerminator());
-  auto *Latch = L->getLoopLatch();
-  SmallVector<BasicBlock *> Preds(predecessors(Latch));
-  if (!Term || !Term->isConditional() || Preds.size() == 1 ||
-      none_of(Preds, [Header](BasicBlock *Pred) { return Header == Pred; }) ||
-      none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
-    return;
-
-  std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
-      [&](Instruction *I, unsigned Depth) -> bool {
-    if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
-      return false;
-
-    if (isa<LoadInst>(I))
-      return true;
-
-    return any_of(I->operands(), [&](Value *V) {
-      auto *I = dyn_cast<Instruction>(V);
-      return I && DependsOnLoopLoad(I, Depth + 1);
-    });
-  };
-  CmpPredicate Pred;
-  Instruction *I;
-  if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
-                       m_Value())) &&
-      DependsOnLoopLoad(I, 0)) {
-    UP.Runtime = true;
-  }
-}
-
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                              TTI::UnrollingPreferences &UP,
                                              OptimizationRemarkEmitter *ORE) {
@@ -4238,21 +4117,9 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
-  // Apply subtarget-specific unrolling preferences.
-  switch (ST->getProcFamily()) {
-  case AArch64Subtarget::AppleA14:
-  case AArch64Subtarget::AppleA15:
-  case AArch64Subtarget::AppleA16:
-  case AArch64Subtarget::AppleM4:
-    getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
-    break;
-  case AArch64Subtarget::Falkor:
-    if (EnableFalkorHWPFUnrollFix)
-      getFalkorUnrollingPreferences(L, SE, UP);
-    break;
-  default:
-    break;
-  }
+  if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+      EnableFalkorHWPFUnrollFix)
+    getFalkorUnrollingPreferences(L, SE, UP);
 
   // Scan the loop: don't unroll loops with calls as this could prevent
   // inlining. Don't unroll vector loops either, as they don't benefit much from
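For reference, below is a standalone sketch of the unroll-count search used by the reverted getAppleRuntimeUnrollPreferences heuristic: under its thresholds (loop body size at most 8 instructions, unroll count at most 8, unrolled size at most 48, 16-instruction fetch line) it picks the count whose unrolled body comes closest to filling a fetch line. The function name pickUnrollCount and the example value are illustrative only; they are not part of the patch or of any LLVM API.

    // Illustrative sketch of the reverted unroll-count search; pickUnrollCount
    // is a hypothetical name used only for this example.
    #include <cstdio>

    static unsigned pickUnrollCount(unsigned Size) {
      const unsigned MaxInstsPerLine = 16; // modelled fetch-line width
      unsigned BestUC = 1;
      unsigned SizeWithBestUC = BestUC * Size;
      for (unsigned UC = 1; UC <= 8; ++UC) {
        unsigned SizeWithUC = UC * Size;
        if (SizeWithUC > 48) // cap the total unrolled body size
          break;
        // Prefer counts that fill a fetch line exactly, otherwise counts that
        // leave the fewest unused slots in the last line.
        if ((SizeWithUC % MaxInstsPerLine) == 0 ||
            (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
          BestUC = UC;
          SizeWithBestUC = BestUC * Size;
        }
      }
      return BestUC;
    }

    int main() {
      // For a 5-instruction loop body the search settles on UC = 3, i.e. 15 of
      // the 16 fetch slots used; a result of 1 means "do not runtime-unroll".
      printf("Size=5 -> UC=%u\n", pickUnrollCount(5));
      return 0;
    }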

llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll

Lines changed: 4 additions & 78 deletions
@@ -12,91 +12,17 @@ define void @small_load_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale) {
 ; APPLE-LABEL: define void @small_load_store_loop(
 ; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; APPLE-NEXT: [[ENTRY:.*]]:
-; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
-; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7
-; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
-; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
-; APPLE: [[ENTRY_NEW]]:
-; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
 ; APPLE-NEXT: br label %[[LOOP:.*]]
 ; APPLE: [[LOOP]]:
-; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ]
-; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ]
+; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
 ; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_EPIL]], [[SCALE]]
 ; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
 ; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
 ; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL]]
 ; APPLE-NEXT: store float [[L_EPIL]], ptr [[GEP_DST_EPIL]], align 4
-; APPLE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
-; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT_EPIL]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
-; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
-; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
-; APPLE-NEXT: store float [[L_1]], ptr [[GEP_DST_1]], align 4
-; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
-; APPLE-NEXT: [[SCALED_IV_2:%.*]] = mul nuw nsw i64 [[IV_NEXT_1]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_2]]
-; APPLE-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 4
-; APPLE-NEXT: [[GEP_DST_2:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_1]]
-; APPLE-NEXT: store float [[L_2]], ptr [[GEP_DST_2]], align 4
-; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
-; APPLE-NEXT: [[SCALED_IV_3:%.*]] = mul nuw nsw i64 [[IV_NEXT_2]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_3:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_3]]
-; APPLE-NEXT: [[L_3:%.*]] = load float, ptr [[GEP_SRC_3]], align 4
-; APPLE-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_2]]
-; APPLE-NEXT: store float [[L_3]], ptr [[GEP_DST_3]], align 4
-; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV_EPIL]], 4
-; APPLE-NEXT: [[SCALED_IV_4:%.*]] = mul nuw nsw i64 [[IV_NEXT_3]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_4:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_4]]
-; APPLE-NEXT: [[L_4:%.*]] = load float, ptr [[GEP_SRC_4]], align 4
-; APPLE-NEXT: [[GEP_DST_4:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_3]]
-; APPLE-NEXT: store float [[L_4]], ptr [[GEP_DST_4]], align 4
-; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV_EPIL]], 5
-; APPLE-NEXT: [[SCALED_IV_5:%.*]] = mul nuw nsw i64 [[IV_NEXT_4]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_5:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_5]]
-; APPLE-NEXT: [[L_5:%.*]] = load float, ptr [[GEP_SRC_5]], align 4
-; APPLE-NEXT: [[GEP_DST_5:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_4]]
-; APPLE-NEXT: store float [[L_5]], ptr [[GEP_DST_5]], align 4
-; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV_EPIL]], 6
-; APPLE-NEXT: [[SCALED_IV_6:%.*]] = mul nuw nsw i64 [[IV_NEXT_5]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_6:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_6]]
-; APPLE-NEXT: [[L_6:%.*]] = load float, ptr [[GEP_SRC_6]], align 4
-; APPLE-NEXT: [[GEP_DST_6:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_5]]
-; APPLE-NEXT: store float [[L_6]], ptr [[GEP_DST_6]], align 4
-; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV_EPIL]], 7
-; APPLE-NEXT: [[SCALED_IV_7:%.*]] = mul nuw nsw i64 [[IV_NEXT_6]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_7:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_7]]
-; APPLE-NEXT: [[L_7:%.*]] = load float, ptr [[GEP_SRC_7]], align 4
-; APPLE-NEXT: [[GEP_DST_7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT_6]]
-; APPLE-NEXT: store float [[L_7]], ptr [[GEP_DST_7]], align 4
-; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV_EPIL]], 8
-; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
-; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
-; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
-; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
-; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ]
-; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
-; APPLE: [[EXIT_UNR_LCSSA]]:
-; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
-; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
-; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
-; APPLE: [[LOOP_EPIL_PREHEADER]]:
-; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
-; APPLE: [[LOOP_EPIL]]:
-; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL1:%.*]], %[[LOOP_EPIL]] ]
-; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
-; APPLE-NEXT: [[SCALED_IV_EPIL1:%.*]] = mul nuw nsw i64 [[IV_EPIL1]], [[SCALE]]
-; APPLE-NEXT: [[GEP_SRC_EPIL1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL1]]
-; APPLE-NEXT: [[L_EPIL1:%.*]] = load float, ptr [[GEP_SRC_EPIL1]], align 4
-; APPLE-NEXT: [[GEP_DST_EPIL1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_EPIL1]]
-; APPLE-NEXT: store float [[L_EPIL1]], ptr [[GEP_DST_EPIL1]], align 4
-; APPLE-NEXT: [[IV_NEXT_EPIL1]] = add nuw nsw i64 [[IV_EPIL1]], 1
-; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL1]], [[N]]
-; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
-; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
-; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
-; APPLE: [[EXIT_EPILOG_LCSSA]]:
-; APPLE-NEXT: br label %[[EXIT]]
+; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
+; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
+; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
 ; APPLE: [[EXIT]]:
 ; APPLE-NEXT: ret void
 ;
