Skip to content

Commit da859c7

Browse files
[InlineCost] Simplify extractvalue across callsite
Motivation: When using libc++, `std::bitset<64>::count()` doesn't optimize to a single popcount instruction on AArch64, because we fail to inline the library code completely. Inlining fails, because the internal bit_iterator struct is passed as a [2 x i64] %arg value on AArch64. The value is built using insertvalue instructions and only one of the array entries is constant. If we know that this entry is constant, we can prove that half the function becomes dead. However, InlineCost only considers operands for simplification if they are Constants, which %arg is not. Without this simplification the function is too expensive to inline. Therefore, we need to teach InlineCost to simplify values which aren't Constants and which may originate from the caller function. As a first step, we enable this for extractvalue, so it can be simplified against insertvalues from the caller function. This is enough to fix the problem for bitset. There are similar opportunities we can explore for BinOps in the future (e.g. cmp eq %arg1, %arg2 when the caller passes the same value into both arguments), but we need to be careful here, because InstSimplify isn't completely safe to use with operands owned by different functions.
1 parent 1d6f102 commit da859c7

File tree

2 files changed

+107
-28
lines changed

2 files changed

+107
-28
lines changed

llvm/lib/Analysis/InlineCost.cpp

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
391391
/// likely simplifications post-inlining. The most important aspect we track
392392
/// is CFG altering simplifications -- when we prove a basic block dead, that
393393
/// can cause dramatic shifts in the cost of inlining a function.
394-
DenseMap<Value *, Constant *> SimplifiedValues;
394+
/// Note: The simplified Value may be owned by the caller function.
395+
DenseMap<Value *, Value *> SimplifiedValues;
395396

396397
/// Keep track of the values which map back (through function arguments) to
397398
/// allocas on the caller stack which could be simplified through SROA.
@@ -432,7 +433,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
432433
template <typename T> T *getDirectOrSimplifiedValue(Value *V) const {
433434
if (auto *Direct = dyn_cast<T>(V))
434435
return Direct;
435-
return dyn_cast_if_present<T>(SimplifiedValues.lookup(V));
436+
return getSimplifiedValue<T>(V);
436437
}
437438

438439
// Custom simplification helper routines.
@@ -525,11 +526,33 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
525526

526527
InlineResult analyze();
527528

528-
std::optional<Constant *> getSimplifiedValue(Instruction *I) {
529-
auto It = SimplifiedValues.find(I);
530-
if (It != SimplifiedValues.end())
531-
return It->second;
532-
return std::nullopt;
529+
// Lookup simplified Value. May return a value owned by the caller.
530+
Value *getSimplifiedValueUnchecked(Value *V) const {
531+
return SimplifiedValues.lookup(V);
532+
}
533+
534+
// Lookup simplified Value, but return nullptr if the simplified value is
535+
// owned by the caller.
536+
template <typename T> T *getSimplifiedValue(Value *V) const {
537+
Value *SimpleV = SimplifiedValues.lookup(V);
538+
if (!SimpleV)
539+
return nullptr;
540+
541+
// Skip checks if we know T is a global. This has a small, but measurable
542+
// impact on compile-time.
543+
if constexpr (std::is_base_of_v<Constant, T>)
544+
return dyn_cast<T>(SimpleV);
545+
546+
// Make sure the simplified Value is owned by this function
547+
if (auto *I = dyn_cast<Instruction>(SimpleV)) {
548+
if (I->getFunction() != &F)
549+
return nullptr;
550+
} else if (auto *Arg = dyn_cast<Argument>(SimpleV)) {
551+
if (Arg->getParent() != &F)
552+
return nullptr;
553+
} else if (!isa<Constant>(SimpleV))
554+
return nullptr;
555+
return dyn_cast<T>(SimpleV);
533556
}
534557

535558
// Keep a bunch of stats about the cost savings found so we can print them
@@ -921,12 +944,11 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
921944
if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
922945
// Count a conditional branch as savings if it becomes unconditional.
923946
if (BI->isConditional() &&
924-
isa_and_nonnull<ConstantInt>(
925-
SimplifiedValues.lookup(BI->getCondition()))) {
947+
getSimplifiedValue<ConstantInt>(BI->getCondition())) {
926948
CurrentSavings += InstrCost;
927949
}
928950
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
929-
if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
951+
if (getSimplifiedValue<ConstantInt>(SI->getCondition()))
930952
CurrentSavings += InstrCost;
931953
} else if (Value *V = dyn_cast<Value>(&I)) {
932954
// Count an instruction as savings if we can fold it.
@@ -1423,10 +1445,17 @@ void InlineCostAnnotationWriter::emitInstructionAnnot(
14231445
if (Record->hasThresholdChanged())
14241446
OS << ", threshold delta = " << Record->getThresholdDelta();
14251447
}
1426-
auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1427-
if (C) {
1448+
auto *V = ICCA->getSimplifiedValueUnchecked(const_cast<Instruction *>(I));
1449+
if (V) {
14281450
OS << ", simplified to ";
1429-
(*C)->print(OS, true);
1451+
V->print(OS, true);
1452+
if (auto *VI = dyn_cast<Instruction>(V)) {
1453+
if (VI->getFunction() != I->getFunction())
1454+
OS << " (caller instruction)";
1455+
} else if (auto *VArg = dyn_cast<Argument>(V)) {
1456+
if (VArg->getParent() != I->getFunction())
1457+
OS << " (caller argument)";
1458+
}
14301459
}
14311460
OS << "\n";
14321461
}
@@ -1483,7 +1512,7 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
14831512
SmallVector<Value *, 4> Operands;
14841513
Operands.push_back(GEP.getOperand(0));
14851514
for (const Use &Op : GEP.indices())
1486-
if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1515+
if (Constant *SimpleOp = getSimplifiedValue<Constant>(Op))
14871516
Operands.push_back(SimpleOp);
14881517
else
14891518
Operands.push_back(Op);
@@ -1498,7 +1527,7 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
14981527
// Check whether inlining will turn a dynamic alloca into a static
14991528
// alloca and handle that case.
15001529
if (I.isArrayAllocation()) {
1501-
Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1530+
Constant *Size = getSimplifiedValue<Constant>(I.getArraySize());
15021531
if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
15031532
// Sometimes a dynamic alloca could be converted into a static alloca
15041533
// after this constant prop, and become a huge static alloca on an
@@ -2287,9 +2316,18 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
22872316
}
22882317

22892318
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2290-
// Constant folding for extract value is trivial.
2291-
if (simplifyInstruction(I))
2292-
return true;
2319+
Value *Op = I.getAggregateOperand();
2320+
2321+
// Special handling, because we want to simplify extractvalue with a
2322+
// potential insertvalue from the caller.
2323+
if (Value *SimpleOp = getSimplifiedValueUnchecked(Op)) {
2324+
SimplifyQuery SQ(DL);
2325+
Value *SimpleV = simplifyExtractValueInst(SimpleOp, I.getIndices(), SQ);
2326+
if (SimpleV) {
2327+
SimplifiedValues[&I] = SimpleV;
2328+
return true;
2329+
}
2330+
}
22932331

22942332
// SROA can't look through these, but they may be free.
22952333
return Base::visitExtractValue(I);
@@ -2388,7 +2426,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
23882426
// Check if this happens to be an indirect function call to a known function
23892427
// in this inline context. If not, we've done all we can.
23902428
Value *Callee = Call.getCalledOperand();
2391-
F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2429+
F = getSimplifiedValue<Function>(Callee);
23922430
if (!F || F->getFunctionType() != Call.getFunctionType()) {
23932431
onCallArgumentSetup(Call);
23942432

@@ -2483,8 +2521,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
24832521

24842522
Constant *TrueC = getDirectOrSimplifiedValue<Constant>(TrueVal);
24852523
Constant *FalseC = getDirectOrSimplifiedValue<Constant>(FalseVal);
2486-
Constant *CondC =
2487-
dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2524+
Constant *CondC = getSimplifiedValue<Constant>(SI.getCondition());
24882525

24892526
if (!CondC) {
24902527
// Select C, X, X => X
@@ -2833,8 +2870,9 @@ InlineResult CallAnalyzer::analyze() {
28332870
auto CAI = CandidateCall.arg_begin();
28342871
for (Argument &FAI : F.args()) {
28352872
assert(CAI != CandidateCall.arg_end());
2836-
if (Constant *C = dyn_cast<Constant>(CAI))
2837-
SimplifiedValues[&FAI] = C;
2873+
SimplifiedValues[&FAI] = *CAI;
2874+
if (isa<Constant>(*CAI))
2875+
++NumConstantArgs;
28382876

28392877
Value *PtrArg = *CAI;
28402878
if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
@@ -2849,7 +2887,6 @@ InlineResult CallAnalyzer::analyze() {
28492887
}
28502888
++CAI;
28512889
}
2852-
NumConstantArgs = SimplifiedValues.size();
28532890
NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
28542891
NumAllocaArgs = SROAArgValues.size();
28552892

@@ -2911,8 +2948,7 @@ InlineResult CallAnalyzer::analyze() {
29112948
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
29122949
if (BI->isConditional()) {
29132950
Value *Cond = BI->getCondition();
2914-
if (ConstantInt *SimpleCond =
2915-
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2951+
if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
29162952
BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
29172953
BBWorklist.insert(NextBB);
29182954
KnownSuccessors[BB] = NextBB;
@@ -2922,8 +2958,7 @@ InlineResult CallAnalyzer::analyze() {
29222958
}
29232959
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
29242960
Value *Cond = SI->getCondition();
2925-
if (ConstantInt *SimpleCond =
2926-
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2961+
if (ConstantInt *SimpleCond = getSimplifiedValue<ConstantInt>(Cond)) {
29272962
BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
29282963
BBWorklist.insert(NextBB);
29292964
KnownSuccessors[BB] = NextBB;
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -S -passes=inline -inline-threshold=0 | FileCheck %s
3+
; RUN: opt < %s -S -passes='cgscc(inline)' -inline-threshold=0 | FileCheck %s
4+
5+
declare void @foo()
6+
7+
define i32 @callee([2 x i32] %agg) {
8+
; CHECK-LABEL: define i32 @callee(
9+
; CHECK-SAME: [2 x i32] [[AGG:%.*]]) {
10+
; CHECK-NEXT: [[V:%.*]] = extractvalue [2 x i32] [[AGG]], 0
11+
; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[V]], 0
12+
; CHECK-NEXT: br i1 [[C]], label %[[IS_NULL:.*]], label %[[NON_NULL:.*]]
13+
; CHECK: [[IS_NULL]]:
14+
; CHECK-NEXT: ret i32 0
15+
; CHECK: [[NON_NULL]]:
16+
; CHECK-NEXT: call void @foo()
17+
; CHECK-NEXT: call void @foo()
18+
; CHECK-NEXT: ret i32 1
19+
;
20+
%v = extractvalue [2 x i32] %agg, 0
21+
%c = icmp eq i32 %v, 0
22+
br i1 %c, label %is_null, label %non_null
23+
24+
is_null:
25+
ret i32 0
26+
27+
non_null:
28+
call void @foo()
29+
call void @foo()
30+
ret i32 1
31+
}
32+
33+
define i32 @caller(i32 %arg) {
34+
; CHECK-LABEL: define i32 @caller(
35+
; CHECK-SAME: i32 [[ARG:%.*]]) {
36+
; CHECK-NEXT: [[AGG0:%.*]] = insertvalue [2 x i32] poison, i32 0, 0
37+
; CHECK-NEXT: [[AGG1:%.*]] = insertvalue [2 x i32] [[AGG0]], i32 [[ARG]], 1
38+
; CHECK-NEXT: ret i32 0
39+
;
40+
%agg0 = insertvalue [2 x i32] poison, i32 0, 0
41+
%agg1 = insertvalue [2 x i32] %agg0, i32 %arg, 1
42+
%v = call i32 @callee([2 x i32] %agg1)
43+
ret i32 %v
44+
}

0 commit comments

Comments
 (0)