Skip to content

Commit d175616

Browse files
authored
[lld-macho][arm64] Enhance safe ICF with thunk-based deduplication (#106573)
Currently, our `safe` ICF mode only merges non-address-significant code, leaving duplicate address-significant functions in the output. This patch introduces `safe_thunks` ICF mode, which keeps a single master copy of each function and replaces address-significant duplicates with thunks that branch to the master copy. Currently `--icf=safe_thunks` is only supported for `arm64` architectures.

**Perf stats for a large binary:**

| ICF Option | Total Size | __text Size | __unwind_info | % total |
|-------------------|------------|-------------|---------------------|---------------------------|
| `--icf=none` | 91.738 MB | 55.220 MB | 1.424 MB | 0% |
| `--icf=safe` | 85.042 MB | 49.572 MB | 1.168 MB | 7.30% |
| `--icf=safe_thunks` | 84.650 MB | 49.219 MB | 1.143 MB | 7.72% |
| `--icf=all` | 82.060 MB | 48.726 MB | 1.111 MB | 10.55% |

So overall we can expect a `~0.45%` binary size reduction for a typical large binary compared to the `--icf=safe` option.

**Runtime:** Linking the above binary took ~10 seconds. Comparing the link performance of `--icf=safe_thunks` vs `--icf=safe`, a ~2% slowdown was observed.
1 parent 1be9a80 commit d175616

File tree

12 files changed

+405
-12
lines changed

12 files changed

+405
-12
lines changed

lld/MachO/Arch/ARM64.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@ struct ARM64 : ARM64Common {
4141
Symbol *objcMsgSend) const override;
4242
void populateThunk(InputSection *thunk, Symbol *funcSym) override;
4343
void applyOptimizationHints(uint8_t *, const ObjFile &) const override;
44+
45+
void initICFSafeThunkBody(InputSection *thunk,
46+
InputSection *branchTarget) const override;
47+
uint32_t getICFSafeThunkSize() const override;
4448
};
4549

4650
} // namespace
@@ -175,6 +179,25 @@ void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) {
175179
/*offset=*/0, /*addend=*/0,
176180
/*referent=*/funcSym);
177181
}
182+
// Just a single direct branch to the target function.
183+
static constexpr uint32_t icfSafeThunkCode[] = {
184+
0x14000000, // 08: b target
185+
};
186+
187+
void ARM64::initICFSafeThunkBody(InputSection *thunk,
188+
InputSection *branchTarget) const {
189+
// The base data here will not be itself modified, we'll just be adding a
190+
// reloc below. So we can directly use the constexpr above as the data.
191+
thunk->data = {reinterpret_cast<const uint8_t *>(icfSafeThunkCode),
192+
sizeof(icfSafeThunkCode)};
193+
194+
thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_BRANCH26,
195+
/*pcrel=*/true, /*length=*/2,
196+
/*offset=*/0, /*addend=*/0,
197+
/*referent=*/branchTarget);
198+
}
199+
200+
uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); }
178201

179202
ARM64::ARM64() : ARM64Common(LP64()) {
180203
cpuType = CPU_TYPE_ARM64;

lld/MachO/Config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ enum class ICFLevel {
6868
unknown,
6969
none,
7070
safe,
71+
safe_thunks,
7172
all,
7273
};
7374

lld/MachO/Driver.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -847,8 +847,14 @@ static ICFLevel getICFLevel(const ArgList &args) {
847847
auto icfLevel = StringSwitch<ICFLevel>(icfLevelStr)
848848
.Cases("none", "", ICFLevel::none)
849849
.Case("safe", ICFLevel::safe)
850+
.Case("safe_thunks", ICFLevel::safe_thunks)
850851
.Case("all", ICFLevel::all)
851852
.Default(ICFLevel::unknown);
853+
854+
if ((icfLevel == ICFLevel::safe_thunks) && (config->arch() != AK_arm64)) {
855+
error("--icf=safe_thunks is only supported on arm64 targets");
856+
}
857+
852858
if (icfLevel == ICFLevel::unknown) {
853859
warn(Twine("unknown --icf=OPTION `") + icfLevelStr +
854860
"', defaulting to `none'");
@@ -2116,7 +2122,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
21162122
// foldIdenticalLiterals before foldIdenticalSections.
21172123
foldIdenticalLiterals();
21182124
if (config->icfLevel != ICFLevel::none) {
2119-
if (config->icfLevel == ICFLevel::safe)
2125+
if (config->icfLevel == ICFLevel::safe ||
2126+
config->icfLevel == ICFLevel::safe_thunks)
21202127
markAddrSigSymbols();
21212128
foldIdenticalSections(/*onlyCfStrings=*/false);
21222129
} else if (config->dedupStrings) {

lld/MachO/ICF.cpp

Lines changed: 89 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ class ICF {
4545
const ConcatInputSection *ib);
4646
bool equalsVariable(const ConcatInputSection *ia,
4747
const ConcatInputSection *ib);
48+
void applySafeThunksToRange(size_t begin, size_t end);
4849

4950
// ICF needs a copy of the inputs vector because its equivalence-class
5051
// segregation algorithm destroys the proper sequence.
@@ -251,6 +252,50 @@ void ICF::forEachClassRange(size_t begin, size_t end,
251252
}
252253
}
253254

255+
// Given a range of identical icfInputs, replace address significant functions
256+
// with a thunk that is just a direct branch to the first function in the
257+
// series. This way we keep only one main body of the function but we still
258+
// retain the address uniqueness of relevant functions by having them be a
259+
// direct branch thunk rather than containing a full copy of the actual function
260+
// body.
261+
void ICF::applySafeThunksToRange(size_t begin, size_t end) {
262+
// If the functions we're dealing with are smaller than the thunk size, then
263+
// just leave them all as-is - creating thunks would be a net loss.
264+
uint32_t thunkSize = target->getICFSafeThunkSize();
265+
if (icfInputs[begin]->data.size() <= thunkSize)
266+
return;
267+
268+
// When creating a unique ICF thunk, use the first section as the section that
269+
// all thunks will branch to.
270+
ConcatInputSection *masterIsec = icfInputs[begin];
271+
272+
for (size_t i = begin + 1; i < end; ++i) {
273+
ConcatInputSection *isec = icfInputs[i];
274+
// When we're done processing keepUnique entries, we can stop. Sorting
275+
// guarantees that all keepUnique will be at the front.
276+
if (!isec->keepUnique)
277+
break;
278+
279+
ConcatInputSection *thunk =
280+
makeSyntheticInputSection(isec->getSegName(), isec->getName());
281+
addInputSection(thunk);
282+
283+
target->initICFSafeThunkBody(thunk, masterIsec);
284+
thunk->foldIdentical(isec, Symbol::ICFFoldKind::Thunk);
285+
286+
// Since we're folding the target function into a thunk, we need to adjust
287+
// the symbols that now got relocated from the target function to the thunk.
288+
// Since the thunk is only one branch, we move all symbols to offset 0 and
289+
// make sure that the size of all non-zero-size symbols is equal to the size
290+
// of the branch.
291+
for (auto *sym : thunk->symbols) {
292+
sym->value = 0;
293+
if (sym->size != 0)
294+
sym->size = thunkSize;
295+
}
296+
}
297+
}
298+
254299
// Split icfInputs into shards, then parallelize invocation of FUNC on subranges
255300
// with matching equivalence class
256301
void ICF::forEachClass(llvm::function_ref<void(size_t, size_t)> func) {
@@ -312,6 +357,12 @@ void ICF::run() {
312357

313358
llvm::stable_sort(
314359
icfInputs, [](const ConcatInputSection *a, const ConcatInputSection *b) {
360+
// When using safe_thunks, ensure that we first sort by icfEqClass and
361+
// then by keepUnique (descending). This guarantees that within an
362+
// equivalence class, the keepUnique inputs are always first.
363+
if (config->icfLevel == ICFLevel::safe_thunks)
364+
if (a->icfEqClass[0] == b->icfEqClass[0])
365+
return a->keepUnique > b->keepUnique;
315366
return a->icfEqClass[0] < b->icfEqClass[0];
316367
});
317368
forEachClass([&](size_t begin, size_t end) {
@@ -331,13 +382,37 @@ void ICF::run() {
331382
log("equalsVariable() called " + Twine(equalsVariableCount) + " times");
332383
}
333384

385+
// When using safe_thunks, we need to create thunks for all keepUnique
386+
// functions that can be deduplicated. Since we're creating / adding new
387+
// InputSections, we can't parallelize this.
388+
if (config->icfLevel == ICFLevel::safe_thunks)
389+
forEachClassRange(0, icfInputs.size(), [&](size_t begin, size_t end) {
390+
applySafeThunksToRange(begin, end);
391+
});
392+
334393
// Fold sections within equivalence classes
335394
forEachClass([&](size_t begin, size_t end) {
336395
if (end - begin < 2)
337396
return;
397+
bool useSafeThunks = config->icfLevel == ICFLevel::safe_thunks;
398+
399+
// For ICF level safe_thunks, replace keepUnique function bodies with
400+
// thunks. For all other ICF levels, directly merge the functions.
401+
338402
ConcatInputSection *beginIsec = icfInputs[begin];
339-
for (size_t i = begin + 1; i < end; ++i)
403+
for (size_t i = begin + 1; i < end; ++i) {
404+
// Skip keepUnique inputs when using safe_thunks (already handled above)
405+
if (useSafeThunks && icfInputs[i]->keepUnique) {
406+
// Assert keepUnique sections are either small or replaced with thunks.
407+
assert(!icfInputs[i]->live ||
408+
icfInputs[i]->data.size() <= target->getICFSafeThunkSize());
409+
assert(!icfInputs[i]->replacement ||
410+
icfInputs[i]->replacement->data.size() ==
411+
target->getICFSafeThunkSize());
412+
continue;
413+
}
340414
beginIsec->foldIdentical(icfInputs[i]);
415+
}
341416
});
342417
}
343418

@@ -421,11 +496,22 @@ void macho::foldIdenticalSections(bool onlyCfStrings) {
421496
// can still fold it.
422497
bool hasFoldableFlags = (isSelRefsSection(isec) ||
423498
sectionType(isec->getFlags()) == MachO::S_REGULAR);
499+
500+
bool isCodeSec = isCodeSection(isec);
501+
502+
// When keepUnique is true, the section is not foldable. Unless we are at
503+
// icf level safe_thunks, in which case we still want to fold code sections.
504+
// When using safe_thunks we'll apply the safe_thunks logic at merge time
505+
// based on the 'keepUnique' flag.
506+
bool noUniqueRequirement =
507+
!isec->keepUnique ||
508+
((config->icfLevel == ICFLevel::safe_thunks) && isCodeSec);
509+
424510
// FIXME: consider non-code __text sections as foldable?
425511
bool isFoldable = (!onlyCfStrings || isCfStringSection(isec)) &&
426-
(isCodeSection(isec) || isFoldableWithAddendsRemoved ||
512+
(isCodeSec || isFoldableWithAddendsRemoved ||
427513
isGccExceptTabSection(isec)) &&
428-
!isec->keepUnique && !isec->hasAltEntry &&
514+
noUniqueRequirement && !isec->hasAltEntry &&
429515
!isec->shouldOmitFromOutput() && hasFoldableFlags;
430516
if (isFoldable) {
431517
foldable.push_back(isec);

lld/MachO/InputSection.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,13 +190,14 @@ const Reloc *InputSection::getRelocAt(uint32_t off) const {
190190
return &*it;
191191
}
192192

193-
void ConcatInputSection::foldIdentical(ConcatInputSection *copy) {
193+
void ConcatInputSection::foldIdentical(ConcatInputSection *copy,
194+
Symbol::ICFFoldKind foldKind) {
194195
align = std::max(align, copy->align);
195196
copy->live = false;
196197
copy->wasCoalesced = true;
197198
copy->replacement = this;
198199
for (auto &copySym : copy->symbols)
199-
copySym->wasIdenticalCodeFolded = true;
200+
copySym->identicalCodeFoldingKind = foldKind;
200201

201202
symbols.insert(symbols.end(), copy->symbols.begin(), copy->symbols.end());
202203
copy->symbols.clear();

lld/MachO/InputSection.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@ class ConcatInputSection final : public InputSection {
117117
bool shouldOmitFromOutput() const { return !live || isCoalescedWeak(); }
118118
void writeTo(uint8_t *buf);
119119

120-
void foldIdentical(ConcatInputSection *redundant);
120+
void foldIdentical(ConcatInputSection *redundant,
121+
Symbol::ICFFoldKind foldKind = Symbol::ICFFoldKind::Body);
121122
ConcatInputSection *canonical() override {
122123
return replacement ? replacement : this;
123124
}

lld/MachO/MapFile.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ static void printNonLazyPointerSection(raw_fd_ostream &os,
156156
}
157157

158158
static uint64_t getSymSizeForMap(Defined *sym) {
159-
if (sym->wasIdenticalCodeFolded)
159+
if (sym->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body)
160160
return 0;
161161
return sym->size;
162162
}

lld/MachO/Symbols.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ Defined::Defined(StringRef name, InputFile *file, InputSection *isec,
6060
bool interposable)
6161
: Symbol(DefinedKind, name, file), overridesWeakDef(canOverrideWeakDef),
6262
privateExtern(isPrivateExtern), includeInSymtab(includeInSymtab),
63-
wasIdenticalCodeFolded(false),
63+
identicalCodeFoldingKind(ICFFoldKind::None),
6464
referencedDynamically(isReferencedDynamically), noDeadStrip(noDeadStrip),
6565
interposable(interposable), weakDefCanBeHidden(isWeakDefCanBeHidden),
6666
weakDef(isWeakDef), external(isExternal), originalIsec(isec),

lld/MachO/Symbols.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,15 @@ class Symbol {
3333
AliasKind,
3434
};
3535

36+
// Enum that describes the type of Identical Code Folding (ICF) applied to a
37+
// symbol. This information is crucial for accurately representing symbol
38+
// sizes in the map file.
39+
enum ICFFoldKind {
40+
None, // No folding is applied.
41+
Body, // The entire body (function or data) is folded.
42+
Thunk // The function body is folded into a single branch thunk.
43+
};
44+
3645
virtual ~Symbol() {}
3746

3847
Kind kind() const { return symbolKind; }
@@ -142,8 +151,8 @@ class Defined : public Symbol {
142151
bool privateExtern : 1;
143152
// Whether this symbol should appear in the output symbol table.
144153
bool includeInSymtab : 1;
145-
// Whether this symbol was folded into a different symbol during ICF.
146-
bool wasIdenticalCodeFolded : 1;
154+
// The ICF folding kind of this symbol: None / Body / Thunk.
155+
ICFFoldKind identicalCodeFoldingKind : 2;
147156
// Symbols marked referencedDynamically won't be removed from the output's
148157
// symbol table by tools like strip. In theory, this could be set on arbitrary
149158
// symbols in input object files. In practice, it's used solely for the

lld/MachO/SyntheticSections.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1231,7 +1231,8 @@ void SymtabSection::emitStabs() {
12311231

12321232
// Constant-folded symbols go in the executable's symbol table, but don't
12331233
// get a stabs entry unless --keep-icf-stabs flag is specified
1234-
if (!config->keepICFStabs && defined->wasIdenticalCodeFolded)
1234+
if (!config->keepICFStabs &&
1235+
defined->identicalCodeFoldingKind == Symbol::ICFFoldKind::Body)
12351236
continue;
12361237

12371238
ObjFile *file = defined->getObjectFile();

lld/MachO/Target.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ class TargetInfo {
7474
uint64_t selrefVA,
7575
Symbol *objcMsgSend) const = 0;
7676

77+
// Init 'thunk' so that it is a direct jump to 'branchTarget'.
78+
virtual void initICFSafeThunkBody(InputSection *thunk,
79+
InputSection *branchTarget) const {
80+
llvm_unreachable("target does not support ICF safe thunks");
81+
}
82+
83+
virtual uint32_t getICFSafeThunkSize() const {
84+
llvm_unreachable("target does not support ICF safe thunks");
85+
}
86+
7787
// Symbols may be referenced via either the GOT or the stubs section,
7888
// depending on the relocation type. prepareSymbolRelocation() will set up the
7989
// GOT/stubs entries, and resolveSymbolVA() will return the addresses of those

0 commit comments

Comments
 (0)