Skip to content

Commit 3b4d800

Browse files
committed
[ELF] Parallelize writes of different OutputSections
We currently process one OutputSection at a time and for each OutputSection write contained input sections in parallel. This strategy does not leverage multi-threading well. Instead, parallelize writes of different OutputSections.

The default TaskSize for parallelFor often leads to inferior sharding. We prepare the task in the caller instead.

* Move llvm::parallel::detail::TaskGroup to llvm::parallel::TaskGroup
* Add llvm::parallel::TaskGroup::execute.
* Change writeSections to declare TaskGroup and pass it to writeTo.

Speed-up with --threads=8:

* clang -DCMAKE_BUILD_TYPE=Release: 1.11x as fast
* clang -DCMAKE_BUILD_TYPE=Debug: 1.10x as fast
* chrome -DCMAKE_BUILD_TYPE=Release: 1.04x as fast
* scylladb build/release: 1.09x as fast

On M1, many benchmarks are a small fraction of a percentage faster. Mozilla showed the largest difference with the patch being about 1.03x as fast.

Differential Revision: https://reviews.llvm.org/D131247
1 parent e854c17 commit 3b4d800

File tree

8 files changed

+131
-58
lines changed

8 files changed

+131
-58
lines changed

lld/ELF/OutputSections.cpp

Lines changed: 65 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
332332

333333
// Write uncompressed data to a temporary zero-initialized buffer.
334334
auto buf = std::make_unique<uint8_t[]>(size);
335-
writeTo<ELFT>(buf.get());
335+
{
336+
parallel::TaskGroup tg;
337+
writeTo<ELFT>(buf.get(), tg);
338+
}
336339
// We chose 1 (Z_BEST_SPEED) as the default compression level because it is
337340
// the fastest. If -O2 is given, we use level 6 to compress debug info more by
338341
// ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
@@ -386,7 +389,8 @@ static void writeInt(uint8_t *buf, uint64_t data, uint64_t size) {
386389
llvm_unreachable("unsupported Size argument");
387390
}
388391

389-
template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
392+
template <class ELFT>
393+
void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
390394
llvm::TimeTraceScope timeScope("Write sections", name);
391395
if (type == SHT_NOBITS)
392396
return;
@@ -419,41 +423,68 @@ template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
419423
}
420424

421425
// Write leading padding.
422-
SmallVector<InputSection *, 0> storage;
423426
ArrayRef<InputSection *> sections = getInputSections(*this, storage);
424427
std::array<uint8_t, 4> filler = getFiller();
425428
bool nonZeroFiller = read32(filler.data()) != 0;
426429
if (nonZeroFiller)
427430
fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
428431

429-
parallelFor(0, sections.size(), [&](size_t i) {
430-
InputSection *isec = sections[i];
431-
if (auto *s = dyn_cast<SyntheticSection>(isec))
432-
s->writeTo(buf + isec->outSecOff);
433-
else
434-
isec->writeTo<ELFT>(buf + isec->outSecOff);
435-
436-
// Fill gaps between sections.
437-
if (nonZeroFiller) {
438-
uint8_t *start = buf + isec->outSecOff + isec->getSize();
439-
uint8_t *end;
440-
if (i + 1 == sections.size())
441-
end = buf + size;
432+
auto fn = [=](size_t begin, size_t end) {
433+
size_t numSections = sections.size();
434+
for (size_t i = begin; i != end; ++i) {
435+
InputSection *isec = sections[i];
436+
if (auto *s = dyn_cast<SyntheticSection>(isec))
437+
s->writeTo(buf + isec->outSecOff);
442438
else
443-
end = buf + sections[i + 1]->outSecOff;
444-
if (isec->nopFiller) {
445-
assert(target->nopInstrs);
446-
nopInstrFill(start, end - start);
447-
} else
448-
fill(start, end - start, filler);
439+
isec->writeTo<ELFT>(buf + isec->outSecOff);
440+
441+
// Fill gaps between sections.
442+
if (nonZeroFiller) {
443+
uint8_t *start = buf + isec->outSecOff + isec->getSize();
444+
uint8_t *end;
445+
if (i + 1 == numSections)
446+
end = buf + size;
447+
else
448+
end = buf + sections[i + 1]->outSecOff;
449+
if (isec->nopFiller) {
450+
assert(target->nopInstrs);
451+
nopInstrFill(start, end - start);
452+
} else
453+
fill(start, end - start, filler);
454+
}
449455
}
450-
});
456+
};
451457

452-
// Linker scripts may have BYTE()-family commands with which you
453-
// can write arbitrary bytes to the output. Process them if any.
458+
// If there is any BYTE()-family command (rare), write the section content
459+
// first then process BYTE to overwrite the filler content. The write is
460+
// serial due to the limitation of llvm/Support/Parallel.h.
461+
bool written = false;
462+
size_t numSections = sections.size();
454463
for (SectionCommand *cmd : commands)
455-
if (auto *data = dyn_cast<ByteCommand>(cmd))
464+
if (auto *data = dyn_cast<ByteCommand>(cmd)) {
465+
if (!std::exchange(written, true))
466+
fn(0, numSections);
456467
writeInt(buf + data->offset, data->expression().getValue(), data->size);
468+
}
469+
if (written || !numSections)
470+
return;
471+
472+
// There is no data command. Write content asynchronously to overlap the write
473+
// time with other output sections. Note, if a linker script specifies
474+
// overlapping output sections (needs --noinhibit-exec or --no-check-sections
475+
// to suppress the error), the output may be non-deterministic.
476+
const size_t taskSizeLimit = 4 << 20;
477+
for (size_t begin = 0, i = 0, taskSize = 0;;) {
478+
taskSize += sections[i]->getSize();
479+
bool done = ++i == numSections;
480+
if (done || taskSize >= taskSizeLimit) {
481+
tg.execute([=] { fn(begin, i); });
482+
if (done)
483+
break;
484+
begin = i;
485+
taskSize = 0;
486+
}
487+
}
457488
}
458489

459490
static void finalizeShtGroup(OutputSection *os, InputSection *section) {
@@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo<ELF32BE>(ELF32BE::Shdr *Shdr);
673704
template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
674705
template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
675706

676-
template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
677-
template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
678-
template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
679-
template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
707+
template void OutputSection::writeTo<ELF32LE>(uint8_t *,
708+
llvm::parallel::TaskGroup &);
709+
template void OutputSection::writeTo<ELF32BE>(uint8_t *,
710+
llvm::parallel::TaskGroup &);
711+
template void OutputSection::writeTo<ELF64LE>(uint8_t *,
712+
llvm::parallel::TaskGroup &);
713+
template void OutputSection::writeTo<ELF64BE>(uint8_t *,
714+
llvm::parallel::TaskGroup &);
680715

681716
template void OutputSection::maybeCompress<ELF32LE>();
682717
template void OutputSection::maybeCompress<ELF32BE>();

lld/ELF/OutputSections.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "InputSection.h"
1313
#include "LinkerScript.h"
1414
#include "lld/Common/LLVM.h"
15+
#include "llvm/Support/Parallel.h"
1516

1617
#include <array>
1718

@@ -104,7 +105,8 @@ class OutputSection final : public SectionBase {
104105
bool relro = false;
105106

106107
void finalize();
107-
template <class ELFT> void writeTo(uint8_t *buf);
108+
template <class ELFT>
109+
void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
108110
// Check that the addends for dynamic relocations were written correctly.
109111
void checkDynRelAddends(const uint8_t *bufStart);
110112
template <class ELFT> void maybeCompress();
@@ -114,6 +116,8 @@ class OutputSection final : public SectionBase {
114116
void sortCtorsDtors();
115117

116118
private:
119+
SmallVector<InputSection *, 0> storage;
120+
117121
// Used for implementation of --compress-debug-sections option.
118122
CompressedData compressed;
119123

lld/ELF/Writer.cpp

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2839,9 +2839,10 @@ template <class ELFT> void Writer<ELFT>::openFile() {
28392839
}
28402840

28412841
template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
2842+
parallel::TaskGroup tg;
28422843
for (OutputSection *sec : outputSections)
28432844
if (sec->flags & SHF_ALLOC)
2844-
sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
2845+
sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
28452846
}
28462847

28472848
static void fillTrap(uint8_t *i, uint8_t *end) {
@@ -2884,16 +2885,21 @@ template <class ELFT> void Writer<ELFT>::writeTrapInstr() {
28842885
template <class ELFT> void Writer<ELFT>::writeSections() {
28852886
llvm::TimeTraceScope timeScope("Write sections");
28862887

2887-
// In -r or --emit-relocs mode, write the relocation sections first as in
2888-
// Elf_Rel targets we might find out that we need to modify the relocated
2889-
// section while doing it.
2890-
for (OutputSection *sec : outputSections)
2891-
if (sec->type == SHT_REL || sec->type == SHT_RELA)
2892-
sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
2893-
2894-
for (OutputSection *sec : outputSections)
2895-
if (sec->type != SHT_REL && sec->type != SHT_RELA)
2896-
sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
2888+
{
2889+
// In -r or --emit-relocs mode, write the relocation sections first as in
2890+
// Elf_Rel targets we might find out that we need to modify the relocated
2891+
// section while doing it.
2892+
parallel::TaskGroup tg;
2893+
for (OutputSection *sec : outputSections)
2894+
if (sec->type == SHT_REL || sec->type == SHT_RELA)
2895+
sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
2896+
}
2897+
{
2898+
parallel::TaskGroup tg;
2899+
for (OutputSection *sec : outputSections)
2900+
if (sec->type != SHT_REL && sec->type != SHT_RELA)
2901+
sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
2902+
}
28972903

28982904
// Finally, check that all dynamic relocation addends were written correctly.
28992905
if (config->checkDynamicRelocs && config->writeAddends) {

lld/test/ELF/arm-thumb-interwork-notfunc.s

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// REQUIRES: arm
22
// RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
3-
// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
3+
/// Use --threads=1 to keep emitted warnings across sections sequential.
4+
// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
45
// RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
56

67
.syntax unified

lld/test/ELF/hexagon-jump-error.s

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# REQUIRES: hexagon
22
# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
3-
# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
3+
## Use --threads=1 to keep emitted warnings across sections sequential.
4+
# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
45

56
.globl _start
67
.type _start, @function

lld/test/ELF/linkerscript/overlapping-sections.s

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@
8888
# BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
8989
# BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
9090
# BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
91-
# Starting here the contents of .sec2 overwrites .sec1:
92-
# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
91+
## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
92+
# BROKEN-OUTPUT-FILE-NEXT: 8040
9393

9494
# RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
9595
# BAD-BOTH-LABEL: Section Headers:

llvm/include/llvm/Support/Parallel.h

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@ namespace parallel {
3030
extern ThreadPoolStrategy strategy;
3131

3232
namespace detail {
33-
34-
#if LLVM_ENABLE_THREADS
35-
3633
class Latch {
3734
uint32_t Count;
3835
mutable std::mutex Mutex;
@@ -61,20 +58,42 @@ class Latch {
6158
Cond.wait(lock, [&] { return Count == 0; });
6259
}
6360
};
61+
} // namespace detail
6462

6563
class TaskGroup {
66-
Latch L;
64+
detail::Latch L;
6765
bool Parallel;
6866

6967
public:
7068
TaskGroup();
7169
~TaskGroup();
7270

71+
// Spawns a task but does not wait for it to finish.
7372
void spawn(std::function<void()> f);
7473

74+
// Similar to spawn, but executes the task immediately when ThreadsRequested ==
75+
// 1. The difference is to give the following pattern a more intuitive order
76+
// when single threading is requested.
77+
//
78+
// for (size_t begin = 0, i = 0, taskSize = 0;;) {
79+
// taskSize += ...
80+
// bool done = ++i == end;
81+
// if (done || taskSize >= taskSizeLimit) {
82+
// tg.execute([=] { fn(begin, i); });
83+
// if (done)
84+
// break;
85+
// begin = i;
86+
// taskSize = 0;
87+
// }
88+
// }
89+
void execute(std::function<void()> f);
90+
7591
void sync() const { L.sync(); }
7692
};
7793

94+
namespace detail {
95+
96+
#if LLVM_ENABLE_THREADS
7897
const ptrdiff_t MinParallelSize = 1024;
7998

8099
/// Inclusive median.

llvm/lib/Support/Parallel.cpp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,9 @@
1919

2020
llvm::ThreadPoolStrategy llvm::parallel::strategy;
2121

22-
#if LLVM_ENABLE_THREADS
23-
2422
namespace llvm {
2523
namespace parallel {
24+
#if LLVM_ENABLE_THREADS
2625
namespace detail {
2726

2827
namespace {
@@ -143,6 +142,8 @@ Executor *Executor::getDefaultExecutor() {
143142
return Exec.get();
144143
}
145144
} // namespace
145+
} // namespace detail
146+
#endif
146147

147148
static std::atomic<int> TaskGroupInstances;
148149

@@ -159,21 +160,27 @@ TaskGroup::~TaskGroup() {
159160
}
160161

161162
void TaskGroup::spawn(std::function<void()> F) {
163+
#if LLVM_ENABLE_THREADS
162164
if (Parallel) {
163165
L.inc();
164-
Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
166+
detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
165167
F();
166168
L.dec();
167169
});
168-
} else {
169-
F();
170+
return;
170171
}
172+
#endif
173+
F();
171174
}
172175

173-
} // namespace detail
176+
void TaskGroup::execute(std::function<void()> F) {
177+
if (parallel::strategy.ThreadsRequested == 1)
178+
F();
179+
else
180+
spawn(F);
181+
}
174182
} // namespace parallel
175183
} // namespace llvm
176-
#endif // LLVM_ENABLE_THREADS
177184

178185
void llvm::parallelFor(size_t Begin, size_t End,
179186
llvm::function_ref<void(size_t)> Fn) {
@@ -190,7 +197,7 @@ void llvm::parallelFor(size_t Begin, size_t End,
190197
if (TaskSize == 0)
191198
TaskSize = 1;
192199

193-
parallel::detail::TaskGroup TG;
200+
parallel::TaskGroup TG;
194201
for (; Begin + TaskSize < End; Begin += TaskSize) {
195202
TG.spawn([=, &Fn] {
196203
for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)

0 commit comments

Comments
 (0)