Skip to content

Commit fa74144

Browse files
committed
[ELF] Parallelize --compress-debug-sections=zstd
See D117853: compressing debug sections is a bottleneck, so parallelizing this step is highly valuable. zstd provides a multi-threading API, and its output is deterministic even with different numbers of threads (see facebook/zstd#2238). Therefore we can leverage it instead of using the pigz-style sharding approach. Also, switch to the default compression level 3. The current level 5 is significantly slower without providing a size benefit that justifies the cost.
```
'dash b.sh 1' ran
  1.05 ± 0.01 times faster than 'dash b.sh 3'
  1.18 ± 0.01 times faster than 'dash b.sh 4'
  1.29 ± 0.02 times faster than 'dash b.sh 5'

level=1 size: 358946945
level=3 size: 309002145
level=4 size: 307693204
level=5 size: 297828315
```
Reviewed By: andrewng, peter.smith
Differential Revision: https://reviews.llvm.org/D133679
1 parent a9e0dbe commit fa74144

File tree

1 file changed

+50
-12
lines changed

1 file changed

+50
-12
lines changed

lld/ELF/OutputSections.cpp

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
#if LLVM_ENABLE_ZLIB
2525
#include <zlib.h>
2626
#endif
27+
#if LLVM_ENABLE_ZSTD
28+
#include <zstd.h>
29+
#endif
2730

2831
using namespace llvm;
2932
using namespace llvm::dwarf;
@@ -331,25 +334,60 @@ template <class ELFT> void OutputSection::maybeCompress() {
331334
llvm::TimeTraceScope timeScope("Compress debug sections");
332335
compressed.uncompressedSize = size;
333336
auto buf = std::make_unique<uint8_t[]>(size);
337+
// Write uncompressed data to a temporary zero-initialized buffer.
338+
{
339+
parallel::TaskGroup tg;
340+
writeTo<ELFT>(buf.get(), tg);
341+
}
342+
343+
#if LLVM_ENABLE_ZSTD
344+
// Use ZSTD's streaming compression API which permits parallel workers working
345+
// on the stream. See http://facebook.github.io/zstd/zstd_manual.html
346+
// "Streaming compression - HowTo".
334347
if (config->compressDebugSections == DebugCompressionType::Zstd) {
335-
{
336-
parallel::TaskGroup tg;
337-
writeTo<ELFT>(buf.get(), tg);
338-
}
348+
// Allocate a buffer of half of the input size, and grow it by 1.5x if
349+
// insufficient.
339350
compressed.shards = std::make_unique<SmallVector<uint8_t, 0>[]>(1);
340-
compression::zstd::compress(makeArrayRef(buf.get(), size),
341-
compressed.shards[0]);
342-
size = sizeof(Elf_Chdr) + compressed.shards[0].size();
351+
SmallVector<uint8_t, 0> &out = compressed.shards[0];
352+
out.resize_for_overwrite(std::max<size_t>(size / 2, 32));
353+
size_t pos = 0;
354+
355+
ZSTD_CCtx *cctx = ZSTD_createCCtx();
356+
size_t ret = ZSTD_CCtx_setParameter(
357+
cctx, ZSTD_c_nbWorkers, parallel::strategy.compute_thread_count());
358+
if (ZSTD_isError(ret))
359+
fatal(Twine("ZSTD_CCtx_setParameter: ") + ZSTD_getErrorName(ret));
360+
ZSTD_outBuffer zob = {out.data(), out.size(), 0};
361+
ZSTD_EndDirective directive = ZSTD_e_continue;
362+
const size_t blockSize = ZSTD_CStreamInSize();
363+
do {
364+
const size_t n = std::min(size - pos, blockSize);
365+
if (n == size - pos)
366+
directive = ZSTD_e_end;
367+
ZSTD_inBuffer zib = {buf.get() + pos, n, 0};
368+
size_t bytesRemaining = 0;
369+
while (zib.pos != zib.size ||
370+
(directive == ZSTD_e_end && bytesRemaining != 0)) {
371+
if (zob.pos == zob.size) {
372+
out.resize_for_overwrite(out.size() * 3 / 2);
373+
zob.dst = out.data();
374+
zob.size = out.size();
375+
}
376+
bytesRemaining = ZSTD_compressStream2(cctx, &zob, &zib, directive);
377+
assert(!ZSTD_isError(bytesRemaining));
378+
}
379+
pos += n;
380+
} while (directive != ZSTD_e_end);
381+
out.resize(zob.pos);
382+
ZSTD_freeCCtx(cctx);
383+
384+
size = sizeof(Elf_Chdr) + out.size();
343385
flags |= SHF_COMPRESSED;
344386
return;
345387
}
388+
#endif
346389

347390
#if LLVM_ENABLE_ZLIB
348-
// Write uncompressed data to a temporary zero-initialized buffer.
349-
{
350-
parallel::TaskGroup tg;
351-
writeTo<ELFT>(buf.get(), tg);
352-
}
353391
// We chose 1 (Z_BEST_SPEED) as the default compression level because it is
354392
// the fastest. If -O2 is given, we use level 6 to compress debug info more by
355393
// ~15%. We found that level 7 to 9 doesn't make much difference (~1% more

0 commit comments

Comments
 (0)