Skip to content

Commit 6529d7c

Browse files
committed
[PDB] Defer relocating .debug$S until commit time and parallelize it
This is a pretty classic optimization. Instead of processing symbol records and copying them to temporary storage, do a first pass to measure how large the module symbol stream will be, and then copy the data into place in the PDB file. This requires deferring relocation until much later, which accounts for most of the complexity in this patch. This patch avoids copying the contents of all live .debug$S sections into heap memory, which is worth about 20% of private memory usage when making PDBs. However, this is not an unmitigated performance win, because it can be faster to read dense, temporary, heap data than it is to iterate symbol records in object-file-backed memory a second time. Results on release chrome.dll: peak mem: 5164.89MB -> 4072.19MB (-1,092.7MB, -21.2%); wall-j1: 0m30.844s -> 0m32.094s (slightly slower); wall-j3: 0m20.968s -> 0m20.312s (slightly faster); wall-j8: 0m19.062s -> 0m17.672s (meaningfully faster). I gathered similar numbers for a debug, component build of content.dll in Chrome, and the performance impact of this change was in the noise. The memory usage reduction was visible and similar. Because of the new parallelism in the PDB commit phase, more cores make the new approach faster. I'm assuming that most C++ developer machines these days are at least quad-core, so I think this is a win. Differential Revision: https://reviews.llvm.org/D94267
1 parent 5c7dcd7 commit 6529d7c

File tree

6 files changed

+628
-292
lines changed

6 files changed

+628
-292
lines changed

lld/COFF/Chunks.cpp

Lines changed: 77 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -369,47 +369,88 @@ void SectionChunk::writeTo(uint8_t *buf) const {
369369
continue;
370370
}
371371

372-
uint8_t *off = buf + rel.VirtualAddress;
372+
applyRelocation(buf + rel.VirtualAddress, rel);
373+
}
374+
}
373375

374-
auto *sym =
375-
dyn_cast_or_null<Defined>(file->getSymbol(rel.SymbolTableIndex));
376+
void SectionChunk::applyRelocation(uint8_t *off,
377+
const coff_relocation &rel) const {
378+
auto *sym = dyn_cast_or_null<Defined>(file->getSymbol(rel.SymbolTableIndex));
376379

377-
// Get the output section of the symbol for this relocation. The output
378-
// section is needed to compute SECREL and SECTION relocations used in debug
379-
// info.
380-
Chunk *c = sym ? sym->getChunk() : nullptr;
381-
OutputSection *os = c ? c->getOutputSection() : nullptr;
382-
383-
// Skip the relocation if it refers to a discarded section, and diagnose it
384-
// as an error if appropriate. If a symbol was discarded early, it may be
385-
// null. If it was discarded late, the output section will be null, unless
386-
// it was an absolute or synthetic symbol.
387-
if (!sym ||
388-
(!os && !isa<DefinedAbsolute>(sym) && !isa<DefinedSynthetic>(sym))) {
389-
maybeReportRelocationToDiscarded(this, sym, rel);
390-
continue;
391-
}
380+
// Get the output section of the symbol for this relocation. The output
381+
// section is needed to compute SECREL and SECTION relocations used in debug
382+
// info.
383+
Chunk *c = sym ? sym->getChunk() : nullptr;
384+
OutputSection *os = c ? c->getOutputSection() : nullptr;
392385

393-
uint64_t s = sym->getRVA();
386+
// Skip the relocation if it refers to a discarded section, and diagnose it
387+
// as an error if appropriate. If a symbol was discarded early, it may be
388+
// null. If it was discarded late, the output section will be null, unless
389+
// it was an absolute or synthetic symbol.
390+
if (!sym ||
391+
(!os && !isa<DefinedAbsolute>(sym) && !isa<DefinedSynthetic>(sym))) {
392+
maybeReportRelocationToDiscarded(this, sym, rel);
393+
return;
394+
}
394395

395-
// Compute the RVA of the relocation for relative relocations.
396-
uint64_t p = rva + rel.VirtualAddress;
397-
switch (config->machine) {
398-
case AMD64:
399-
applyRelX64(off, rel.Type, os, s, p);
400-
break;
401-
case I386:
402-
applyRelX86(off, rel.Type, os, s, p);
403-
break;
404-
case ARMNT:
405-
applyRelARM(off, rel.Type, os, s, p);
406-
break;
407-
case ARM64:
408-
applyRelARM64(off, rel.Type, os, s, p);
396+
uint64_t s = sym->getRVA();
397+
398+
// Compute the RVA of the relocation for relative relocations.
399+
uint64_t p = rva + rel.VirtualAddress;
400+
switch (config->machine) {
401+
case AMD64:
402+
applyRelX64(off, rel.Type, os, s, p);
403+
break;
404+
case I386:
405+
applyRelX86(off, rel.Type, os, s, p);
406+
break;
407+
case ARMNT:
408+
applyRelARM(off, rel.Type, os, s, p);
409+
break;
410+
case ARM64:
411+
applyRelARM64(off, rel.Type, os, s, p);
412+
break;
413+
default:
414+
llvm_unreachable("unknown machine type");
415+
}
416+
}
417+
418+
// Defend against unsorted relocations. This may be overly conservative.
419+
void SectionChunk::sortRelocations() {
420+
auto cmpByVa = [](const coff_relocation &l, const coff_relocation &r) {
421+
return l.VirtualAddress < r.VirtualAddress;
422+
};
423+
if (llvm::is_sorted(getRelocs(), cmpByVa))
424+
return;
425+
warn("some relocations in " + file->getName() + " are not sorted");
426+
MutableArrayRef<coff_relocation> newRelocs(
427+
bAlloc.Allocate<coff_relocation>(relocsSize), relocsSize);
428+
memcpy(newRelocs.data(), relocsData, relocsSize * sizeof(coff_relocation));
429+
llvm::sort(newRelocs, cmpByVa);
430+
setRelocs(newRelocs);
431+
}
432+
433+
// Similar to writeTo, but suitable for relocating a subsection of the overall
434+
// section.
435+
void SectionChunk::writeAndRelocateSubsection(ArrayRef<uint8_t> sec,
436+
ArrayRef<uint8_t> subsec,
437+
uint32_t &nextRelocIndex,
438+
uint8_t *buf) const {
439+
assert(!subsec.empty() && !sec.empty());
440+
assert(sec.begin() <= subsec.begin() && subsec.end() <= sec.end() &&
441+
"subsection is not part of this section");
442+
size_t vaBegin = std::distance(sec.begin(), subsec.begin());
443+
size_t vaEnd = std::distance(sec.begin(), subsec.end());
444+
memcpy(buf, subsec.data(), subsec.size());
445+
for (; nextRelocIndex < relocsSize; ++nextRelocIndex) {
446+
const coff_relocation &rel = relocsData[nextRelocIndex];
447+
// Skip relocations applied before this subsection.
448+
if (rel.VirtualAddress < vaBegin)
449+
continue;
450+
// Stop if the relocation does not apply to this subsection.
451+
if (rel.VirtualAddress >= vaEnd)
409452
break;
410-
default:
411-
llvm_unreachable("unknown machine type");
412-
}
453+
applyRelocation(&buf[rel.VirtualAddress - vaBegin], rel);
413454
}
414455
}
415456

lld/COFF/Chunks.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,15 @@ class SectionChunk final : public Chunk {
204204
ArrayRef<uint8_t> getContents() const;
205205
void writeTo(uint8_t *buf) const;
206206

207+
// Defend against unsorted relocations. This may be overly conservative.
208+
void sortRelocations();
209+
210+
// Write and relocate a portion of the section. This is intended to be called
211+
// in a loop. Relocations must be sorted first.
212+
void writeAndRelocateSubsection(ArrayRef<uint8_t> sec,
213+
ArrayRef<uint8_t> subsec,
214+
uint32_t &nextRelocIndex, uint8_t *buf) const;
215+
207216
uint32_t getOutputCharacteristics() const {
208217
return header->Characteristics & (permMask | typeMask);
209218
}
@@ -212,6 +221,7 @@ class SectionChunk final : public Chunk {
212221
}
213222
void getBaserels(std::vector<Baserel> *res);
214223
bool isCOMDAT() const;
224+
void applyRelocation(uint8_t *off, const coff_relocation &rel) const;
215225
void applyRelX64(uint8_t *off, uint16_t type, OutputSection *os, uint64_t s,
216226
uint64_t p) const;
217227
void applyRelX86(uint8_t *off, uint16_t type, OutputSection *os, uint64_t s,

0 commit comments

Comments
 (0)