Reduce llvm-gsymutil memory usage #91023
@llvm/pr-subscribers-debuginfo

Author: Kevin Frei (kevinfrei)

Changes

llvm-gsymutil eats a lot of RAM. On some large binaries, it causes OOMs on smaller hardware, consuming well over 64GB of RAM. This change frees line tables once we're done with them, and frees each DWARFUnit's DIEs when we finish processing that DU, though they may get reconstituted if other DUs reference them during processing. Once the conversion is complete, all DIEs are freed. The reduction in peak memory usage from these changes was between 7% and 12% in my tests. The double-checked locking around the creation & freeing of the data structures was tested on a 166-core system, validating that it trivially malfunctioned without the locks (or with stupid reordering of the locks) and worked reliably with them.

Full diff: https://github.com/llvm/llvm-project/pull/91023.diff

3 Files Affected:
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 80c27aea893123..9614aab8bb9b50 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -22,6 +22,7 @@
#include "llvm/DebugInfo/DWARF/DWARFLocationExpression.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/RWMutex.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -257,6 +258,9 @@ class DWARFUnit {
std::shared_ptr<DWARFUnit> DWO;
+ mutable llvm::sys::RWMutex m_cu_die_array_mutex;
+ mutable llvm::sys::RWMutex m_all_die_array_mutex;
+
protected:
friend dwarf_linker::parallel::CompileUnit;
@@ -566,6 +570,9 @@ class DWARFUnit {
Error tryExtractDIEsIfNeeded(bool CUDieOnly);
+ /// clearDIEs - Clear parsed DIEs to keep memory usage low.
+ void clearDIEs(bool KeepCUDie);
+
private:
/// Size in bytes of the .debug_info data associated with this compile unit.
size_t getDebugInfoSize() const {
@@ -581,9 +588,6 @@ class DWARFUnit {
void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
std::vector<DWARFDebugInfoEntry> &DIEs) const;
- /// clearDIEs - Clear parsed DIEs to keep memory usage low.
- void clearDIEs(bool KeepCUDie);
-
/// parseDWO - Parses .dwo file for current compile unit. Returns true if
/// it was actually constructed.
/// The \p AlternativeLocation specifies an alternative location to get
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index bdd04b00f557bd..cc79d9ec7130c9 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -496,13 +496,29 @@ void DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
}
Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) {
- if ((CUDieOnly && !DieArray.empty()) ||
- DieArray.size() > 1)
- return Error::success(); // Already parsed.
-
- bool HasCUDie = !DieArray.empty();
- extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
-
+ {
+ llvm::sys::ScopedReader Lock(m_cu_die_array_mutex);
+ if ((CUDieOnly && !DieArray.empty()) || DieArray.size() > 1)
+ return Error::success(); // Already parsed.
+ }
+ bool HasCUDie = false;
+ {
+ llvm::sys::ScopedWriter Lock(m_cu_die_array_mutex);
+ if ((CUDieOnly && !DieArray.empty()) || DieArray.size() > 1)
+ return Error::success(); // Already parsed.
+ HasCUDie = !DieArray.empty();
+ extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
+ }
+ {
+ llvm::sys::ScopedReader Lock(m_all_die_array_mutex);
+ if (DieArray.empty())
+ return Error::success();
+
+ // If CU DIE was just parsed, copy several attribute values from it.
+ if (HasCUDie)
+ return Error::success();
+ }
+ llvm::sys::ScopedWriter Lock(m_all_die_array_mutex);
if (DieArray.empty())
return Error::success();
@@ -658,6 +674,8 @@ void DWARFUnit::clearDIEs(bool KeepCUDie) {
// It depends on the implementation whether the request is fulfilled.
// Create a new vector with a small capacity and assign it to the DieArray to
// have previous contents freed.
+ llvm::sys::ScopedWriter CULock(m_cu_die_array_mutex);
+ llvm::sys::ScopedWriter AllLock(m_all_die_array_mutex);
DieArray = (KeepCUDie && !DieArray.empty())
? std::vector<DWARFDebugInfoEntry>({DieArray[0]})
: std::vector<DWARFDebugInfoEntry>();
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
index 601686fdd3dd51..4a1ed91a7244f2 100644
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -587,6 +587,11 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) {
DWARFDie Die = getDie(*CU);
CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
handleDie(Out, CUI, Die);
+ // Release the line table, once we're done.
+ DICtx.clearLineTableForUnit(CU.get());
+ // Free any DIEs that were allocated by the DWARF parser.
+ // If/when they're needed by other CU's, they'll be recreated.
+ CU->clearDIEs(false);
}
} else {
// LLVM Dwarf parser is not thread-safe and we need to parse all DWARF up
@@ -612,11 +617,16 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) {
DWARFDie Die = getDie(*CU);
if (Die) {
CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
- pool.async([this, CUI, &LogMutex, &Out, Die]() mutable {
+ pool.async([this, CUI, &CU, &LogMutex, &Out, Die]() mutable {
std::string storage;
raw_string_ostream StrStream(storage);
OutputAggregator ThreadOut(Out.GetOS() ? &StrStream : nullptr);
handleDie(ThreadOut, CUI, Die);
+ // Release the line table once we're done.
+ DICtx.clearLineTableForUnit(CU.get());
+ // Free any DIEs that were allocated by the DWARF parser.
+ // If/when they're needed by other CU's, they'll be recreated.
+ CU->clearDIEs(false);
// Print ThreadLogStorage lines into an actual stream under a lock
std::lock_guard<std::mutex> guard(LogMutex);
if (Out.GetOS()) {
@@ -629,6 +639,9 @@ Error DwarfTransformer::convert(uint32_t NumThreads, OutputAggregator &Out) {
}
pool.wait();
}
+ // Now get rid of all the DIEs that may have been recreated
+ for (const auto &CU : DICtx.compile_units())
+ CU->clearDIEs(false);
size_t FunctionsAddedCount = Gsym.getNumFunctionInfos() - NumBefore;
Out << "Loaded " << FunctionsAddedCount << " functions from DWARF.\n";
return Error::success();
@clayborg @jeffreytan81 could one of you please take a look?
Per my understanding, these two mutexes are required to protect any read/write of the DieArray field? If so, there seem to be many more accesses of this field that are not protected?
They're there to protect the allocation and population, or the destruction, of the DieArray, not general reads/writes of those fields. There need to be two (one for just the first CU DIE, and one for the whole array) because populating the whole array can trigger a request to populate the first CU DIE.
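To make that re-entrancy concern concrete, here is a stripped-down model of the two-phase extraction. It is illustrative only: it uses plain std::mutex instead of llvm::sys::RWMutex, and FakeUnit, extractCUDie, and extractAllDies are hypothetical names, not the DWARFUnit API.

```cpp
#include <mutex>
#include <vector>

struct FakeUnit {
  std::vector<int> DieArray; // stand-in for the parsed DIEs
  std::mutex CUDieMutex;     // guards creation of element 0 only
  std::mutex AllDieMutex;    // guards creation of the rest

  // Populate just the unit (CU) DIE. It can be reached re-entrantly from
  // extractAllDies(), which is why it must not share AllDieMutex.
  void extractCUDie() {
    std::lock_guard<std::mutex> Lock(CUDieMutex);
    if (!DieArray.empty())
      return;              // double-checked: someone already did it
    DieArray.push_back(0); // "parse" the CU DIE
  }

  // Populate every DIE. Mid-population it needs the CU DIE, mimicking the
  // nested request described above.
  void extractAllDies() {
    std::lock_guard<std::mutex> Lock(AllDieMutex);
    if (DieArray.size() > 1)
      return;              // already fully parsed
    extractCUDie();        // fine: different mutex; with one shared
                           // non-recursive mutex this nested acquire
                           // would deadlock
    for (int I = 1; I < 4; ++I)
      DieArray.push_back(I); // "parse" the remaining DIEs
  }
};

int main() {
  FakeUnit U;
  U.extractAllDies();
  return U.DieArray.size() == 4 ? 0 : 1;
}
```

Collapsing the two mutexes into a single plain mutex turns the nested extractCUDie() call into a self-deadlock, which is the hazard the split avoids.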
I am fine with the GSYM parts. I will let the LLVM DWARF experts accept the changes in DWARFUnit.h/.cpp
What is the code path for that?
Sorry: I wrote that backwards (fixed the original comment). Populating the full array can trigger a request to populate the CuDie through determineStringOffsetsTableContribution.
Glanced at the code. I think I am missing something. How can code going through determineStringOffsetsTableContribution trigger the path? It just accesses the StrOffsets section.
Yeah, I didn't see it by looking at the code, because the recursion occurs inside getUnitDIE() (in the .h file), which calls extractDIEsIfNeeded(...) |
ping @ayermolo @dwblaikie This is blocking some dwarf-health tracking work I'm trying to make headway on. The only 'stylistic' adjustment I can see making sense is to have some sort of double-checked locking abstraction that takes a lock and a couple of lambdas to make the duplicated code cleaner, which I'm happy to do if someone wants it, but I haven't seen something like that elsewhere.
OK. I think this two-lock thing makes sense in my head.
Oooh! You're right: there's a spot in there where bad things could happen. I'll get that fixed.
@ayermolo I used an rwlock model to prevent the issue you identified. Good catch!
To alleviate copy-pasta problems, I've added a local DoubleCheckedRWLocker helper function that takes the "check if it's already been initialized" lambda and the initializer lambda (and the lock). I feel like the result is a much cleaner version of the original diff, plus it has the third lock to prevent frees from occurring between the single-CU and array-CU initialization code that @ayermolo saw. @dwblaikie they tell me you're the guy who needs to approve this, now...
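The helper itself isn't quoted in this thread, so here is a minimal sketch of what a check-then-initialize wrapper around a reader-writer lock might look like. It uses std::shared_mutex to stay self-contained (the patch itself uses llvm::sys::RWMutex with ScopedReader/ScopedWriter), and the name and signature below are illustrative, not necessarily what landed.

```cpp
#include <shared_mutex>

// Take the shared lock to run the "already done?" check, and only fall through
// to the exclusive lock (re-checking under it) when the initializer actually
// has to run.
template <typename CheckFn, typename InitFn>
void doubleCheckedRWLock(std::shared_mutex &M, CheckFn IsDone, InitFn Init) {
  {
    std::shared_lock<std::shared_mutex> Read(M); // cheap, concurrent path
    if (IsDone())
      return;
  }
  std::unique_lock<std::shared_mutex> Write(M); // exclusive path
  if (IsDone()) // re-check: another thread may have won the race
    return;
  Init();
}
```

A caller would pass something like `[&] { return DieArray.size() > 1; }` as the check and the body of the extraction as the initializer.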
Ah, I thought this was all in gsym, so hadn't given the review much attention. Are these independent changes - the line table clearing stuff presumably is independent, for instance? I guess the unit DIE clearing is dependent on the extra locking being added? The extra locking is unfortunate/more difficult to just straight up approve, as we'd previously restricted the multithreading support to the DWARFContext by having distinct contexts - so that non-multithreaded code didn't take the hit or complexity for the multithreaded code. But this new codepath and locking is present for single and multithreaded uses alike, by the looks of it? Not sure what to do about that...
I'm only doing one "real" thing here, so no, the changes can't be split into smaller, still useful/sensible pieces (except that I could delay the gsymutil change until after the dwarfunit support for releasing the memory is in, which doesn't seem worthwhile)
I spent about 3 days staring at how to accomplish this in a way that wouldn't increase the cost of single-threaded code, and couldn't come up with anything other than copying the entire class and making a multi-threaded 'freeable' duplicate, which seemed like an awful lot of code duplication to save a few hundred CPU cycles in code that is egregiously I/O bound. (I have a 166-core server: gsymutil works fastest when it's run on 6-10 cores; any more swamps the NVMe drive.)
All that said, apparently something broke a bunch of tests (I hadn't run them since my last couple of changes), so I'm not blocked on your review just yet :/
Is it worth having 3 separate mutexes? One answer would be to go fully coarse-grained - just have 1 that covers the whole array. Does that have noticeably worse performance for your use case? The other option would be 2 - you have one covering the first element of the array, and another covering the rest - I guess then you have problems when reallocating the array, though?
I started with what you're describing, and it deadlocks, as there's a path from the non-CU DIE creation that eventually recurses into the function (which I called out in the review above)
Yup. That's what @ayermolo caught as well. You need 3 locks. No amount of wishful thinking is gonna change that (I tried. Many different ways. Several times. 🫤)
Thanks, sorry I'm rehashing things. A recursive mutex would apply here, then? (& we even have this thing: https://llvm.org/doxygen/Mutex_8h_source.html - which would reduce the overhead (presumably?) to the non-multithreaded users (well, I guess only to users building all of LLVM without multithreading, not users building with multithreading enabled and just not using multiple threads to interact with libDebugInfoDWARF)?) Does a single recursive mutex substantially affect the performance of the situations you're interested in?
Yes. You need a reader-writer mutex, as the reader scenario is very common but conflicts enough to matter quite a bit. Any other performance doesn't matter at all. Even in single-threaded mode, the locking overhead isn't likely to register, as the work protected by the write lock is deeply I/O bound (and the existing lock is already one of the LLVM smart locks, so it shouldn't be a cost). If you're reading this a whole lot, it could potentially slow things down, but I'm not sure what that scenario is, outside of the one I'm looking at in GSYM. How common is it to try to extract the DIEs multiple times in a scenario where it's not processing an entire gigantic dependency graph from storage? I do not claim to understand all potential uses/scenarios of this code, but with my current understanding, that seems like a very unlikely scenario. I'm happy to learn of a counter-example.
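A tiny illustration of the point about readers, using standard C++ types rather than the LLVM wrappers: once the DIEs are parsed, every later caller only needs the shared (reader) side of the lock, so the "already parsed?" checks from many threads proceed concurrently, whereas a plain or recursive mutex would serialize them.

```cpp
#include <shared_mutex>
#include <thread>
#include <vector>

std::shared_mutex M;
std::vector<int> Parsed = {0}; // pretend extraction already happened

bool alreadyParsed() {
  std::shared_lock<std::shared_mutex> Read(M); // many threads hold this at once
  return !Parsed.empty();
}

int main() {
  std::vector<std::thread> Workers;
  for (int I = 0; I < 8; ++I)
    Workers.emplace_back([] {
      for (int J = 0; J < 1000; ++J)
        (void)alreadyParsed();
    });
  for (auto &T : Workers)
    T.join();
  return 0;
}
```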
Fair enough - thanks for your patience walking me through it
// Helper to safely check if the Compile-Unit DIE has been extracted already.
// If not, then extract it, and return false, indicating that it was *not*
// already extracted.
bool DWARFUnit::extractCUDieIfNeeded(bool CUDieOnly, bool &HasCUDie) { |
Is this function misnamed? It looks like it extracts more than only the CUDie (if CUDieOnly is false, it extracts all the DIEs, right?) So maybe this should be called "extractDiesIfNeeded"?
Yup, you're correct: this is allocating (and extracting) all the DIEs. What's protected by the second lock appears to be extracting the string (offset), range, and location tables for the DIEs that were just extracted. Would renaming things to extractDIEsIfNeeded / extractTablesForDIEsIfNeeded seem reasonable?
(I'm also going to see if I can free those tables in a later diff, as I'd love to drive memory usage lower than it currently is even with this change)
It seems this commit caused a build failure on the clang-ppc64le-linux-multistage build no. 583. Please take a look at it and post a fix, or revert your patch if the fix will take too long to implement. Let me know if there is any way I can help or extra information I can offer.
Please revert it. Looks like the PPC64le build doesn't like lambdas very much. I'll try to refactor it without the lambdas, but I won't be able to do it in the next few hours :/
Reverts #91023 Build break found in clang-ppc64le-linux-multistage build no. 583.
llvm-gsymutil eats a lot of RAM. On some large binaries, it causes OOMs on smaller hardware, consuming well over 64GB of RAM. This change frees line tables once we're done with them, and frees each DWARFUnit's DIEs when we finish processing that DU, though they may get reconstituted if other DUs reference them during processing. Once the conversion is complete, all DIEs are freed. The reduction in peak memory usage from these changes was between 7% and 12% in my tests. The double-checked locking around the creation & freeing of the data structures was tested on a 166-core system. I validated that it trivially malfunctioned without the locks (and with stupid reordering of the locks) and worked reliably with them. --------- Co-authored-by: Kevin Frei <[email protected]>
llvm-gsymutil eats a lot of RAM. On some large binaries, it causes OOMs on smaller hardware, consuming well over 64GB of RAM. This change frees line tables once we're done with them, and frees each DWARFUnit's DIEs when we finish processing that DU, though they may get reconstituted if other DUs reference them during processing. Once the conversion is complete, all DIEs are freed. The reduction in peak memory usage from these changes was between 7% and 12% in my tests. A recursive mutex is necessary to prevent accidental freeing of the DIE arrays while they're being extracted. It needs to be recursive because there's a recursive path through the final section of the code (determineStringOffsetsTableContribution) that may result in a call back into this function.
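Going by the wording of that re-landed commit message, extraction and freeing appear to share one recursive mutex, so the recursive path back into extraction just re-acquires the lock and clearDIEs() can't free the array mid-extraction. A rough model of that shape, with illustrative names rather than the actual DWARFUnit code:

```cpp
#include <mutex>
#include <vector>

struct UnitModel {
  std::recursive_mutex FreeDIEsMutex;
  std::vector<int> DieArray;

  void extractDIEsIfNeeded() {
    std::lock_guard<std::recursive_mutex> Lock(FreeDIEsMutex);
    if (!DieArray.empty())
      return;                 // already parsed
    DieArray.push_back(0);    // "parse" the CU DIE
    determineStringOffsets(); // may call back into this function
  }

  void determineStringOffsets() {
    // Models the recursive path named in the commit message: it can ask for
    // the unit DIE again, re-entering extractDIEsIfNeeded(), which simply
    // re-acquires the recursive mutex and returns early.
    extractDIEsIfNeeded();
  }

  void clearDIEs() {
    // Same mutex: the DIE array can't be freed out from under a thread that
    // is still extracting.
    std::lock_guard<std::recursive_mutex> Lock(FreeDIEsMutex);
    std::vector<int>().swap(DieArray); // actually release the storage
  }
};

int main() {
  UnitModel U;
  U.extractDIEsIfNeeded();
  U.clearDIEs();
  return U.DieArray.empty() ? 0 : 1;
}
```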