-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[DebugInfo] Preserve line and column number when merging debug info. #129960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This patch introduces a new option `-preserve-merged-debug-info` to preserve an arbitrary version of debug information when DILocations are merged. This is intended to be used in production environments from which sample based profiles are derived such as AutoFDO and MemProf. With this patch we have see a 0.2% improvement on an internal workload at Google when generating AutoFDO profiles. It also significantly improves the ability for MemProf by preserving debug info for merged call instructions used in the contextual profile. Co-authored-by: Krzysztof Pszeniczny <[email protected]>
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-debuginfo Author: Snehasish Kumar (snehasish) ChangesThis patch introduces a new option With this patch we have see a 0.2% improvement on an internal workload at Google when generating AutoFDO profiles. It also significantly improves the ability for MemProf by preserving debug info for merged call instructions used in the contextual profile. Full diff: https://github.com/llvm/llvm-project/pull/129960.diff 2 Files Affected:
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f975d4ca33ad9..b18db09820068 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
#include <numeric>
#include <optional>
@@ -34,6 +35,12 @@ cl::opt<bool> EnableFSDiscriminator(
cl::desc("Enable adding flow sensitive discriminators"));
} // namespace llvm
+// When true, preserves line and column number by picking one of the merged
+// location info in a deterministic manner to assist sample based PGO.
+static cl::opt<bool> PreserveMergedDebugInfo(
+ "preserve-merged-debug-info", cl::init(false), cl::Hidden,
+ cl::desc("Preserve line and column number when merging locations."));
+
uint32_t DIType::getAlignInBits() const {
return (getTag() == dwarf::DW_TAG_LLVM_ptrauth_type ? 0 : SubclassData32);
}
@@ -125,6 +132,14 @@ DILocation *DILocation::getMergedLocation(DILocation *LocA, DILocation *LocB) {
if (LocA == LocB)
return LocA;
+ if (PreserveMergedDebugInfo) {
+ auto A = std::make_tuple(LocA->getLine(), LocA->getColumn(),
+ LocA->getDiscriminator());
+ auto B = std::make_tuple(LocB->getLine(), LocB->getColumn(),
+ LocB->getDiscriminator());
+ return A < B ? LocA : LocB;
+ }
+
LLVMContext &C = LocA->getContext();
using LocVec = SmallVector<const DILocation *>;
diff --git a/llvm/test/DebugInfo/preserve-merged-debug-info.ll b/llvm/test/DebugInfo/preserve-merged-debug-info.ll
new file mode 100644
index 0000000000000..d2b5af0d2d2c3
--- /dev/null
+++ b/llvm/test/DebugInfo/preserve-merged-debug-info.ll
@@ -0,0 +1,65 @@
+; RUN: opt %s -passes=simplifycfg -hoist-common-insts -preserve-merged-debug-info -S | FileCheck %s
+; CHECK: tail call i32 @bar{{.*!dbg !}}[[TAG:[0-9]+]]
+; CHECK: ![[TAG]] = !DILocation(line: 9, column: 16, scope: !9)
+
+; ModuleID = '../llvm/test/DebugInfo/Inputs/debug-info-merge-call.c'
+source_filename = "../llvm/test/DebugInfo/Inputs/debug-info-merge-call.c"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test(i32 noundef %n) local_unnamed_addr #0 !dbg !9 {
+entry:
+ %call = tail call i32 @foo(i32 noundef %n) #2, !dbg !12
+ %cmp1 = icmp sgt i32 %n, 100, !dbg !13
+ br i1 %cmp1, label %if.then, label %if.else, !dbg !13
+
+if.then: ; preds = %entry
+ %call2 = tail call i32 @bar(i32 noundef %n) #2, !dbg !14
+ %add = add nsw i32 %call2, %call, !dbg !15
+ br label %if.end, !dbg !16
+
+if.else: ; preds = %entry
+ %call4 = tail call i32 @bar(i32 noundef %n) #2, !dbg !17
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %r.0 = phi i32 [ %add, %if.then ], [ %call4, %if.else ], !dbg !18
+ ret i32 %r.0, !dbg !19
+}
+
+declare !dbg !20 i32 @foo(i32 noundef) local_unnamed_addr #1
+
+declare !dbg !21 i32 @bar(i32 noundef) local_unnamed_addr #1
+
+attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+!llvm.ident = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 21.0.0git ([email protected]:snehasish/llvm-project.git 6ce41db6b0275d060d6e60f88b96a1657024345c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "../llvm/test/DebugInfo/Inputs/debug-info-merge-call.c", directory: "/usr/local/google/home/snehasishk/working/llvm-project/build-assert", checksumkind: CSK_MD5, checksum: "ac1be6c40dad11691922d600f9d55c55")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{!"clang version 21.0.0git ([email protected]:snehasish/llvm-project.git 6ce41db6b0275d060d6e60f88b96a1657024345c)"}
+!9 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 5, type: !10, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = !DISubroutineType(types: !11)
+!11 = !{}
+!12 = !DILocation(line: 7, column: 13, scope: !9)
+!13 = !DILocation(line: 8, column: 8, scope: !9)
+!14 = !DILocation(line: 9, column: 16, scope: !9)
+!15 = !DILocation(line: 9, column: 14, scope: !9)
+!16 = !DILocation(line: 10, column: 3, scope: !9)
+!17 = !DILocation(line: 11, column: 10, scope: !9)
+!18 = !DILocation(line: 0, scope: !9)
+!19 = !DILocation(line: 13, column: 3, scope: !9)
+!20 = !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!21 = !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized)
+
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My understanding of this is that with this flag on, a consistent but essentially arbitrary source location will be kept for merged instructions -- potentially misleading for debugging (hence it's an option), but good for PGO, yes?
I wonder how effective this is in the general case, but you've got numbers for a workload, so SGTM.
attributes #0 = { nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } | ||
attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } | ||
attributes #2 = { nounwind } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit -- if we can remove or reduce these attributes that'd be appreciated, it's a future maintenance burden when attributes change otherwise.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
; RUN: opt %s -passes=simplifycfg -hoist-common-insts -preserve-merged-debug-info -S | FileCheck %s | ||
; CHECK: tail call i32 @bar{{.*!dbg !}}[[TAG:[0-9]+]] | ||
; CHECK: ![[TAG]] = !DILocation(line: 9, column: 16, scope: !9) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A file-level comment recording the intention of this test will assist future developers who cause this test to fail.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You know, we have unit tests for this API with decent fixtures:
TEST_F(DILocationTest, Merge) { |
Maybe this is more trouble than it's worth, but you could make the cl::opt non-static, declare it in the unittest, modify it directly (PreserveMergedDebugInfo = true/false
) and test it that way. No strong preference on my part, though,
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer to leave this as is since the current test also covers the SimplifyCfg transformations usage of getMergedLocation
. This is the case we found to be particularly impactful to preserve.
@@ -0,0 +1,65 @@ | |||
; RUN: opt %s -passes=simplifycfg -hoist-common-insts -preserve-merged-debug-info -S | FileCheck %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the test could be made more robust by adding CHECKs to show that simplifycfg has done the hoisting (at the moment, I think this test would pass even if the hoisting didn't take place?).
Possibly also worth checking with -preserve-merged-debug-info=false
too (that the instruction gets a merged loc) to avoid this test spuriously passing if, for whatever reason, the default behaviour changes to a non-merged-loc for this test? YMMV.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done for both, thanks for the suggestion.
The commit message should say "arbitrary but deterministic." If it's just arbitrary, the workload got lucky and it might not remain lucky. If it's deterministic, the luck won't change. |
This seems like a pretty substantial departure from LLVM's documented approach to instruction line information - a policy that was established predominantly as an agreement/unified needs between Google (for profile accuracy) and Sony (for user debuggability/not so "jumpy" stepping). So this deserves a bit more discussion to better understand why we're considering departing from that established decision now. For instance, this seems like it does the wrong thing for PGO, as far I've understood the goals, for an example like this:
The call to The premise so far has been that that change is harmful for users (they might incorrectly conclude that since line 6 is reached, then Could you provide more context for how this ^ benefits PGO? Or which cases do benefit? |
Thanks for taking a look @dwblaikie.
We do not intend to change the behaviour for scenarios where users are stepping through the binary. This behaviour is intended to be enabled in production where sample profiles are collected, similar in spirit to
I believe this notion of attribution for AutoFDO is outdated. Today hotness inference for a basic block relies on branch target information. In your example, we would use the debug information attached to the if statement to determine the hotness of the callsites for f2 and f3.
Debug info loss on merged indirect calls is particularly important since it disallows indirect call promotion. The net impact of retaining a deterministic version of merged debug info for a large workload is positive. MemProf is a context sensitive profile which is particularly susceptible to loss of debug information on direct and indirect calls. This is part of a broader effort to improve profile quality for AutoFDO and MemProf by addressing missing debug info. I think we will find additional opportunities and once we have a better idea, I can update the documentation and guidance at https://llvm.org/docs/HowToUpdateDebugInfo.html. Does that sound reasonable to you? |
Thanks for taking a look @pogo59 @jmorse!
Yes, determinism is necessary for accurate profile attribution. Updated the PR description.
Yes, it's a net positive for PGO and not meant to be used in interactive debugging sessions. |
That's hard to separate -people still want to debug their production binaries.
Hrm :/ Sort of following, thanks.
Could you walk me through this in more detail - exactly what DWARF is generated, how does it get consumed, and what decisions are made based on it?
I'd be inclined to figure out the principles here a bit earlier/somewhat up front, before we make changes. I'm especially concerned about a divergence between what's good for profilers and what's good for humans. |
I think this is probably the most important thing: documenting the principles of what we do, why, what AFDO needs, what profilers need, etc, what modes we have, and what we expect them to do. Over a long enough time horizon, it seems like we'd want to make this new behavior part of |
Documenting it seems good - perhaps more than has been posted so far (the AFDO documentation doesn't discuss the issue of "samples of a source location inside a block could lead to misconstrued assumptions about that block being reachable"). But even then, I'm still really worried that this makes the needs of different consumers divergent to a problematic degree - we managed to make the direction align between a variety of consumers (humans, profilers, profile driven optimization, etc) so far and diverging from that seems like a real problem. (even just at Google - humans use profiles from this same debug info, and if we tell them that some block is hot when it isn't, that seems potentially really unfortunate) |
Here is a pointer to a prior discussion on merging value profile metadata which notes the need to retain debug information to be able to assign the right profile information to indirect calls. For AFDO we rely on the start line of the function (needs
I agree - with this change we are no longer have a single aligned implementation for all users. However, the cost we pay in terms of performance for alignment increases with time as more aggressive optimizations are introduced. For newer optimizations such as MemProf the loss of debug information on call insts is a significant contributor to coverage loss. Basically, the trade off we have encoded in LLVM increases the downside for PGO with the passage of time. With this change I want us to be able to quantify the impact in production. It's possible we will encounter situations where this is not ideal (e.g. crash stack traces) but that's ok in my opinion. In the short term we can disable the change for those affected binaries and in the long term, gives us motivation to champion work on improving source location fidelity for humans and tools. Are there any further changes / clarifications I can make to help move this along? @rnk @dwblaikie |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the docs!
This makes me wonder if we should look into some higher-level integration testing that tracks the number of call sites that lack source locations, since call site source locations are critical for so many applications.
llvm/docs/HowToUpdateDebugInfo.rst
Outdated
In particular, loss of location information for calls inhibit optimizations | ||
such as indirect call promotion. This behavior can be optionally enabled until | ||
support for accurately representing merged instructions in the line table is | ||
implemented. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are branch instruction locations also critical? I've had more conversations with other sample PGO folks, and my mental model is that PGO constructs branch weights from LBR (Last Branch Record) counters. So, the key to getting accurate branch weights is not having deterministic, distinct source locations on every instruction in the basic block, it's tracking deterministic, distinct source locations on every branch instruction. I think it turns out that, in practice, branch instructions are control instructions, so they tend to either folded away, duplicated in order (inlining, unswitching), or left alone. It doesn't really make sense to speculate a branch instruction. The folding we do just needs to be extremely careful about tracking slocs, just like it has to care about updating branch weights.
If that's accurate, I think it would be helpful to document that maintaining deterministic and distinct source locations are what's important for sample PGO. I think this would go a long way to helping motivate why we can safely retain more source locations on out-of-order instructions without negatively impacting sample PGO results.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are branch instruction locations also critical?
Yes, though the design of sample attribution mitigates this somewhat by inferring block weight from the max of all instructions in the basic block. Calls are more important since they carry additional metadata apart from their own execution count as noted earlier.
Updated text, ptal.
I share this concern about aligning the needs . At the end of the day, the compiler only emits one kind of debug info, and it's used for many purposes. We need to make reasonable tradeoffs. I view this copt as a form of short term tech debt to unblock current PGO use cases. It doesn't feel reasonable to me to insist that at this moment, to uphold the desired PGO invariant that all call sites have determinstic source locations, that we stop now and come up with a design for that up front. As long as we have agreement that we need such a design and that this is the long term goal, I'm comfortable living with this (I won't even say short term, because everything lives longer than you expect) tech debt tradeoff. |
To say more (I should edit more before replying), I think documenting what PGO needs from debug info is a necessary step on the path to creating this alignment. We have some of this in HowToUpdateDebugInfo.rst, but a lot of these requirements for interactive stepping, PGO control dependency, and sanitizer reports have largely been in our heads, and that makes it harder for the LLVM project to function as a globally distributed open source community. Which is why I think it's a good tradeoff to accept some tech debt for improved documentation. |
+1 to this. We've had similar discussions about profile information propagation before.
Yes, the need for distinction also motivated the work on DWARF path discriminators for duplicated basic blocks. More recent work on flow sensitive AFDO has extended the use of discriminators to enhance profile accuracy particularly in the backend.
Let me know if you want me to file an issue or if you want any further clarification on SamplePGO needs in the debug info documentation. Thanks! |
@@ -34,6 +35,12 @@ cl::opt<bool> EnableFSDiscriminator( | |||
cl::desc("Enable adding flow sensitive discriminators")); | |||
} // namespace llvm | |||
|
|||
// When true, preserves line and column number by picking one of the merged |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like to have a more descriptive name for this option, since we are discarding information, we're not preserving half of the merged source location. I came up with some suggestions:
- -merge-source-locs-by-picking
- -pick-merged-source-locations
- -merge-source-locations-with-selection
- -merge-source-locations-by-choice
- -no-approximate-merged-source-locs
I'm really just trying various synonyms for "pick", "select", "choose".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, "pick-merged-source-locations" sounds good to me.
@@ -125,6 +132,14 @@ DILocation *DILocation::getMergedLocation(DILocation *LocA, DILocation *LocB) { | |||
if (LocA == LocB) | |||
return LocA; | |||
|
|||
if (PreserveMergedDebugInfo) { | |||
auto A = std::make_tuple(LocA->getLine(), LocA->getColumn(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should filenames be compared here? DILocation has getFilename
and getDirectory
, which return StringRefs, so you can fill them in here in a reasonably straightforward way.
However, given the use case, would it be reasonable to always pick LocA
?
Regardless, I think adding directories and files would follow the principle of least surprise.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I've added getFilename
and getDirectory
to the comparison. Note that SamplePGO doesn't rely on file and directory today when using instruction locations.
would it be reasonable to always pick LocA
I think the pick policy should be independent of the order of the parameters passed to this function. So getMergedLocation(A, B) == getMergedLocation(B, A)
.
; RUN: opt %s -passes=simplifycfg -hoist-common-insts -preserve-merged-debug-info -S | FileCheck %s | ||
; CHECK: tail call i32 @bar{{.*!dbg !}}[[TAG:[0-9]+]] | ||
; CHECK: ![[TAG]] = !DILocation(line: 9, column: 16, scope: !9) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You know, we have unit tests for this API with decent fixtures:
TEST_F(DILocationTest, Merge) { |
Maybe this is more trouble than it's worth, but you could make the cl::opt non-static, declare it in the unittest, modify it directly (PreserveMergedDebugInfo = true/false
) and test it that way. No strong preference on my part, though,
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks!
I'll wait a day and merge this on Friday morning if there are no more comments. Thanks! |
// rather than computing a merged location using line 0, which is typically | ||
// not useful for PGO. | ||
if (PickMergedSourceLocations) { | ||
auto A = std::make_tuple(LocA->getLine(), LocA->getColumn(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Does std::forward_as_tuple
work here (to avoid copies)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, the only place it would be useful is if getFilename
and getDirectory
returned string, but they return StringRef. Also you'd have to rewrite this to do the comparison without the auto A
and auto B
vars or you'd introduce lifetime issues if you just replaced make_tuple
with forward_as_tuple
.
This patch introduces a new option
-preserve-merged-debug-info
to preserve an arbitrary but deterministic version of debug information when DILocations are merged. This is intended to be used in production environments from which sample based profiles are derived such as AutoFDO and MemProf.With this patch we have see a 0.2% improvement on an internal workload at Google when generating AutoFDO profiles. It also significantly improves the ability for MemProf by preserving debug info for merged call instructions used in the contextual profile.