[GlobalISel][AArch64] AArch64O0PreLegalizerCombiner: Disable fixed-point iteration #94291
Conversation
…int iteration

This adds an option to CombinerInfo to turn off the fixed-point iteration in the Combiner. This option is then used for AArch64O0PreLegalizerCombiner. The combines there are simple enough that the code-quality impact should be minimal with the current heuristics (instructions are processed from the top to the bottom of the basic block; new/changed instructions are added back to the worklist). Test changes are due to some instructions not being DCE'd, which has no actual impact because InstructionSelect performs DCE as well.

AArch64 CTMark O0: -0.9% compile-time (instruction count), no impact on size..text.
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: Tobias Stadler (tobias-stadler)

Changes: This adds an option to CombinerInfo to turn off the fixed-point iteration in the Combiner. This option is then used for AArch64O0PreLegalizerCombiner. The combines there are simple enough that the code-quality impact should be minimal with the current heuristics (instructions are processed from the top to the bottom of the basic block; new/changed instructions are added back to the worklist). Test changes are due to some instructions not being DCE'd, which has no actual impact because InstructionSelect performs DCE as well.

AArch64 CTMark O0: -0.9% compile-time (instruction count), no impact on size..text.

Full diff: https://github.com/llvm/llvm-project/pull/94291.diff

4 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
index 13a8faf955a7e..63cbbb41dedaf 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h
@@ -49,6 +49,11 @@ struct CombinerInfo {
bool EnableOptSize;
/// Whether we're optimizing for minsize (-Oz).
bool EnableMinSize;
+
+ /// Whether the Combiner repeatedly iterates over all instructions until no
+ /// combine can be applied. Disabling this improves compile-time, but the IR
+ /// might not get transformed completely.
+ bool EnableFixedPointIteration = true;
};
} // namespace llvm
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index d18e65a83484f..745d355db2df1 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -166,7 +166,7 @@ bool Combiner::combineMachineInstrs() {
WLObserver->reportFullyCreatedInstrs();
}
MFChanged |= Changed;
- } while (Changed);
+ } while (Changed && CInfo.EnableFixedPointIteration);
#ifndef NDEBUG
if (CSEInfo) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
index 17dd8f2314a2b..83e4b023673b4 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
@@ -165,6 +165,8 @@ bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, /*EnableOpt*/ false,
F.hasOptSize(), F.hasMinSize());
+ CInfo.EnableFixedPointIteration = false;
+
AArch64O0PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB,
/*CSEInfo*/ nullptr, RuleConfig, ST);
return Impl.combineMachineInstrs();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
index 5ab086ffd2c13..c4e07de265edd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll
@@ -28,6 +28,7 @@ define i32 @foo() {
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s32) from @var1)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.3
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
@@ -35,19 +36,19 @@ define i32 @foo() {
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[GV3:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var2
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: G_STORE [[C4]](s32), [[GV3]](p0) :: (store (s32) into @var2)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; CHECK-NEXT: G_STORE [[C5]](s32), [[GV3]](p0) :: (store (s32) into @var2)
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
; CHECK-NEXT: [[GV4:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var1
- ; CHECK-NEXT: G_STORE [[C5]](s32), [[GV4]](p0) :: (store (s32) into @var1)
+ ; CHECK-NEXT: G_STORE [[C6]](s32), [[GV4]](p0) :: (store (s32) into @var1)
; CHECK-NEXT: [[GV5:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var3
- ; CHECK-NEXT: G_STORE [[C4]](s32), [[GV5]](p0) :: (store (s32) into @var3)
- ; CHECK-NEXT: G_STORE [[C5]](s32), [[GV4]](p0) :: (store (s32) into @var1)
+ ; CHECK-NEXT: G_STORE [[C5]](s32), [[GV5]](p0) :: (store (s32) into @var3)
+ ; CHECK-NEXT: G_STORE [[C6]](s32), [[GV4]](p0) :: (store (s32) into @var1)
; CHECK-NEXT: G_BR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.if.end:
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: $w0 = COPY [[C6]](s32)
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C7]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
entry:
%0 = load i32, ptr @var1, align 4
@@ -84,6 +85,7 @@ define i32 @darwin_tls() {
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s32) from @var1)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.3
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
@@ -96,8 +98,8 @@ define i32 @darwin_tls() {
; CHECK-NEXT: G_BR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.if.end:
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: $w0 = COPY [[C2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C3]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
entry:
%0 = load i32, ptr @var1, align 4
@@ -127,6 +129,7 @@ define i32 @imm_cost_too_large_cost_of_2() {
; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C1]]
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]]
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
@@ -147,8 +150,8 @@ define i32 @imm_cost_too_large_cost_of_2() {
; CHECK-NEXT: bb.4.if.end:
; CHECK-NEXT: [[GV5:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var3
; CHECK-NEXT: G_STORE [[CONSTANT_FOLD_BARRIER]](s32), [[GV5]](p0) :: (store (s32) into @var3)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: $w0 = COPY [[C3]](s32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C4]](s32)
; CHECK-NEXT: RET_ReallyLR implicit $w0
entry:
%0 = load i32, ptr @var1, align 4
@@ -183,6 +186,7 @@ define i64 @imm_cost_too_large_cost_of_4() {
; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s64) = G_CONSTANT_FOLD_BARRIER [[C1]]
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
@@ -203,8 +207,8 @@ define i64 @imm_cost_too_large_cost_of_4() {
; CHECK-NEXT: bb.4.if.end:
; CHECK-NEXT: [[GV5:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var3_64
; CHECK-NEXT: G_STORE [[CONSTANT_FOLD_BARRIER]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: $x0 = COPY [[C3]](s64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: $x0 = COPY [[C4]](s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
entry:
%0 = load i64, ptr @var1_64, align 4
@@ -239,6 +243,7 @@ define i64 @f64_imm_cost_too_high(double %a) {
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s64) from @var1_64, align 4)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
@@ -259,8 +264,8 @@ define i64 @f64_imm_cost_too_high(double %a) {
; CHECK-NEXT: bb.4.if.end:
; CHECK-NEXT: [[GV5:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var3_64
; CHECK-NEXT: G_STORE [[C]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: $x0 = COPY [[C3]](s64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: $x0 = COPY [[C4]](s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
entry:
%0 = load i64, ptr @var1_64, align 4
@@ -294,6 +299,7 @@ define i64 @f64_imm_cheap(double %a) {
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s64) from @var1_64, align 4)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
; CHECK-NEXT: G_BR %bb.2
; CHECK-NEXT: {{ $}}
@@ -301,24 +307,24 @@ define i64 @f64_imm_cheap(double %a) {
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[GV3:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var2_64
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00
- ; CHECK-NEXT: G_STORE [[C3]](s64), [[GV3]](p0) :: (store (s64) into @var2_64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00
+ ; CHECK-NEXT: G_STORE [[C4]](s64), [[GV3]](p0) :: (store (s64) into @var2_64)
; CHECK-NEXT: G_BR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.if.then2:
; CHECK-NEXT: successors: %bb.4(0x80000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00
; CHECK-NEXT: [[GV4:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var1_64
- ; CHECK-NEXT: G_STORE [[C4]](s64), [[GV4]](p0) :: (store (s64) into @var1_64)
+ ; CHECK-NEXT: G_STORE [[C5]](s64), [[GV4]](p0) :: (store (s64) into @var1_64)
; CHECK-NEXT: G_BR %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4.if.end:
; CHECK-NEXT: [[GV5:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var3_64
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00
- ; CHECK-NEXT: G_STORE [[C5]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
- ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: $x0 = COPY [[C6]](s64)
+ ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00
+ ; CHECK-NEXT: G_STORE [[C6]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
+ ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: $x0 = COPY [[C7]](s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
entry:
%0 = load i64, ptr @var1_64, align 4
For -O0, we use a special group of combines:
Is the one-round vs. fixed-point iteration issue limited to the -O0 combiner, or is it a general issue of our combiners? One advertisement of the GISel combiner was that it could write a JSON file to disk after a run. It would contain insights about which combines hit, which are nice to have, and compile-time statistics.
In my opinion, it's a general issue: the fixed-point iteration unnecessarily burns a lot of compile-time in every combiner. Turning fixed-point iteration off for O0 is inconsequential according to CTMark, and the optnone_combines are very simple, so I don't expect many regressions in other benchmarks. Anything above O0 requires a more elaborate discussion. Turning it off for AArch64PreLegalizerCombiner and AArch64PostLegalizerCombiner and running CTMark O2, I get:
All the code-size regressions originate from AArch64PreLegalizerCombiner and I still need to investigate them further. My hope is that a small additional heuristic in Combiner::WorklistMaintainer will fix most of the regressions, but my guess is that there will always be missed optimizations without fixed-point iteration for more complex combines, so the argument for disabling fixed-point iteration for O2 is not as clear-cut as for O0. Maybe O1?
Instead of the boolean, could this be a maximum iteration count?
How are you collecting the compile_instructions metric?
I don't see where this would be needed. The overwhelming majority of work is done in the first iteration. Limiting the number of iterations to 2 already performs identically to fixed-point iteration on CTMark O2.
I run the test-suite on Linux under an AArch64 cross-toolchain (RelWithDebInfo clang, official ARM GNU linker/libc binaries) using …
This would better match how instcombine handles this.
lgtm but I think we can do with fewer statistics
STATISTIC(NumTwoIterations, "Number of functions with two iterations");
STATISTIC(NumThreeIterations, "Number of functions with three iterations");
STATISTIC(NumFourOrMoreIterations,
          "Number of functions with four or more iterations");
Not sure all this variety is needed? We can probably prune this from instcombine now
One, Two and Three are hit in CTMark. I have removed Four.
@@ -165,6 +165,8 @@ bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false,
                     F.hasOptSize(), F.hasMinSize());
  CInfo.MaxIterations = 1;
Please add a comment, perhaps calling back to this PR number, so readers will know why we set it to 1.
Continues the work for disabling fixed-point iteration in the Combiner (#94291). This introduces improved Observer-based heuristics in the GISel Combiner to retry combining the defs/uses of modified instructions and to perform sparse dead code elimination. I have experimented a lot with the heuristics, and this seems to be the minimal set that allows disabling fixed-point iteration for AArch64 CTMark O2 without regressions. Enabling this globally would pass all regression tests for all official targets (apart from small benign diffs), but I have made it fully opt-in for now, because I can't quantify the impact for other targets. This should mostly be on par with how the worklist-aware functions in the InstCombiner and DAGCombiner handle rescheduling instructions for recombining. For performance numbers, see my follow-up patch for AArch64 (#102167).

Pull Request: #102163
This adds an option to CombinerInfo to turn off the fixed-point iteration in the Combiner. This option is then used for AArch64O0PreLegalizerCombiner. The combines there are simple enough that code quality impact should be minimal with the current heuristics (instructions are processed from top to bottom of the basic block, new/changed instructions are added back to the worklist). Test changes are due to some instructions not being DCE'd, which has no actual impact because InstructionSelect performs DCE as well.
AArch64 CTMark O0:
-0.9% geomean compile-time (instruction count), no regressions
no change in size..text for any of the benchmarks