[CodeLayout] Do not flip branch condition when using optsize #114607
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86

Author: Ellis Hoag (ellishg)

Changes
Full diff: https://github.com/llvm/llvm-project/pull/114607.diff

3 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index d1dced9ef28dca..bdad63f368dfec 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2906,7 +2906,7 @@ void MachineBlockPlacement::buildCFGChains() {
void MachineBlockPlacement::optimizeBranches() {
BlockChain &FunctionChain = *BlockToChain[&F->front()];
- SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
// Now that all the basic blocks in the chain have the proper layout,
// make a final call to analyzeBranch with AllowModify set.
@@ -2916,24 +2916,30 @@ void MachineBlockPlacement::optimizeBranches() {
// a fallthrough when it occurs after predicated terminators.
for (MachineBasicBlock *ChainBB : FunctionChain) {
Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
- if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
- // If PrevBB has a two-way branch, try to re-order the branches
- // such that we branch to the successor with higher probability first.
- if (TBB && !Cond.empty() && FBB &&
- MBPI->getEdgeProbability(ChainBB, FBB) >
- MBPI->getEdgeProbability(ChainBB, TBB) &&
- !TII->reverseBranchCondition(Cond)) {
- LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
- << getBlockName(ChainBB) << "\n");
- LLVM_DEBUG(dbgs() << " Edge probability: "
- << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
- << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
- DebugLoc dl; // FIXME: this is nowhere
- TII->removeBranch(*ChainBB);
- TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
- }
- }
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true))
+ continue;
+ if (!TBB || !FBB || Cond.empty())
+ continue;
+ // If we are optimizing for size we do not consider the runtime performance.
+ // Instead, we retain the original branch condition so we have more uniform
+ // instructions which will benefit ICF.
+ if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
+ continue;
+ // If ChainBB has a two-way branch, try to re-order the branches
+ // such that we branch to the successor with higher probability first.
+ if (MBPI->getEdgeProbability(ChainBB, TBB) >=
+ MBPI->getEdgeProbability(ChainBB, FBB))
+ continue;
+ if (TII->reverseBranchCondition(Cond))
+ continue;
+ LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
+ << getBlockName(ChainBB) << "\n");
+ LLVM_DEBUG(dbgs() << " " << getBlockName(TBB) << " < " << getBlockName(FBB)
+ << "\n");
+ auto Dl = ChainBB->findBranchDebugLoc();
+ TII->removeBranch(*ChainBB);
+ TII->insertBranch(*ChainBB, FBB, TBB, Cond, Dl);
}
}
diff --git a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
new file mode 100644
index 00000000000000..3645718968f9e3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; When consuming profile data we sometimes flip a branch to improve runtime
+; performance. If we are optimizing for size, we avoid changing the branch to
+; improve outlining and ICF.
+
+define i8 @foo_optsize(i32 %v4) optsize {
+; CHECK-LABEL: foo_optsize:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbz wzr, .LBB0_2
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_2: // %b1
+; CHECK-NEXT: cbnz w0, .LBB0_4
+; CHECK-NEXT: .LBB0_3: // %b2
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_4: // %b1
+; CHECK-NEXT: cmp w0, #1
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.5: // %b3
+; CHECK-NEXT: cbz wzr, .LBB0_1
+; CHECK-NEXT: b .LBB0_3
+entry:
+ %v2 = icmp eq i32 0, 0
+ br i1 %v2, label %b1, label %b4
+
+b1:
+ switch i32 %v4, label %b4 [
+ i32 1, label %b3
+ i32 0, label %b2
+ ], !prof !0
+
+b2:
+ br label %b4
+
+b3:
+ %v3 = icmp eq i32 0, 0
+ br i1 %v3, label %b4, label %b2
+
+b4:
+ %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ]
+ ret i8 %v16
+}
+
+define i8 @foo_optspeed(i32 %v4) {
+; CHECK-LABEL: foo_optspeed:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbz wzr, .LBB1_2
+; CHECK-NEXT: .LBB1_1:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2: // %b1
+; CHECK-NEXT: cbnz w0, .LBB1_4
+; CHECK-NEXT: .LBB1_3: // %b2
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_4: // %b1
+; CHECK-NEXT: cmp w0, #1
+; CHECK-NEXT: b.ne .LBB1_1
+; CHECK-NEXT: // %bb.5: // %b3
+; CHECK-NEXT: cbnz wzr, .LBB1_3
+; CHECK-NEXT: b .LBB1_1
+entry:
+ %v2 = icmp eq i32 0, 0
+ br i1 %v2, label %b1, label %b4
+
+b1:
+ switch i32 %v4, label %b4 [
+ i32 1, label %b3
+ i32 0, label %b2
+ ], !prof !0
+
+b2:
+ br label %b4
+
+b3:
+ %v3 = icmp eq i32 0, 0
+ br i1 %v3, label %b4, label %b2
+
+b4:
+ %v16 = phi i8 [ 1, %b2 ], [ 0, %entry ], [ 0, %b3 ], [ 0, %b1 ]
+ ret i8 %v16
+}
+
+!0 = !{!"branch_weights", i32 5, i32 5, i32 100}
diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll
index 88a132d3850d1d..9e0a19f9a504f2 100644
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll
+++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll
@@ -303,10 +303,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; CHECK32-NEXT: .LBB3_8: # %if.else
; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1
; CHECK32-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3]
-; CHECK32-NEXT: jb .LBB3_11 # encoding: [0x72,A]
-; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
-; CHECK32-NEXT: jmp .LBB3_9 # encoding: [0xeb,A]
+; CHECK32-NEXT: jae .LBB3_9 # encoding: [0x73,A]
; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
+; CHECK32-NEXT: jmp .LBB3_11 # encoding: [0xeb,A]
+; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
; CHECK32-NEXT: .LBB3_12: # %sw.bb22
; CHECK32-NEXT: # in Loop: Header=BB3_1 Depth=1
; CHECK32-NEXT: movzbl (%eax), %ebx # encoding: [0x0f,0xb6,0x18]
@@ -483,10 +483,10 @@ define zeroext i1 @pr31257(ptr nocapture readonly dereferenceable(8) %s) minsize
; WIN64-NEXT: # %bb.6: # %sw.bb
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: cmpl $45, %r9d # encoding: [0x41,0x83,0xf9,0x2d]
-; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A]
-; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; WIN64-NEXT: jmp .LBB3_8 # encoding: [0xeb,A]
+; WIN64-NEXT: jne .LBB3_8 # encoding: [0x75,A]
; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
+; WIN64-NEXT: jmp .LBB3_10 # encoding: [0xeb,A]
+; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
; WIN64-NEXT: .LBB3_7: # %sw.bb14
; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1
; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
Interesting: by this logic, a simpler thing to do would probably be to ignore the whole profile when optimizing for size. But I suspect this change is geared towards hot functions with just some cold basic blocks...
Do we have any signal / flag to get an indication whether the user cares about ICF? I feel slightly uncomfortable in that this may regress users optimizing for size but not using any form of ICF / outlining...
Actually, would it make sense to have a flag/mode (during profile loading or similar) that preprocesses the profile and removes all branch probabilities from cold blocks when optimizing for size? That would be nice in that it affects other optimizations as well, and the logic would be decoupled from this particular pass...
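A minimal sketch of that suggestion (hypothetical, not part of this patch), assuming an IR-level preprocessing step that strips branch-weight metadata from size-optimized functions so that downstream passes see no profile-based probabilities:

#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical preprocessing: drop !prof branch weights in functions that
// are optimized for size, so later passes keep the original branch layout.
// A fuller version would also consult ProfileSummaryInfo for cold blocks.
static void stripBranchWeightsForSize(Module &M) {
  for (Function &Fn : M) {
    if (!Fn.hasOptSize())
      continue;
    for (BasicBlock &BB : Fn)
      if (Instruction *Term = BB.getTerminator())
        Term->setMetadata(LLVMContext::MD_prof, nullptr);
  }
}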
This is motivated by a scenario where binary size regresses when PGO is used. Note that even if we don't consume PGO data, branch probabilities can still come from static heuristics. Basically, we still want to read profile data to improve the performance of critical functions, but we should avoid it for cold or optsize functions.
In this case, flipping the branch does not impact code size because the number of instructions does not change. This optimization is purely for performance, which is not necessary for size-optimized code. FWIW, this edge case seems to be super rare. The code size win I saw was very small, but still noticeable.
// If we are optimizing for size we do not consider the runtime performance.
// Instead, we retain the original branch condition so we have more uniform
// instructions which will benefit ICF.
if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()))
An alternative is not to call optimizeBranches() at all in runOnMachineFunction() if the function is being optimized for size. Any thoughts on this?
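For illustration, a sketch of that alternative (hypothetical; F is the pass's MachineFunction member). It gates only on the optsize attribute; a fuller version would use the profile-guided shouldOptimizeForSize() query instead:

// Hypothetical gating inside MachineBlockPlacement::runOnMachineFunction():
// skip the whole branch-reordering step for size-optimized functions.
if (!F->getFunction().hasOptSize())
  optimizeBranches();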
I think this is the only place that calls analyzeBranch() with AllowModify = true, though. I think that part is required for things like removing jump instructions to the immediately following block, so we better not skip it...
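For context, a minimal usage sketch of that contract (names match the patch above): with AllowModify = true, analyzeBranch() is permitted to delete terminators it can prove redundant, such as an unconditional jump to the block that now immediately follows in layout.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Returns true if the block's terminators were understood. analyzeBranch()
// itself returns true on *failure*, and with AllowModify = true it may also
// erase branches that have become redundant after layout.
static bool analyzeAndSimplify(const TargetInstrInfo *TII,
                               MachineBasicBlock &MBB) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  return !TII->analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/true);
}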
Exactly, I saw a very slight improvement with this rather than skipping optimizeBranches() entirely, because of the AllowModify = true.
I see, makes sense. Of course, we should remove unnecessary branches anyway. It does seem a bit strange that here we check whether a block needs to be optimized for size and not the function; I assume there is a reason for that too?
Btw, is it guaranteed that the loop iterates over all blocks in the function? It uses a chain to extract blocks for some reason, instead of iterating over function blocks directly.
I think if ~~a block is cold, then the function must also be cold~~ the function is cold, then the block is also cold. So by passing the block instead of the function to shouldOptimizeForSize() we are catching more cases.
I'm not sure why it's iterating over chains. Maybe I should try to iterate over blocks to see if there is any difference.
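A sketch of that experiment (hypothetical; F, TII, and Cond are the pass members and locals used above): iterate the function's blocks directly instead of walking FunctionChain. After buildCFGChains() the function's block order matches the chain layout, so both orders should visit the same blocks.

// Hypothetical: visit blocks in their final layout order directly.
for (MachineBasicBlock &MBB : *F) {
  Cond.clear();
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  if (TII->analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/true))
    continue;
  // ... the remaining early-continue checks from the patch ...
}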
I think if a block is cold, then the function must also be cold.
Should it be the other way around?
I changed the iterator to iterate over blocks instead of chains, and I saw negligible size changes. I'm not sure if there is some reason we are using chains, or if we can switch all of them to iterate over blocks. I'll have to do more research on this.
It seems this code was first added in 776e6de
if (MBPI->getEdgeProbability(ChainBB, TBB) >=
    MBPI->getEdgeProbability(ChainBB, FBB))
  continue;
if (TII->reverseBranchCondition(Cond))
I'd move this check to immediately after line 2923. It pertains to verifying the legality of a target, rather than assessing its profitability.
Actually, reverseBranchCondition() will itself reverse the condition if possible, so we want to run this last.
llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp, lines 551 to 559 in f895fc9:

bool ARMBaseInstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
  if (Cond.size() == 2) {
    ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
    Cond[0].setImm(ARMCC::getOppositeCondition(CC));
    return false;
  }
  return true;
}
lgtm
Do not consider profile data when choosing a successor block to sink into for optsize functions. This should result in more consistent instruction sequences, which will improve outlining and ICF. We've observed a slight codesize improvement in a large binary. This is similar reasoning to #114607. Using profile data to select a block to sink into was originally added in d04f759.
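A hedged sketch of what that follow-up describes (hypothetical helper, not the actual MachineSink code): under optsize, pick a sink target without consulting profile data; otherwise prefer the most probable successor.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"

using namespace llvm;

// Hypothetical helper: under optsize, take the first legal candidate so the
// output is uniform (better for outlining and ICF); otherwise pick the
// successor with the highest edge probability.
static MachineBasicBlock *
pickSinkBlock(MachineBasicBlock *From, ArrayRef<MachineBasicBlock *> Cands,
              ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI,
              const MachineBranchProbabilityInfo *MBPI) {
  if (Cands.empty())
    return nullptr;
  if (shouldOptimizeForSize(From->getParent(), PSI, MBFI))
    return Cands.front();
  return *llvm::max_element(Cands, [&](auto *A, auto *B) {
    return MBPI->getEdgeProbability(From, A) <
           MBPI->getEdgeProbability(From, B);
  });
}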