-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SimplifyCFG] Skip threading if the target may have divergent branches #100185
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SimplifyCFG] Skip threading if the target may have divergent branches #100185
Conversation
Created using spr 1.3.4
@llvm/pr-subscribers-backend-amdgpu Author: None (darkbuck) Changes
Full diff: https://github.com/llvm/llvm-project/pull/100185.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f23e28888931d..1a17524b826a1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3246,7 +3246,12 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
}
/// Return true if we can thread a branch across this block.
-static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
+ const TargetTransformInfo &TTI) {
+ // Skip threading if the branch may be divergent.
+ if (TTI.hasBranchDivergence(BB->getParent()))
+ return false;
+
int Size = 0;
EphemeralValueTracker EphTracker;
@@ -3301,10 +3306,9 @@ static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From,
/// If we have a conditional branch on something for which we know the constant
/// value in predecessors (e.g. a phi node in the current block), thread edges
/// from the predecessor to their ultimate destination.
-static std::optional<bool>
-FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static std::optional<bool> FoldCondBranchOnValueKnownInPredecessorImpl(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues;
BasicBlock *BB = BI->getParent();
Value *Cond = BI->getCondition();
@@ -3332,7 +3336,7 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
// Now we know that this block has multiple preds and two succs.
// Check that the block is small enough and values defined in the block are
// not used outside of it.
- if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ if (!BlockIsSimpleEnoughToThreadThrough(BB, TTI))
return false;
for (const auto &Pair : KnownValues) {
@@ -3459,15 +3463,14 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
}
-static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
- DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static bool FoldCondBranchOnValueKnownInPredecessor(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
std::optional<bool> Result;
bool EverChanged = false;
do {
// Note that None means "we changed things, but recurse further."
- Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC);
+ Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, TTI, AC);
EverChanged |= Result == std::nullopt || *Result;
} while (Result == std::nullopt);
return EverChanged;
@@ -7543,7 +7546,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// If this is a branch on something for which we know the constant value in
// predecessors (e.g. a phi node in the current block), thread control
// through this block.
- if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC))
+ if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, TTI, Options.AC))
return requestResimplify();
// Scan predecessor blocks for conditional branches.
diff --git a/llvm/test/Transforms/SimplifyCFG/AMDGPU/convergent.ll b/llvm/test/Transforms/SimplifyCFG/AMDGPU/convergent.ll
new file mode 100644
index 0000000000000..b1262e294c6d0
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/AMDGPU/convergent.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn -S -passes=simplifycfg < %s | FileCheck %s
+
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
+
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: define i32 @test_01a(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: br i1 [[COND]], label %[[MERGE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[IF_FALSE_2:.*]]
+; CHECK: [[IF_FALSE_2]]:
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/convergent.ll b/llvm/test/Transforms/SimplifyCFG/convergent.ll
index 6ba51e06460c2..d148063589de6 100644
--- a/llvm/test/Transforms/SimplifyCFG/convergent.ll
+++ b/llvm/test/Transforms/SimplifyCFG/convergent.ll
@@ -4,6 +4,9 @@
; RUN: opt -S -passes='simplifycfg<hoist-common-insts;sink-common-insts>' < %s | FileCheck -check-prefixes=CHECK,SINK %s
declare void @foo() convergent
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
declare i32 @tid()
declare i32 @mbcnt(i32 %a, i32 %b) convergent
declare i32 @bpermute(i32 %a, i32 %b) convergent
@@ -45,6 +48,42 @@ exit:
ret i32 %a
}
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: @test_01a(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_CRITEDGE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: exit.critedge:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
+
define void @test_02(ptr %y.coerce) convergent {
; NOSINK-LABEL: @test_02(
; NOSINK-NEXT: entry:
|
@llvm/pr-subscribers-llvm-transforms Author: None (darkbuck) Changes
Full diff: https://github.com/llvm/llvm-project/pull/100185.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f23e28888931d..1a17524b826a1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3246,7 +3246,12 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
}
/// Return true if we can thread a branch across this block.
-static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB,
+ const TargetTransformInfo &TTI) {
+ // Skip threading if the branch may be divergent.
+ if (TTI.hasBranchDivergence(BB->getParent()))
+ return false;
+
int Size = 0;
EphemeralValueTracker EphTracker;
@@ -3301,10 +3306,9 @@ static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From,
/// If we have a conditional branch on something for which we know the constant
/// value in predecessors (e.g. a phi node in the current block), thread edges
/// from the predecessor to their ultimate destination.
-static std::optional<bool>
-FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static std::optional<bool> FoldCondBranchOnValueKnownInPredecessorImpl(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues;
BasicBlock *BB = BI->getParent();
Value *Cond = BI->getCondition();
@@ -3332,7 +3336,7 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
// Now we know that this block has multiple preds and two succs.
// Check that the block is small enough and values defined in the block are
// not used outside of it.
- if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ if (!BlockIsSimpleEnoughToThreadThrough(BB, TTI))
return false;
for (const auto &Pair : KnownValues) {
@@ -3459,15 +3463,14 @@ FoldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
}
-static bool FoldCondBranchOnValueKnownInPredecessor(BranchInst *BI,
- DomTreeUpdater *DTU,
- const DataLayout &DL,
- AssumptionCache *AC) {
+static bool FoldCondBranchOnValueKnownInPredecessor(
+ BranchInst *BI, DomTreeUpdater *DTU, const DataLayout &DL,
+ const TargetTransformInfo &TTI, AssumptionCache *AC) {
std::optional<bool> Result;
bool EverChanged = false;
do {
// Note that None means "we changed things, but recurse further."
- Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC);
+ Result = FoldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, TTI, AC);
EverChanged |= Result == std::nullopt || *Result;
} while (Result == std::nullopt);
return EverChanged;
@@ -7543,7 +7546,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// If this is a branch on something for which we know the constant value in
// predecessors (e.g. a phi node in the current block), thread control
// through this block.
- if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC))
+ if (FoldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, TTI, Options.AC))
return requestResimplify();
// Scan predecessor blocks for conditional branches.
diff --git a/llvm/test/Transforms/SimplifyCFG/AMDGPU/convergent.ll b/llvm/test/Transforms/SimplifyCFG/AMDGPU/convergent.ll
new file mode 100644
index 0000000000000..b1262e294c6d0
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/AMDGPU/convergent.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn -S -passes=simplifycfg < %s | FileCheck %s
+
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
+
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: define i32 @test_01a(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A]], 0
+; CHECK-NEXT: br i1 [[COND]], label %[[MERGE:.*]], label %[[IF_FALSE:.*]]
+; CHECK: [[IF_FALSE]]:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: br label %[[MERGE]]
+; CHECK: [[MERGE]]:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[IF_FALSE_2:.*]]
+; CHECK: [[IF_FALSE_2]]:
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/convergent.ll b/llvm/test/Transforms/SimplifyCFG/convergent.ll
index 6ba51e06460c2..d148063589de6 100644
--- a/llvm/test/Transforms/SimplifyCFG/convergent.ll
+++ b/llvm/test/Transforms/SimplifyCFG/convergent.ll
@@ -4,6 +4,9 @@
; RUN: opt -S -passes='simplifycfg<hoist-common-insts;sink-common-insts>' < %s | FileCheck -check-prefixes=CHECK,SINK %s
declare void @foo() convergent
+declare void @bar1()
+declare void @bar2()
+declare void @bar3()
declare i32 @tid()
declare i32 @mbcnt(i32 %a, i32 %b) convergent
declare i32 @bpermute(i32 %a, i32 %b) convergent
@@ -45,6 +48,42 @@ exit:
ret i32 %a
}
+define i32 @test_01a(i32 %a) {
+; CHECK-LABEL: @test_01a(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_CRITEDGE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @bar1()
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: call void @bar3()
+; CHECK-NEXT: br label [[EXIT:%.*]]
+; CHECK: exit.critedge:
+; CHECK-NEXT: call void @bar2()
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[A]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %merge, label %if.false
+
+if.false:
+ call void @bar1()
+ br label %merge
+
+merge:
+ call void @bar2()
+ br i1 %cond, label %exit, label %if.false.2
+
+if.false.2:
+ call void @bar3()
+ br label %exit
+
+exit:
+ ret i32 %a
+}
+
define void @test_02(ptr %y.coerce) convergent {
; NOSINK-LABEL: @test_02(
; NOSINK-NEXT: entry:
|
Maybe all of this can be moved to Transforms/Scalar/JumpThreading.cpp ? simplifycfg is doing too many things which could be better suited as separate passes. |
Yeah, SimplifyCFG has few limited duplications to that dedicated pass. But it should be addressed in another dedicated PR as it may have measurable impacts compared to this one. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought JumpThreading did this, why is SImplifyCFG here?
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB, | ||
const TargetTransformInfo &TTI) { | ||
// Skip threading if the branch may be divergent. | ||
if (TTI.hasBranchDivergence(BB->getParent())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ideally this should be context dependent on uniformity
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just another casualty of not being able to update UA! :(
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, if we could query UA for that branch, we could selectively enable this threading if profitable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just another casualty of not being able to update UA! :(
Is it possible to update UA on the demand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No it's not. But intuitively it seems that in the case of jump threading, it should be okay to not update UA.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The TTI check has nothing to do with the specific block BB
. A more natural check for this seems to be the outer function FoldCondBranchOnValueKnownInPredecessor()
, where it can be an early return even before entering the do-while
loop.
Compared to jump-threading, SimplifyCFG also does limited threading. For example, |
Created using spr 1.3.4
I think rather than make a small change to SimplifyCFG, it's better to remove this entirely. JumpThreading.cpp already does the right thing with divergent branches, and any future improvements will happen there. This part of SimplifyCFG actually predates JumpThreading.cpp by three years! (2005 v/s 2008). |
The usual answer to "why does SimplifyCFG perform this optimization that another pass does more comprehensively" is "phase ordering". There are a lot of SimplifyCFG runs in the optimization pipeline, while we cannot afford to run dedicated passes (with expensive dependencies like LVI) so many times. It's possible that this logic in SimplifyCFG is indeed no longer necessary, and you're welcome to give removing it a try, but I think the most likely outcome is that dropping it does cause regressions... |
Not only does phase order matter but |
Sounds good to me, but no immediate thoughts on how to go about doing it. |
Maybe jump threading can have flags to do 'lightweight' threading (that subsumes what SimplifyCFG does with a simple cost model)? This way we have the code in one place. |
If there is no objection, I will land this change. |
That looks to me quite practice. |
This may have broken the AMDGPU hip-clang buildbot: https://lab.llvm.org/buildbot/#/builders/123/builds/2677 |
The change was merged without addressing my comment about where the actual check for TTI should be. |
Not only that, it was merged without any approval... |
in addition to breaks HIP buildbot, unanswered review comments, merged without approval i am seeing downstream aborts in gpu code due to this patch. plz revert it. |
This patch skips the threading on known values if the target has
divergent branch.
So far, threading on known values is skipped when the basic block has
covergent calls. However, even without convergent calls, if that
condition is divergent, threading duplicates the execution of that
block threaded and hence results in lower performance. E.g.,
after threading,
After threading, work3 is executed twice if 'cond' is a divergent one.