Skip to content

Commit 9673936

Browse files
dtcxzyw authored and lravenclaw committed
[SimplifyCFG] Simplify nested branches (llvm#97067)
This patch folds the following pattern (I don't know what to call this):

```
bb0:
  br i1 %cond1, label %bb1, label %bb2
bb1:
  br i1 %cond2, label %bb3, label %bb4
bb2:
  br i1 %cond2, label %bb4, label %bb3
bb3:
  ...
bb4:
  ...
```

into

```
bb0:
  %cond = xor i1 %cond1, %cond2
  br i1 %cond, label %bb4, label %bb3
bb3:
  ...
bb4:
  ...
```

Alive2: https://alive2.llvm.org/ce/z/5iOJEL

Closes llvm#97022. Closes llvm#83417.

I found this pattern in some Verilator-generated code, which is widely used in RTL simulation. This fold reduces the number of branches and improves the performance of the CPU frontend. To my surprise, this pattern is also common in C/C++ codebases.

Affected libraries/applications: cmake/cvc5/freetype/git/gromacs/jq/linux/openblas/openmpi/openssl/php/postgres/ruby/sqlite/wireshark/z3/...
1 parent 81e8867 commit 9673936

File tree

3 files changed

+478
-16
lines changed

3 files changed

+478
-16
lines changed

llvm/lib/Transforms/Utils/SimplifyCFG.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7361,6 +7361,95 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
73617361
return PredPred;
73627362
}
73637363

7364+
/// Fold the following pattern:
7365+
/// bb0:
7366+
/// br i1 %cond1, label %bb1, label %bb2
7367+
/// bb1:
7368+
/// br i1 %cond2, label %bb3, label %bb4
7369+
/// bb2:
7370+
/// br i1 %cond2, label %bb4, label %bb3
7371+
/// bb3:
7372+
/// ...
7373+
/// bb4:
7374+
/// ...
7375+
/// into
7376+
/// bb0:
7377+
/// %cond = xor i1 %cond1, %cond2
7378+
/// br i1 %cond, label %bb4, label %bb3
7379+
/// bb3:
7380+
/// ...
7381+
/// bb4:
7382+
/// ...
7383+
/// NOTE: %cond2 always dominates the terminator of bb0.
7384+
static bool mergeNestedCondBranch(BranchInst *BI, DomTreeUpdater *DTU) {
7385+
BasicBlock *BB = BI->getParent();
7386+
BasicBlock *BB1 = BI->getSuccessor(0);
7387+
BasicBlock *BB2 = BI->getSuccessor(1);
7388+
auto IsSimpleSuccessor = [BB](BasicBlock *Succ, BranchInst *&SuccBI) {
7389+
if (Succ == BB)
7390+
return false;
7391+
if (&Succ->front() != Succ->getTerminator())
7392+
return false;
7393+
SuccBI = dyn_cast<BranchInst>(Succ->getTerminator());
7394+
if (!SuccBI || !SuccBI->isConditional())
7395+
return false;
7396+
BasicBlock *Succ1 = SuccBI->getSuccessor(0);
7397+
BasicBlock *Succ2 = SuccBI->getSuccessor(1);
7398+
return Succ1 != Succ && Succ2 != Succ && Succ1 != BB && Succ2 != BB &&
7399+
!isa<PHINode>(Succ1->front()) && !isa<PHINode>(Succ2->front());
7400+
};
7401+
BranchInst *BB1BI, *BB2BI;
7402+
if (!IsSimpleSuccessor(BB1, BB1BI) || !IsSimpleSuccessor(BB2, BB2BI))
7403+
return false;
7404+
7405+
if (BB1BI->getCondition() != BB2BI->getCondition() ||
7406+
BB1BI->getSuccessor(0) != BB2BI->getSuccessor(1) ||
7407+
BB1BI->getSuccessor(1) != BB2BI->getSuccessor(0))
7408+
return false;
7409+
7410+
BasicBlock *BB3 = BB1BI->getSuccessor(0);
7411+
BasicBlock *BB4 = BB1BI->getSuccessor(1);
7412+
IRBuilder<> Builder(BI);
7413+
BI->setCondition(
7414+
Builder.CreateXor(BI->getCondition(), BB1BI->getCondition()));
7415+
BB1->removePredecessor(BB);
7416+
BI->setSuccessor(0, BB4);
7417+
BB2->removePredecessor(BB);
7418+
BI->setSuccessor(1, BB3);
7419+
if (DTU) {
7420+
SmallVector<DominatorTree::UpdateType, 4> Updates;
7421+
Updates.push_back({DominatorTree::Delete, BB, BB1});
7422+
Updates.push_back({DominatorTree::Insert, BB, BB4});
7423+
Updates.push_back({DominatorTree::Delete, BB, BB2});
7424+
Updates.push_back({DominatorTree::Insert, BB, BB3});
7425+
7426+
DTU->applyUpdates(Updates);
7427+
}
7428+
bool HasWeight = false;
7429+
uint64_t BBTWeight, BBFWeight;
7430+
if (extractBranchWeights(*BI, BBTWeight, BBFWeight))
7431+
HasWeight = true;
7432+
else
7433+
BBTWeight = BBFWeight = 1;
7434+
uint64_t BB1TWeight, BB1FWeight;
7435+
if (extractBranchWeights(*BB1BI, BB1TWeight, BB1FWeight))
7436+
HasWeight = true;
7437+
else
7438+
BB1TWeight = BB1FWeight = 1;
7439+
uint64_t BB2TWeight, BB2FWeight;
7440+
if (extractBranchWeights(*BB2BI, BB2TWeight, BB2FWeight))
7441+
HasWeight = true;
7442+
else
7443+
BB2TWeight = BB2FWeight = 1;
7444+
if (HasWeight) {
7445+
uint64_t Weights[2] = {BBTWeight * BB1FWeight + BBFWeight * BB2TWeight,
7446+
BBTWeight * BB1TWeight + BBFWeight * BB2FWeight};
7447+
FitWeights(Weights);
7448+
setBranchWeights(BI, Weights[0], Weights[1], /*IsExpected=*/false);
7449+
}
7450+
return true;
7451+
}
7452+
73647453
bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
73657454
assert(
73667455
!isa<ConstantInt>(BI->getCondition()) &&
@@ -7468,6 +7557,10 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
74687557
if (mergeConditionalStores(PBI, BI, DTU, DL, TTI))
74697558
return requestResimplify();
74707559

7560+
// Look for nested conditional branches.
7561+
if (mergeNestedCondBranch(BI, DTU))
7562+
return requestResimplify();
7563+
74717564
return false;
74727565
}
74737566

llvm/test/CodeGen/ARM/and-cmp0-sink.ll

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ exit:
186186
}
187187

188188
; Test with a mask that can be encoded with T32 instruction set, but not with A32.
189-
define i32 @f0(i1 %c0, i32 %v) {
189+
define i32 @f0(i1 %c0, i32 %v, ptr %p) {
190190
; V7M-LABEL: f0:
191191
; V7M: @ %bb.0: @ %E
192192
; V7M-NEXT: lsls r0, r0, #31
@@ -198,7 +198,9 @@ define i32 @f0(i1 %c0, i32 %v) {
198198
; V7M-NEXT: bxeq lr
199199
; V7M-NEXT: b .LBB1_3
200200
; V7M-NEXT: .LBB1_2: @ %B
201+
; V7M-NEXT: movs r0, #1
201202
; V7M-NEXT: tst.w r1, #16843009
203+
; V7M-NEXT: str r0, [r2]
202204
; V7M-NEXT: itt ne
203205
; V7M-NEXT: movne r0, #0
204206
; V7M-NEXT: bxne lr
@@ -208,10 +210,10 @@ define i32 @f0(i1 %c0, i32 %v) {
208210
;
209211
; V7A-LABEL: f0:
210212
; V7A: @ %bb.0: @ %E
211-
; V7A-NEXT: movw r2, #257
213+
; V7A-NEXT: movw r3, #257
212214
; V7A-NEXT: tst r0, #1
213-
; V7A-NEXT: movt r2, #257
214-
; V7A-NEXT: and r1, r1, r2
215+
; V7A-NEXT: movt r3, #257
216+
; V7A-NEXT: and r1, r1, r3
215217
; V7A-NEXT: beq .LBB1_3
216218
; V7A-NEXT: @ %bb.1: @ %A
217219
; V7A-NEXT: cmp r1, #0
@@ -221,8 +223,10 @@ define i32 @f0(i1 %c0, i32 %v) {
221223
; V7A-NEXT: mov r0, #1
222224
; V7A-NEXT: bx lr
223225
; V7A-NEXT: .LBB1_3: @ %B
224-
; V7A-NEXT: mov r0, #0
226+
; V7A-NEXT: mov r0, #1
225227
; V7A-NEXT: cmp r1, #0
228+
; V7A-NEXT: str r0, [r2]
229+
; V7A-NEXT: mov r0, #0
226230
; V7A-NEXT: moveq r0, #1
227231
; V7A-NEXT: bx lr
228232
;
@@ -237,7 +241,9 @@ define i32 @f0(i1 %c0, i32 %v) {
237241
; V7A-T-NEXT: bxeq lr
238242
; V7A-T-NEXT: b .LBB1_3
239243
; V7A-T-NEXT: .LBB1_2: @ %B
244+
; V7A-T-NEXT: movs r0, #1
240245
; V7A-T-NEXT: tst.w r1, #16843009
246+
; V7A-T-NEXT: str r0, [r2]
241247
; V7A-T-NEXT: itt ne
242248
; V7A-T-NEXT: movne r0, #0
243249
; V7A-T-NEXT: bxne lr
@@ -247,18 +253,20 @@ define i32 @f0(i1 %c0, i32 %v) {
247253
;
248254
; V6M-LABEL: f0:
249255
; V6M: @ %bb.0: @ %E
250-
; V6M-NEXT: ldr r2, .LCPI1_0
251-
; V6M-NEXT: ands r2, r1
256+
; V6M-NEXT: ldr r3, .LCPI1_0
257+
; V6M-NEXT: ands r3, r1
252258
; V6M-NEXT: lsls r0, r0, #31
253259
; V6M-NEXT: beq .LBB1_3
254260
; V6M-NEXT: @ %bb.1: @ %A
255-
; V6M-NEXT: cmp r2, #0
261+
; V6M-NEXT: cmp r3, #0
256262
; V6M-NEXT: bne .LBB1_5
257263
; V6M-NEXT: @ %bb.2:
258264
; V6M-NEXT: movs r0, #0
259265
; V6M-NEXT: bx lr
260266
; V6M-NEXT: .LBB1_3: @ %B
261-
; V6M-NEXT: cmp r2, #0
267+
; V6M-NEXT: movs r0, #1
268+
; V6M-NEXT: str r0, [r2]
269+
; V6M-NEXT: cmp r3, #0
262270
; V6M-NEXT: beq .LBB1_5
263271
; V6M-NEXT: @ %bb.4:
264272
; V6M-NEXT: movs r0, #0
@@ -280,6 +288,7 @@ A:
280288

281289
B:
282290
%c2 = icmp eq i32 %a, 0
291+
store i32 1, ptr %p, align 4
283292
br i1 %c2, label %D, label %C
284293

285294
C:
@@ -294,7 +303,7 @@ X:
294303
}
295304

296305
; Test with a mask that can be encoded both with T32 and A32 instruction sets.
297-
define i32 @f1(i1 %c0, i32 %v) {
306+
define i32 @f1(i1 %c0, i32 %v, ptr %p) {
298307
; V7M-LABEL: f1:
299308
; V7M: @ %bb.0: @ %E
300309
; V7M-NEXT: lsls r0, r0, #31
@@ -306,7 +315,9 @@ define i32 @f1(i1 %c0, i32 %v) {
306315
; V7M-NEXT: bxeq lr
307316
; V7M-NEXT: b .LBB2_3
308317
; V7M-NEXT: .LBB2_2: @ %B
318+
; V7M-NEXT: movs r0, #1
309319
; V7M-NEXT: tst.w r1, #100663296
320+
; V7M-NEXT: str r0, [r2]
310321
; V7M-NEXT: itt ne
311322
; V7M-NEXT: movne r0, #0
312323
; V7M-NEXT: bxne lr
@@ -326,8 +337,10 @@ define i32 @f1(i1 %c0, i32 %v) {
326337
; V7A-NEXT: mov r0, #1
327338
; V7A-NEXT: bx lr
328339
; V7A-NEXT: .LBB2_3: @ %B
329-
; V7A-NEXT: mov r0, #0
340+
; V7A-NEXT: mov r0, #1
330341
; V7A-NEXT: tst r1, #100663296
342+
; V7A-NEXT: str r0, [r2]
343+
; V7A-NEXT: mov r0, #0
331344
; V7A-NEXT: moveq r0, #1
332345
; V7A-NEXT: bx lr
333346
;
@@ -342,7 +355,9 @@ define i32 @f1(i1 %c0, i32 %v) {
342355
; V7A-T-NEXT: bxeq lr
343356
; V7A-T-NEXT: b .LBB2_3
344357
; V7A-T-NEXT: .LBB2_2: @ %B
358+
; V7A-T-NEXT: movs r0, #1
345359
; V7A-T-NEXT: tst.w r1, #100663296
360+
; V7A-T-NEXT: str r0, [r2]
346361
; V7A-T-NEXT: itt ne
347362
; V7A-T-NEXT: movne r0, #0
348363
; V7A-T-NEXT: bxne lr
@@ -352,19 +367,21 @@ define i32 @f1(i1 %c0, i32 %v) {
352367
;
353368
; V6M-LABEL: f1:
354369
; V6M: @ %bb.0: @ %E
355-
; V6M-NEXT: movs r2, #3
356-
; V6M-NEXT: lsls r2, r2, #25
357-
; V6M-NEXT: ands r2, r1
370+
; V6M-NEXT: movs r3, #3
371+
; V6M-NEXT: lsls r3, r3, #25
372+
; V6M-NEXT: ands r3, r1
358373
; V6M-NEXT: lsls r0, r0, #31
359374
; V6M-NEXT: beq .LBB2_3
360375
; V6M-NEXT: @ %bb.1: @ %A
361-
; V6M-NEXT: cmp r2, #0
376+
; V6M-NEXT: cmp r3, #0
362377
; V6M-NEXT: bne .LBB2_5
363378
; V6M-NEXT: @ %bb.2:
364379
; V6M-NEXT: movs r0, #0
365380
; V6M-NEXT: bx lr
366381
; V6M-NEXT: .LBB2_3: @ %B
367-
; V6M-NEXT: cmp r2, #0
382+
; V6M-NEXT: movs r0, #1
383+
; V6M-NEXT: str r0, [r2]
384+
; V6M-NEXT: cmp r3, #0
368385
; V6M-NEXT: beq .LBB2_5
369386
; V6M-NEXT: @ %bb.4:
370387
; V6M-NEXT: movs r0, #0
@@ -382,6 +399,7 @@ A:
382399

383400
B:
384401
%c2 = icmp eq i32 %a, 0
402+
store i32 1, ptr %p, align 4
385403
br i1 %c2, label %D, label %C
386404

387405
C:

0 commit comments

Comments
 (0)