
Commit da754b5

[CGP] despeculate expensive cttz/ctlz intrinsics
This is another step towards allowing SimplifyCFG to speculate harder, but then
have CGP clean things up if the target doesn't like it.

Previous patches in this series:
http://reviews.llvm.org/D12882
http://reviews.llvm.org/D13297

D13297 should catch most expensive ops, but speculation of cttz/ctlz requires
special handling because of weirdness in the intrinsic definition for handling
a zero input (that definition can probably be blamed on x86).

For example, if we have the usual speculated-by-select expensive op pattern
like this:

    %tobool = icmp eq i64 %A, 0
    %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)   ; is_zero_undef == true
    %cond = select i1 %tobool, i64 64, i64 %0
    ret i64 %cond

There's an instcombine that will turn it into:

    %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 false)  ; is_zero_undef == false

This CGP patch is looking for that case and despeculating it back into:

  entry:
    %tobool = icmp eq i64 %A, 0
    br i1 %tobool, label %cond.end, label %cond.true

  cond.true:
    %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true)   ; is_zero_undef == true
    br label %cond.end

  cond.end:
    %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ]
    ret i64 %cond

This unfortunately may lead to poorer codegen (see the changes in the existing
x86 test), but if we increase speculation in SimplifyCFG (the next step in this
patch series), then we should avoid those kinds of cases in the first place.

The need for this patch was originally mentioned here:
http://reviews.llvm.org/D7506
with follow-up here:
http://reviews.llvm.org/D7554

Differential Revision: http://reviews.llvm.org/D14630

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253573 91177308-0d34-0410-b5e6-96231b3b80d8
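As an illustrative aside (not part of the commit): C source of roughly the
following shape is a typical origin of the speculated-by-select pattern above.
The function name is made up, and the exact IR depends on the frontend.

    // __builtin_ctzll has an undefined result for a zero input, so portable
    // code guards the call; a frontend typically lowers this guard to the
    // icmp + cttz(is_zero_undef=true) + select pattern shown above, which
    // instcombine then folds to a single cttz(is_zero_undef=false).
    unsigned long long safe_cttz(unsigned long long A) {
      return A == 0 ? 64 : __builtin_ctzll(A);
    }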
1 parent: 086b179

File tree: 2 files changed (+121, -18 lines)


lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 84 additions & 0 deletions
@@ -1606,6 +1606,85 @@ static void ScalarizeMaskedScatter(CallInst *CI) {
   CI->eraseFromParent();
 }
 
+/// If counting leading or trailing zeros is an expensive operation and a zero
+/// input is defined, add a check for zero to avoid calling the intrinsic.
+///
+/// We want to transform:
+///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+///
+/// into:
+///   entry:
+///     %cmpz = icmp eq i64 %A, 0
+///     br i1 %cmpz, label %cond.end, label %cond.false
+///   cond.false:
+///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
+///     br label %cond.end
+///   cond.end:
+///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
+///
+/// If the transform is performed, return true and set ModifiedDT to true.
+static bool despeculateCountZeros(IntrinsicInst *CountZeros,
+                                  const TargetLowering *TLI,
+                                  const DataLayout *DL,
+                                  bool &ModifiedDT) {
+  if (!TLI || !DL)
+    return false;
+
+  // If a zero input is undefined, it doesn't make sense to despeculate that.
+  if (match(CountZeros->getOperand(1), m_One()))
+    return false;
+
+  // If it's cheap to speculate, there's nothing to do.
+  auto IntrinsicID = CountZeros->getIntrinsicID();
+  if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
+      (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
+    return false;
+
+  // Only handle legal scalar cases. Anything else requires too much work.
+  Type *Ty = CountZeros->getType();
+  unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
+  if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSize())
+    return false;
+
+  // The intrinsic will be sunk behind a compare against zero and branch.
+  BasicBlock *StartBlock = CountZeros->getParent();
+  BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
+
+  // Create another block after the count zero intrinsic. A PHI will be added
+  // in this block to select the result of the intrinsic or the bit-width
+  // constant if the input to the intrinsic is zero.
+  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
+  BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
+
+  // Set up a builder to create a compare, conditional branch, and PHI.
+  IRBuilder<> Builder(CountZeros->getContext());
+  Builder.SetInsertPoint(StartBlock->getTerminator());
+  Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
+
+  // Replace the unconditional branch that was created by the first split with
+  // a compare against zero and a conditional branch.
+  Value *Zero = Constant::getNullValue(Ty);
+  Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
+  Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
+  StartBlock->getTerminator()->eraseFromParent();
+
+  // Create a PHI in the end block to select either the output of the intrinsic
+  // or the bit width of the operand.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
+  CountZeros->replaceAllUsesWith(PN);
+  Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
+  PN->addIncoming(BitWidth, StartBlock);
+  PN->addIncoming(CountZeros, CallBlock);
+
+  // We are explicitly handling the zero case, so we can set the intrinsic's
+  // undefined zero argument to 'true'. This will also prevent reprocessing the
+  // intrinsic; we only despeculate when a zero input is defined.
+  CountZeros->setArgOperand(1, Builder.getTrue());
+  ModifiedDT = true;
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -1746,6 +1825,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
       II->replaceAllUsesWith(II->getArgOperand(0));
       II->eraseFromParent();
       return true;
+
+    case Intrinsic::cttz:
+    case Intrinsic::ctlz:
+      // If counting zeros is expensive, try to avoid it.
+      return despeculateCountZeros(II, TLI, DL, ModifiedDT);
     }
 
     if (TLI) {
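As an illustrative aside (not part of this commit): whether despeculateCountZeros
fires is governed entirely by the two TargetLowering hooks queried above. A
target whose cttz/ctlz lower to cheap branchless instructions could opt out of
the transform by overriding them; a minimal sketch with a hypothetical target
class follows.

    #include "llvm/Target/TargetLowering.h"
    using namespace llvm;

    // Hypothetical target (illustrative only): CTTZ/CTLZ lower to single
    // cheap instructions here, so speculating them is fine and CGP will
    // leave the select form produced by SimplifyCFG/instcombine alone.
    class MyTargetLowering : public TargetLowering {
    public:
      explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {}
      bool isCheapToSpeculateCttz() const override { return true; }
      bool isCheapToSpeculateCtlz() const override { return true; }
    };

On targets that leave these hooks at their default of false (such as x86 here,
where bsr/bsf are slow), despeculateCountZeros inserts the zero test and branch.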

test/CodeGen/X86/clz.ll

Lines changed: 37 additions & 18 deletions
@@ -87,55 +87,74 @@ define i64 @ctlz_i64(i64 %x) {
   ret i64 %tmp
 }
 
-define i32 @ctlz_i32_cmov(i32 %n) {
-; CHECK-LABEL: ctlz_i32_cmov:
+define i32 @ctlz_i32_zero_test(i32 %n) {
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+
+; CHECK-LABEL: ctlz_i32_zero_test:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bsrl %edi, %ecx
-; CHECK-NEXT:    movl $63, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB8_2
+; CHECK-NEXT:  # BB#1: # %cond.false
+; CHECK-NEXT:    bsrl %edi, %eax
 ; CHECK-NEXT:    xorl $31, %eax
+; CHECK-NEXT:  .LBB8_2: # %cond.end
 ; CHECK-NEXT:    retq
-; Generate a cmov to handle zero inputs when necessary.
   %tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
   ret i32 %tmp1
 }
 
 define i32 @ctlz_i32_fold_cmov(i32 %n) {
+; Don't generate the cmovne when the source is known non-zero (and bsr would
+; not set ZF).
+; rdar://9490949
+; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
+; codegen doesn't know how to delete the movl and je.
+
 ; CHECK-LABEL: ctlz_i32_fold_cmov:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    orl $1, %edi
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    je .LBB9_2
+; CHECK-NEXT:  # BB#1: # %cond.false
 ; CHECK-NEXT:    bsrl %edi, %eax
 ; CHECK-NEXT:    xorl $31, %eax
+; CHECK-NEXT:  .LBB9_2: # %cond.end
 ; CHECK-NEXT:    retq
-; Don't generate the cmovne when the source is known non-zero (and bsr would
-; not set ZF).
-; rdar://9490949
   %or = or i32 %n, 1
   %tmp1 = call i32 @llvm.ctlz.i32(i32 %or, i1 false)
   ret i32 %tmp1
 }
 
 define i32 @ctlz_bsr(i32 %n) {
+; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
+; the most significant bit, which is what 'bsr' does natively.
+
 ; CHECK-LABEL: ctlz_bsr:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    bsrl %edi, %eax
 ; CHECK-NEXT:    retq
-; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
-; the most significant bit, which is what 'bsr' does natively.
   %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 true)
   %bsr = xor i32 %ctlz, 31
   ret i32 %bsr
 }
 
-define i32 @ctlz_bsr_cmov(i32 %n) {
-; CHECK-LABEL: ctlz_bsr_cmov:
+define i32 @ctlz_bsr_zero_test(i32 %n) {
+; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
+; FIXME: The compare and branch are produced late in IR (by CodeGenPrepare), and
+; codegen doesn't know how to combine the $32 and $31 into $63.
+
+; CHECK-LABEL: ctlz_bsr_zero_test:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    bsrl %edi, %ecx
-; CHECK-NEXT:    movl $63, %eax
-; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    movl $32, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB11_2
+; CHECK-NEXT:  # BB#1: # %cond.false
+; CHECK-NEXT:    bsrl %edi, %eax
+; CHECK-NEXT:    xorl $31, %eax
+; CHECK-NEXT:  .LBB11_2: # %cond.end
+; CHECK-NEXT:    xorl $31, %eax
 ; CHECK-NEXT:    retq
-; Same as ctlz_bsr, but ensure this happens even when there is a potential
-; zero.
   %ctlz = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
   %bsr = xor i32 %ctlz, 31
   ret i32 %bsr
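As a usage note (an assumption about the file; its RUN line is outside this
hunk): the CHECK lines above are verified by running the test through llc and
piping the assembly to FileCheck, along the lines of:

    ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s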
