Skip to content

Commit 8446e05

Browse files
committed
[CodeGenPrepare] Unfold slow ctpop when used in power-of-two test
The DAG combiner already performs this transformation, but in some cases it never gets the chance because either CodeGenPrepare or SelectionDAGBuilder moves the icmp to a different basic block.
1 parent 1106bff commit 8446e05

File tree

5 files changed

+60
-70
lines changed

5 files changed

+60
-70
lines changed

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,7 @@ class CodeGenPrepare {
473473
bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
474474
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
475475
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
476+
bool unfoldPow2Test(CmpInst *Cmp);
476477
void verifyBFIUpdates(Function &F);
477478
bool _run(Function &F);
478479
};
@@ -1757,6 +1758,40 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
17571758
return true;
17581759
}
17591760

1761+
// Decanonicalizes an icmp+ctpop power-of-two test into the `x & (x - 1)` form if ctpop is slow (scalar x only, target does not report PSK_FastHardware for x's bit width). Returns true if Cmp was replaced.
1762+
bool CodeGenPrepare::unfoldPow2Test(CmpInst *Cmp) {
1763+
ICmpInst::Predicate Pred;
1764+
Value *X;
1765+
uint64_t C;
1766+
1767+
if (!match(Cmp, m_ICmp(Pred, m_Intrinsic<Intrinsic::ctpop>(m_Value(X)),
1768+
m_ConstantInt(C))))
1769+
return false; // Not of the form `icmp Pred (ctpop X), C`.
1770+
1771+
Type *Ty = X->getType();
1772+
if (Ty->isVectorTy() || TTI->getPopcntSupport(Ty->getIntegerBitWidth()) ==
1773+
TargetTransformInfo::PSK_FastHardware)
1774+
return false; // Vector operands are not handled; fast hardware ctpop is left as-is.
1775+
1776+
// (ctpop x) u< 2 -> (x & (x - 1)) == 0
1777+
// (ctpop x) u> 1 -> (x & (x - 1)) != 0
1778+
if ((Pred == CmpInst::ICMP_ULT && C == 2) ||
1779+
(Pred == CmpInst::ICMP_UGT && C == 1)) {
1780+
IRBuilder<> Builder(Cmp);
1781+
Value *Sub = Builder.CreateAdd(X, Constant::getAllOnesValue(Ty)); // X + (-1), i.e. X - 1.
1782+
Value *And = Builder.CreateAnd(X, Sub);
1783+
CmpInst::Predicate NewPred =
1784+
Pred == CmpInst::ICMP_ULT ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE;
1785+
Value *NewCmp =
1786+
Builder.CreateICmp(NewPred, And, ConstantInt::getNullValue(Ty));
1787+
Cmp->replaceAllUsesWith(NewCmp);
1788+
RecursivelyDeleteTriviallyDeadInstructions(Cmp);
1789+
return true;
1790+
}
1791+
1792+
return false; // Any other predicate/constant combination is left untouched.
1793+
}
1794+
17601795
/// Sink the given CmpInst into user blocks to reduce the number of virtual
17611796
/// registers that must be created and coalesced. This is a clear win except on
17621797
/// targets with multiple condition code registers (PowerPC), where it might
@@ -1984,6 +2019,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
19842019
if (combineToUSubWithOverflow(Cmp, ModifiedDT))
19852020
return true;
19862021

2022+
if (unfoldPow2Test(Cmp))
2023+
return true;
2024+
19872025
if (foldICmpWithDominatingICmp(Cmp, *TLI))
19882026
return true;
19892027

llvm/test/CodeGen/RISCV/pr101786.ll

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,9 @@ define i64 @test(i64 %x, ptr %p) {
88
; CHECK-NEXT: li a0, 0
99
; CHECK-NEXT: bgtz a2, .LBB0_3
1010
; CHECK-NEXT: # %bb.1: # %entry
11-
; CHECK-NEXT: srli a3, a2, 1
12-
; CHECK-NEXT: lui a4, 349525
13-
; CHECK-NEXT: addiw a4, a4, 1365
14-
; CHECK-NEXT: slli a5, a4, 32
15-
; CHECK-NEXT: add a4, a4, a5
16-
; CHECK-NEXT: and a3, a3, a4
17-
; CHECK-NEXT: sub a2, a2, a3
18-
; CHECK-NEXT: lui a3, 209715
19-
; CHECK-NEXT: addiw a3, a3, 819
20-
; CHECK-NEXT: slli a4, a3, 32
21-
; CHECK-NEXT: add a3, a3, a4
22-
; CHECK-NEXT: and a4, a2, a3
23-
; CHECK-NEXT: srli a2, a2, 2
11+
; CHECK-NEXT: addi a3, a2, -1
2412
; CHECK-NEXT: and a2, a2, a3
25-
; CHECK-NEXT: add a2, a4, a2
26-
; CHECK-NEXT: srli a3, a2, 4
27-
; CHECK-NEXT: add a2, a2, a3
28-
; CHECK-NEXT: lui a3, 61681
29-
; CHECK-NEXT: addiw a3, a3, -241
30-
; CHECK-NEXT: slli a4, a3, 32
31-
; CHECK-NEXT: add a3, a3, a4
32-
; CHECK-NEXT: and a2, a2, a3
33-
; CHECK-NEXT: slli a3, a2, 8
34-
; CHECK-NEXT: add a2, a2, a3
35-
; CHECK-NEXT: slli a3, a2, 16
36-
; CHECK-NEXT: add a2, a2, a3
37-
; CHECK-NEXT: slli a3, a2, 32
38-
; CHECK-NEXT: add a2, a2, a3
39-
; CHECK-NEXT: srli a2, a2, 56
40-
; CHECK-NEXT: li a3, 1
41-
; CHECK-NEXT: bltu a3, a2, .LBB0_3
13+
; CHECK-NEXT: bnez a2, .LBB0_3
4214
; CHECK-NEXT: # %bb.2: # %if.else
4315
; CHECK-NEXT: ld a0, 0(a1)
4416
; CHECK-NEXT: .LBB0_3: # %if.end

llvm/test/CodeGen/RISCV/rv32zbb.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -571,12 +571,12 @@ define i64 @ctpop_i64(i64 %a) nounwind {
571571
define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
572572
; RV32I-LABEL: ctpop_i64_ugt_two:
573573
; RV32I: # %bb.0:
574-
; RV32I-NEXT: addi a2, a0, -1
575-
; RV32I-NEXT: and a2, a0, a2
576-
; RV32I-NEXT: seqz a0, a0
577-
; RV32I-NEXT: sub a0, a1, a0
578-
; RV32I-NEXT: and a0, a1, a0
579-
; RV32I-NEXT: or a0, a2, a0
574+
; RV32I-NEXT: seqz a2, a0
575+
; RV32I-NEXT: sub a2, a1, a2
576+
; RV32I-NEXT: addi a3, a0, -1
577+
; RV32I-NEXT: and a0, a0, a3
578+
; RV32I-NEXT: and a1, a1, a2
579+
; RV32I-NEXT: or a0, a0, a1
580580
; RV32I-NEXT: seqz a0, a0
581581
; RV32I-NEXT: ret
582582
;
@@ -595,12 +595,12 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
595595
define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
596596
; RV32I-LABEL: ctpop_i64_ugt_one:
597597
; RV32I: # %bb.0:
598-
; RV32I-NEXT: addi a2, a0, -1
599-
; RV32I-NEXT: and a2, a0, a2
600-
; RV32I-NEXT: seqz a0, a0
601-
; RV32I-NEXT: sub a0, a1, a0
602-
; RV32I-NEXT: and a0, a1, a0
603-
; RV32I-NEXT: or a0, a2, a0
598+
; RV32I-NEXT: seqz a2, a0
599+
; RV32I-NEXT: sub a2, a1, a2
600+
; RV32I-NEXT: addi a3, a0, -1
601+
; RV32I-NEXT: and a0, a0, a3
602+
; RV32I-NEXT: and a1, a1, a2
603+
; RV32I-NEXT: or a0, a0, a1
604604
; RV32I-NEXT: snez a0, a0
605605
; RV32I-NEXT: ret
606606
;

llvm/test/CodeGen/X86/pr94829.ll

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,8 @@
44
define ptr @test(i64 %x) {
55
; CHECK-LABEL: test:
66
; CHECK: # %bb.0: # %entry
7-
; CHECK-NEXT: movq %rdi, %rax
8-
; CHECK-NEXT: shrq %rax
9-
; CHECK-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
10-
; CHECK-NEXT: andq %rax, %rcx
11-
; CHECK-NEXT: subq %rcx, %rdi
12-
; CHECK-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
13-
; CHECK-NEXT: movq %rdi, %rcx
14-
; CHECK-NEXT: andq %rax, %rcx
15-
; CHECK-NEXT: shrq $2, %rdi
16-
; CHECK-NEXT: andq %rax, %rdi
17-
; CHECK-NEXT: addq %rcx, %rdi
18-
; CHECK-NEXT: movq %rdi, %rax
19-
; CHECK-NEXT: shrq $4, %rax
20-
; CHECK-NEXT: addq %rdi, %rax
21-
; CHECK-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
22-
; CHECK-NEXT: andq %rax, %rcx
23-
; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
24-
; CHECK-NEXT: imulq %rcx, %rax
25-
; CHECK-NEXT: shrq $56, %rax
26-
; CHECK-NEXT: cmpq $2, %rax
27-
; CHECK-NEXT: jb .LBB0_2
28-
; CHECK-NEXT: # %bb.1: # %if.else
29-
; CHECK-NEXT: cmpl $2, %eax
30-
; CHECK-NEXT: .LBB0_2: # %exit1
7+
; CHECK-NEXT: leaq -1(%rdi), %rax
8+
; CHECK-NEXT: testq %rax, %rdi
319
; CHECK-NEXT: xorl %eax, %eax
3210
; CHECK-NEXT: retq
3311
entry:

llvm/test/Transforms/CodeGenPrepare/unfold-pow2-test.ll

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@ define i64 @test_ult_2(i64 %x, i64 %y, i64 %a, i64 %b) {
1212
; SLOW-LABEL: define i64 @test_ult_2(
1313
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
1414
; SLOW-NEXT: [[ENTRY:.*]]:
15-
; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
16-
; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
15+
; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
16+
; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
17+
; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
1718
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
1819
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
1920
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
@@ -56,8 +57,9 @@ define i64 @test_ugt_1(i64 %x, i64 %y, i64 %a, i64 %b) {
5657
; SLOW-LABEL: define i64 @test_ugt_1(
5758
; SLOW-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], i64 [[A:%.*]], i64 [[B:%.*]]) {
5859
; SLOW-NEXT: [[ENTRY:.*]]:
59-
; SLOW-NEXT: [[CTPOP:%.*]] = call i64 @llvm.ctpop.i64(i64 [[X]])
60-
; SLOW-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[CTPOP]], 1
60+
; SLOW-NEXT: [[TMP0:%.*]] = add i64 [[X]], -1
61+
; SLOW-NEXT: [[TMP1:%.*]] = and i64 [[X]], [[TMP0]]
62+
; SLOW-NEXT: [[CMP1:%.*]] = icmp ne i64 [[TMP1]], 0
6163
; SLOW-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[Y]], 0
6264
; SLOW-NEXT: [[CMP:%.*]] = or i1 [[CMP2]], [[CMP1]]
6365
; SLOW-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]

0 commit comments

Comments
 (0)