Skip to content

Commit 5e8900a

Browse files
author
Manish Kausik H
committed
[LegalizeDAG] Optimize CodeGen for ISD::CTLZ_ZERO_UNDEF
Previously, the same instructions were generated for `ISD::CTLZ` and `ISD::CTLZ_ZERO_UNDEF`, which did not take advantage of the fact that zero is an invalid input for `ISD::CTLZ_ZERO_UNDEF`. This commit separates codegen for the two cases to enable the optimization for the latter case. The details of the optimization are outlined in #82075. Fixes #82075.
1 parent c0febca commit 5e8900a

File tree

6 files changed

+157
-99
lines changed

6 files changed

+157
-99
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4961,7 +4961,6 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
49614961
case ISD::CTTZ:
49624962
case ISD::CTTZ_ZERO_UNDEF:
49634963
case ISD::CTLZ:
4964-
case ISD::CTLZ_ZERO_UNDEF:
49654964
case ISD::CTPOP:
49664965
// Zero extend the argument unless its cttz, then use any_extend.
49674966
if (Node->getOpcode() == ISD::CTTZ ||
@@ -4982,15 +4981,30 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
49824981
// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
49834982
// already the correct result.
49844983
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
4985-
if (Node->getOpcode() == ISD::CTLZ ||
4986-
Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
4984+
if (Node->getOpcode() == ISD::CTLZ) {
49874985
// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
49884986
Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
49894987
DAG.getConstant(NVT.getSizeInBits() -
49904988
OVT.getSizeInBits(), dl, NVT));
49914989
}
49924990
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
49934991
break;
4992+
case ISD::CTLZ_ZERO_UNDEF:
4993+
// We know that the argument is unlikely to be zero, hence we can take a
4994+
// different approach as compared to ISD::CTLZ
4995+
4996+
// Any Extend the argument
4997+
Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
4998+
4999+
// Tmp1 = Tmp1 << (sizeinbits(NVT) - sizeinbits(Old VT))
5000+
Tmp1 = DAG.getNode(
5001+
ISD::SHL, dl, NVT, Tmp1,
5002+
DAG.getConstant(NVT.getSizeInBits() - OVT.getSizeInBits(), dl, NVT));
5003+
5004+
// Perform the larger operation
5005+
Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
5006+
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
5007+
break;
49945008
case ISD::BITREVERSE:
49955009
case ISD::BSWAP: {
49965010
unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -642,21 +642,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
642642
}
643643
}
644644

645-
// Zero extend to the promoted type and do the count there.
646-
SDValue Op = ZExtPromotedInteger(N->getOperand(0));
645+
unsigned CtlzOpcode = N->getOpcode();
646+
if (CtlzOpcode == ISD::CTLZ) {
647+
// Zero extend to the promoted type and do the count there.
648+
SDValue Op = ZExtPromotedInteger(N->getOperand(0));
649+
650+
// Subtract off the extra leading bits in the bigger type.
651+
SDValue ExtractLeadingBits = DAG.getConstant(
652+
NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
653+
if (!N->isVPOpcode())
654+
return DAG.getNode(ISD::SUB, dl, NVT,
655+
DAG.getNode(N->getOpcode(), dl, NVT, Op),
656+
ExtractLeadingBits);
657+
SDValue Mask = N->getOperand(1);
658+
SDValue EVL = N->getOperand(2);
659+
return DAG.getNode(ISD::VP_SUB, dl, NVT,
660+
DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
661+
ExtractLeadingBits, Mask, EVL);
662+
} else if (CtlzOpcode == ISD::CTLZ_ZERO_UNDEF) {
663+
// Any Extend the argument
664+
SDValue Op = GetPromotedInteger(N->getOperand(0));
665+
666+
// Op = Op << (sizeinbits(NVT) - sizeinbits(Old VT))
667+
unsigned SHLAmount = NVT.getSizeInBits() - OVT.getSizeInBits();
668+
auto ShiftConst =
669+
DAG.getShiftAmountConstant(SHLAmount, Op.getValueType(), dl);
670+
if (!N->isVPOpcode()) {
671+
Op = DAG.getNode(ISD::SHL, dl, NVT, Op, ShiftConst);
672+
return DAG.getNode(CtlzOpcode, dl, NVT, Op);
673+
}
647674

648-
// Subtract off the extra leading bits in the bigger type.
649-
SDValue ExtractLeadingBits = DAG.getConstant(
650-
NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
651-
if (!N->isVPOpcode())
652-
return DAG.getNode(ISD::SUB, dl, NVT,
653-
DAG.getNode(N->getOpcode(), dl, NVT, Op),
654-
ExtractLeadingBits);
655-
SDValue Mask = N->getOperand(1);
656-
SDValue EVL = N->getOperand(2);
657-
return DAG.getNode(ISD::VP_SUB, dl, NVT,
658-
DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
659-
ExtractLeadingBits, Mask, EVL);
675+
SDValue Mask = N->getOperand(1);
676+
SDValue EVL = N->getOperand(2);
677+
Op = DAG.getNode(ISD::VP_SHL, dl, NVT, Op, ShiftConst, Mask, EVL);
678+
return DAG.getNode(CtlzOpcode, dl, NVT, Op, Mask, EVL);
679+
} else {
680+
llvm_unreachable("Invalid CTLZ Opcode");
681+
}
660682
}
661683

662684
SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc < %s --mtriple=aarch64 | FileCheck %s
3+
4+
declare i8 @llvm.ctlz.i8(i8, i1 immarg)
5+
6+
define i32 @clz_nzu8(ptr %num) {
7+
; CHECK-LABEL: clz_nzu8:
8+
; CHECK: // %bb.0: // %start
9+
; CHECK-NEXT: ldrb w8, [x0]
10+
; CHECK-NEXT: lsl w8, w8, #24
11+
; CHECK-NEXT: clz w0, w8
12+
; CHECK-NEXT: ret
13+
start:
14+
%self = load i8, ptr %num, align 1
15+
%0 = call i8 @llvm.ctlz.i8(i8 %self, i1 true)
16+
%_0 = zext i8 %0 to i32
17+
ret i32 %_0
18+
}

llvm/test/CodeGen/X86/clz.ll

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86,X86-NOCMOV
33
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov | FileCheck %s --check-prefixes=X86,X86-CMOV
44
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
@@ -225,33 +225,31 @@ define i8 @ctlz_i8(i8 %x) {
225225
;
226226
; X86-CLZ-LABEL: ctlz_i8:
227227
; X86-CLZ: # %bb.0:
228-
; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
228+
; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
229+
; X86-CLZ-NEXT: shll $24, %eax
229230
; X86-CLZ-NEXT: lzcntl %eax, %eax
230-
; X86-CLZ-NEXT: addl $-24, %eax
231231
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
232232
; X86-CLZ-NEXT: retl
233233
;
234234
; X64-CLZ-LABEL: ctlz_i8:
235235
; X64-CLZ: # %bb.0:
236-
; X64-CLZ-NEXT: movzbl %dil, %eax
237-
; X64-CLZ-NEXT: lzcntl %eax, %eax
238-
; X64-CLZ-NEXT: addl $-24, %eax
236+
; X64-CLZ-NEXT: shll $24, %edi
237+
; X64-CLZ-NEXT: lzcntl %edi, %eax
239238
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
240239
; X64-CLZ-NEXT: retq
241240
;
242241
; X64-FASTLZCNT-LABEL: ctlz_i8:
243242
; X64-FASTLZCNT: # %bb.0:
244-
; X64-FASTLZCNT-NEXT: movzbl %dil, %eax
245-
; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax
246-
; X64-FASTLZCNT-NEXT: addl $-24, %eax
243+
; X64-FASTLZCNT-NEXT: shll $24, %edi
244+
; X64-FASTLZCNT-NEXT: lzcntl %edi, %eax
247245
; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
248246
; X64-FASTLZCNT-NEXT: retq
249247
;
250248
; X86-FASTLZCNT-LABEL: ctlz_i8:
251249
; X86-FASTLZCNT: # %bb.0:
252-
; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
250+
; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
251+
; X86-FASTLZCNT-NEXT: shll $24, %eax
253252
; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax
254-
; X86-FASTLZCNT-NEXT: addl $-24, %eax
255253
; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
256254
; X86-FASTLZCNT-NEXT: retl
257255
%tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
@@ -1154,26 +1152,26 @@ define i8 @ctlz_i8_knownbits(i8 %x) {
11541152
; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
11551153
; X86-CLZ-NEXT: orb $64, %al
11561154
; X86-CLZ-NEXT: movzbl %al, %eax
1155+
; X86-CLZ-NEXT: shll $24, %eax
11571156
; X86-CLZ-NEXT: lzcntl %eax, %eax
1158-
; X86-CLZ-NEXT: addl $-24, %eax
11591157
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
11601158
; X86-CLZ-NEXT: retl
11611159
;
11621160
; X64-CLZ-LABEL: ctlz_i8_knownbits:
11631161
; X64-CLZ: # %bb.0:
11641162
; X64-CLZ-NEXT: orb $64, %dil
11651163
; X64-CLZ-NEXT: movzbl %dil, %eax
1164+
; X64-CLZ-NEXT: shll $24, %eax
11661165
; X64-CLZ-NEXT: lzcntl %eax, %eax
1167-
; X64-CLZ-NEXT: addl $-24, %eax
11681166
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
11691167
; X64-CLZ-NEXT: retq
11701168
;
11711169
; X64-FASTLZCNT-LABEL: ctlz_i8_knownbits:
11721170
; X64-FASTLZCNT: # %bb.0:
11731171
; X64-FASTLZCNT-NEXT: orb $64, %dil
11741172
; X64-FASTLZCNT-NEXT: movzbl %dil, %eax
1173+
; X64-FASTLZCNT-NEXT: shll $24, %eax
11751174
; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax
1176-
; X64-FASTLZCNT-NEXT: addl $-24, %eax
11771175
; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
11781176
; X64-FASTLZCNT-NEXT: retq
11791177
;
@@ -1182,8 +1180,8 @@ define i8 @ctlz_i8_knownbits(i8 %x) {
11821180
; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
11831181
; X86-FASTLZCNT-NEXT: orb $64, %al
11841182
; X86-FASTLZCNT-NEXT: movzbl %al, %eax
1183+
; X86-FASTLZCNT-NEXT: shll $24, %eax
11851184
; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax
1186-
; X86-FASTLZCNT-NEXT: addl $-24, %eax
11871185
; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
11881186
; X86-FASTLZCNT-NEXT: retl
11891187

@@ -1581,18 +1579,17 @@ define i8 @ctlz_xor7_i8_true(i8 %x) {
15811579
;
15821580
; X64-FASTLZCNT-LABEL: ctlz_xor7_i8_true:
15831581
; X64-FASTLZCNT: # %bb.0:
1584-
; X64-FASTLZCNT-NEXT: movzbl %dil, %eax
1585-
; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax
1586-
; X64-FASTLZCNT-NEXT: addl $-24, %eax
1582+
; X64-FASTLZCNT-NEXT: shll $24, %edi
1583+
; X64-FASTLZCNT-NEXT: lzcntl %edi, %eax
15871584
; X64-FASTLZCNT-NEXT: xorb $7, %al
15881585
; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
15891586
; X64-FASTLZCNT-NEXT: retq
15901587
;
15911588
; X86-FASTLZCNT-LABEL: ctlz_xor7_i8_true:
15921589
; X86-FASTLZCNT: # %bb.0:
1593-
; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1590+
; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
1591+
; X86-FASTLZCNT-NEXT: shll $24, %eax
15941592
; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax
1595-
; X86-FASTLZCNT-NEXT: addl $-24, %eax
15961593
; X86-FASTLZCNT-NEXT: xorb $7, %al
15971594
; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
15981595
; X86-FASTLZCNT-NEXT: retl

llvm/test/CodeGen/X86/lzcnt.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc < %s -mtriple=i686-- -mattr=+lzcnt | FileCheck %s --check-prefix=X86
33
; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mattr=+lzcnt | FileCheck %s --check-prefix=X32
44
; RUN: llc < %s -mtriple=x86_64-- -mattr=+lzcnt | FileCheck %s --check-prefix=X64
@@ -106,25 +106,23 @@ define i64 @t4(i64 %x) nounwind {
106106
define i8 @t5(i8 %x) nounwind {
107107
; X86-LABEL: t5:
108108
; X86: # %bb.0:
109-
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
109+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
110+
; X86-NEXT: shll $24, %eax
110111
; X86-NEXT: lzcntl %eax, %eax
111-
; X86-NEXT: addl $-24, %eax
112112
; X86-NEXT: # kill: def $al killed $al killed $eax
113113
; X86-NEXT: retl
114114
;
115115
; X32-LABEL: t5:
116116
; X32: # %bb.0:
117-
; X32-NEXT: movzbl %dil, %eax
118-
; X32-NEXT: lzcntl %eax, %eax
119-
; X32-NEXT: addl $-24, %eax
117+
; X32-NEXT: shll $24, %edi
118+
; X32-NEXT: lzcntl %edi, %eax
120119
; X32-NEXT: # kill: def $al killed $al killed $eax
121120
; X32-NEXT: retq
122121
;
123122
; X64-LABEL: t5:
124123
; X64: # %bb.0:
125-
; X64-NEXT: movzbl %dil, %eax
126-
; X64-NEXT: lzcntl %eax, %eax
127-
; X64-NEXT: addl $-24, %eax
124+
; X64-NEXT: shll $24, %edi
125+
; X64-NEXT: lzcntl %edi, %eax
128126
; X64-NEXT: # kill: def $al killed $al killed $eax
129127
; X64-NEXT: retq
130128
%tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 true )

0 commit comments

Comments
 (0)