Commit 275729a
[X86] Generalize i8 CTPOP expansion to work with any input with 8 or less active bits
Extend #79989 slightly to use KnownBits on the CTPOP input; this should make it easier to add the additional cases identified in #79823.
1 parent: 1e7d587
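For reference, the sequence this lowering builds is the multiply-mask-multiply population count for values known to fit in 8 bits: multiplying by 0x08040201 makes four copies of the input spaced 9 bits apart, the shift by 3 plus the 0x11111111 mask leave one distinct input bit in each nibble, and the second multiply sums the eight nibbles into the top nibble. A minimal scalar C++ sketch of that arithmetic (not the lowering code itself, just what the emitted sequence computes):

    #include <cassert>
    #include <cstdint>

    // Population count of a value with at most 8 active bits, using only
    // 32-bit multiplies, shifts and masks -- the same constants the
    // lowering emits (0x08040201, >>3, &0x11111111, *0x11111111, >>28).
    static unsigned popcount8(uint32_t X) {
      assert((X & ~0xFFu) == 0 && "input must have at most 8 active bits");
      uint32_t V = X * 0x08040201u; // four copies of X, 9 bits apart
      V >>= 3;                      // align one input bit to each nibble
      V &= 0x11111111u;             // keep exactly one bit per nibble
      V *= 0x11111111u;             // horizontal add of the eight nibbles
      return V >> 28;               // the sum lands in the top nibble
    }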

File tree: 3 files changed (+63, -113 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 28 additions & 20 deletions
@@ -428,10 +428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
     } else {
       setOperationAction(ISD::CTPOP , MVT::i8 , Custom);
-      setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
-      setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+      setOperationAction(ISD::CTPOP , MVT::i16 , Custom);
+      setOperationAction(ISD::CTPOP , MVT::i32 , Custom);
       if (Subtarget.is64Bit())
-        setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+        setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
       else
         setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
     }
@@ -31030,29 +31030,37 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
 
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  SDLoc DL(Op);
+  MVT VT = N.getSimpleValueType();
+  SDValue Op = N.getOperand(0);
+  SDLoc DL(N);
 
-  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
-  if (VT == MVT::i8) {
-    SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
-    Op = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
-                     DAG.getConstant(0x08040201U, DL, MVT::i32));
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(3, MVT::i32, DL));
-    Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(28, MVT::i32, DL));
-    return DAG.getZExtOrTrunc(Op, DL, VT);
+  if (VT.isScalarInteger()) {
+    KnownBits Known = DAG.computeKnownBits(Op);
+    unsigned ActiveBits = Known.countMaxActiveBits();
+
+    // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+    if (ActiveBits <= 8) {
+      SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
+      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
+                       DAG.getConstant(0x08040201U, DL, MVT::i32));
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(3, MVT::i32, DL));
+      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(28, MVT::i32, DL));
+      return DAG.getZExtOrTrunc(Op, DL, VT);
+    }
+
+    return SDValue(); // fallback to generic expansion.
  }
 
   assert(VT.isVector() &&
          "We only do custom lowering for vector population count.");
-  return LowerVectorCTPOP(Op, DL, Subtarget, DAG);
+  return LowerVectorCTPOP(N, DL, Subtarget, DAG);
 }
 
 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
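With the KnownBits gate, the shorter sequence is no longer limited to MVT::i8: any scalar CTPOP whose operand is provably narrow can take the same path. As a hypothetical illustration of a source pattern that should now qualify on subtargets where the POPCNT instruction is not available (the function and variable names here are illustrative, not from the patch):

    #include <cstdint>

    // The ctpop operand is an i32, but the AND leaves at most 8 active bits,
    // so computeKnownBits() on the operand should report
    // countMaxActiveBits() <= 8 and the multiply-mask-multiply sequence can
    // be used instead of the generic bit-parallel expansion.
    unsigned count_low_mask_bits(uint32_t Mask) {
      return static_cast<unsigned>(__builtin_popcount(Mask & 0xFFu));
    }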

llvm/test/CodeGen/X86/masked_compressstore.ll

Lines changed: 20 additions & 48 deletions
@@ -516,23 +516,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
 ; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: movzbl %al, %ecx
-; AVX512F-NEXT: shrl %eax
-; AVX512F-NEXT: andl $85, %eax
-; AVX512F-NEXT: subl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: shrl $4, %eax
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k2
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: movzbl %al, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
 ; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT: vzeroupper
@@ -543,23 +534,13 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
 ; AVX512VLDQ-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512VLDQ-NEXT: kmovb %k1, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
 ; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLDQ-NEXT: kmovb %k1, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
 ; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT: vzeroupper
@@ -569,23 +550,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLBW: ## %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpmovb2m %xmm2, %k1
-; AVX512VLBW-NEXT: kmovd %k1, %eax
-; AVX512VLBW-NEXT: movzbl %al, %ecx
-; AVX512VLBW-NEXT: shrl %eax
-; AVX512VLBW-NEXT: andl $85, %eax
-; AVX512VLBW-NEXT: subl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: shrl $4, %eax
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
 ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLBW-NEXT: kmovd %k1, %eax
+; AVX512VLBW-NEXT: movzbl %al, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
 ; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT: vzeroupper
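For contrast, the removed instructions in these checks correspond to the target-independent bit-parallel CTPOP expansion: sum adjacent bit pairs, then nibbles, then bytes, and finally add the bytes together with a 0x01010101 multiply. A reference C++ rendering of what that removed sequence computes (a sketch for comparison, not code taken from LLVM):

    #include <cstdint>

    // Generic bit-parallel popcount, mirroring the removed shrl/andl/subl/
    // addl/imull sequence (0x55..., 0x33333333, 0x0F0F0F0F, 0x01010101).
    static unsigned popcount_generic(uint32_t X) {
      X -= (X >> 1) & 0x55555555u;                       // 2-bit sums
      X = (X & 0x33333333u) + ((X >> 2) & 0x33333333u);  // 4-bit sums
      X = (X + (X >> 4)) & 0x0F0F0F0Fu;                  // 8-bit sums
      return (X * 0x01010101u) >> 24;                    // add the four bytes
    }

The removed assembly uses byte-sized masks such as $85 (0x55) because the mask register value is already known to fit in 8 bits; the new sequence replaces all of this with two multiplies, two shifts, and one mask.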

llvm/test/CodeGen/X86/masked_expandload.ll

Lines changed: 15 additions & 45 deletions
@@ -1008,21 +1008,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512F-NEXT: kmovw %k2, %eax
 ; AVX512F-NEXT: movzbl %al, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl %ecx
-; AVX512F-NEXT: andl $-43, %ecx
-; AVX512F-NEXT: subl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $4, %ecx
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
 ; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
@@ -1032,21 +1022,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLDQ-NEXT: vptestnmd %ymm3, %ymm3, %k1
 ; AVX512VLDQ-NEXT: vptestnmd %ymm2, %ymm2, %k2
 ; AVX512VLDQ-NEXT: kmovb %k2, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
 ; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLDQ-NEXT: retq
@@ -1059,21 +1039,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLBW-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLBW-NEXT: kmovd %k2, %eax
 ; AVX512VLBW-NEXT: movzbl %al, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl %ecx
-; AVX512VLBW-NEXT: andl $-43, %ecx
-; AVX512VLBW-NEXT: subl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl $4, %ecx
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
 ; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLBW-NEXT: retq
 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
