Commit 275729a
[X86] Generalize i8 CTPOP expansion to work with any input with 8 or less active bits
Extend #79989 slightly to use KnownBits on the CTPOP input; this should make it easier to add the additional cases identified in #79823.
1 parent: 1e7d587
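For reference, the sequence this lowering builds is the multiply-mask-multiply population count for values known to fit in 8 bits: multiplying by 0x08040201 makes four copies of the input spaced 9 bits apart, the shift by 3 plus the 0x11111111 mask leave one distinct input bit in each nibble, and the second multiply sums the eight nibbles into the top nibble. A minimal scalar C++ sketch of that arithmetic (not the lowering code itself, just what the emitted sequence computes):

    #include <cassert>
    #include <cstdint>

    // Population count of a value with at most 8 active bits, using only
    // 32-bit multiplies, shifts and masks -- the same constants the
    // lowering emits (0x08040201, >>3, &0x11111111, *0x11111111, >>28).
    static unsigned popcount8(uint32_t X) {
      assert((X & ~0xFFu) == 0 && "input must have at most 8 active bits");
      uint32_t V = X * 0x08040201u; // four copies of X, 9 bits apart
      V >>= 3;                      // align one input bit to each nibble
      V &= 0x11111111u;             // keep exactly one bit per nibble
      V *= 0x11111111u;             // horizontal add of the eight nibbles
      return V >> 28;               // the sum lands in the top nibble
    }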

File tree: 3 files changed (+63, -113 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 28 additions & 20 deletions
@@ -428,10 +428,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
     } else {
       setOperationAction(ISD::CTPOP , MVT::i8 , Custom);
-      setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
-      setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+      setOperationAction(ISD::CTPOP , MVT::i16 , Custom);
+      setOperationAction(ISD::CTPOP , MVT::i32 , Custom);
       if (Subtarget.is64Bit())
-        setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+        setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
       else
         setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
     }
@@ -31030,29 +31030,37 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
 
-static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  SDLoc DL(Op);
+  MVT VT = N.getSimpleValueType();
+  SDValue Op = N.getOperand(0);
+  SDLoc DL(N);
 
-  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
-  if (VT == MVT::i8) {
-    SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
-    Op = DAG.getZExtOrTrunc(Op.getOperand(0), DL, MVT::i32);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
-                     DAG.getConstant(0x08040201U, DL, MVT::i32));
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(3, MVT::i32, DL));
-    Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
-    Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
-                     DAG.getShiftAmountConstant(28, MVT::i32, DL));
-    return DAG.getZExtOrTrunc(Op, DL, VT);
+  if (VT.isScalarInteger()) {
+    KnownBits Known = DAG.computeKnownBits(Op);
+    unsigned ActiveBits = Known.countMaxActiveBits();
+
+    // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+    if (ActiveBits <= 8) {
+      SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
+      Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
+                       DAG.getConstant(0x08040201U, DL, MVT::i32));
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(3, MVT::i32, DL));
+      Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
+      Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
+                       DAG.getShiftAmountConstant(28, MVT::i32, DL));
+      return DAG.getZExtOrTrunc(Op, DL, VT);
+    }
+
+    return SDValue(); // fallback to generic expansion.
  }
 
   assert(VT.isVector() &&
          "We only do custom lowering for vector population count.");
-  return LowerVectorCTPOP(Op, DL, Subtarget, DAG);
+  return LowerVectorCTPOP(N, DL, Subtarget, DAG);
 }
 
 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
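With the KnownBits gate, the shorter sequence is no longer limited to MVT::i8: any scalar CTPOP whose operand is provably narrow can take the same path. As a hypothetical illustration of a source pattern that should now qualify on subtargets where the POPCNT instruction is not available (the function and variable names here are illustrative, not from the patch):

    #include <cstdint>

    // The ctpop operand is an i32, but the AND leaves at most 8 active bits,
    // so computeKnownBits() on the operand should report
    // countMaxActiveBits() <= 8 and the multiply-mask-multiply sequence can
    // be used instead of the generic bit-parallel expansion.
    unsigned count_low_mask_bits(uint32_t Mask) {
      return static_cast<unsigned>(__builtin_popcount(Mask & 0xFFu));
    }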

llvm/test/CodeGen/X86/masked_compressstore.ll

Lines changed: 20 additions & 48 deletions
@@ -516,23 +516,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2
 ; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: movzbl %al, %ecx
-; AVX512F-NEXT: shrl %eax
-; AVX512F-NEXT: andl $85, %eax
-; AVX512F-NEXT: subl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: shrl $4, %eax
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
 ; AVX512F-NEXT: kshiftrw $8, %k1, %k2
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: movzbl %al, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
 ; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT: vzeroupper
@@ -543,23 +534,13 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
 ; AVX512VLDQ-NEXT: vpslld $31, %zmm2, %zmm2
 ; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512VLDQ-NEXT: kmovb %k1, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
 ; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLDQ-NEXT: kmovb %k1, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
 ; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT: vzeroupper
@@ -569,23 +550,14 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
 ; AVX512VLBW: ## %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2
 ; AVX512VLBW-NEXT: vpmovb2m %xmm2, %k1
-; AVX512VLBW-NEXT: kmovd %k1, %eax
-; AVX512VLBW-NEXT: movzbl %al, %ecx
-; AVX512VLBW-NEXT: shrl %eax
-; AVX512VLBW-NEXT: andl $85, %eax
-; AVX512VLBW-NEXT: subl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: movl %ecx, %eax
-; AVX512VLBW-NEXT: shrl $4, %eax
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
 ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512VLBW-NEXT: kmovd %k1, %eax
+; AVX512VLBW-NEXT: movzbl %al, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
 ; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT: vzeroupper
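For contrast, the removed instructions in these checks correspond to the target-independent bit-parallel CTPOP expansion: sum adjacent bit pairs, then nibbles, then bytes, and finally add the bytes together with a 0x01010101 multiply. A reference C++ rendering of what that removed sequence computes (a sketch for comparison, not code taken from LLVM):

    #include <cstdint>

    // Generic bit-parallel popcount, mirroring the removed shrl/andl/subl/
    // addl/imull sequence (0x55..., 0x33333333, 0x0F0F0F0F, 0x01010101).
    static unsigned popcount_generic(uint32_t X) {
      X -= (X >> 1) & 0x55555555u;                       // 2-bit sums
      X = (X & 0x33333333u) + ((X >> 2) & 0x33333333u);  // 4-bit sums
      X = (X + (X >> 4)) & 0x0F0F0F0Fu;                  // 8-bit sums
      return (X * 0x01010101u) >> 24;                    // add the four bytes
    }

The removed assembly uses byte-sized masks such as $85 (0x55) because the mask register value is already known to fit in 8 bits; the new sequence replaces all of this with two multiplies, two shifts, and one mask.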

llvm/test/CodeGen/X86/masked_expandload.ll

Lines changed: 15 additions & 45 deletions
@@ -1008,21 +1008,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512F-NEXT: kmovw %k2, %eax
 ; AVX512F-NEXT: movzbl %al, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl %ecx
-; AVX512F-NEXT: andl $-43, %ecx
-; AVX512F-NEXT: subl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512F-NEXT: shrl $2, %eax
-; AVX512F-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512F-NEXT: addl %ecx, %eax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $4, %ecx
-; AVX512F-NEXT: addl %eax, %ecx
-; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512F-NEXT: shrl $24, %eax
+; AVX512F-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512F-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512F-NEXT: shrl $28, %eax
 ; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512F-NEXT: retq
 ;
@@ -1032,21 +1022,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLDQ-NEXT: vptestnmd %ymm3, %ymm3, %k1
 ; AVX512VLDQ-NEXT: vptestnmd %ymm2, %ymm2, %k2
 ; AVX512VLDQ-NEXT: kmovb %k2, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl %ecx
-; AVX512VLDQ-NEXT: andl $-43, %ecx
-; AVX512VLDQ-NEXT: subl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLDQ-NEXT: shrl $2, %eax
-; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLDQ-NEXT: addl %ecx, %eax
-; AVX512VLDQ-NEXT: movl %eax, %ecx
-; AVX512VLDQ-NEXT: shrl $4, %ecx
-; AVX512VLDQ-NEXT: addl %eax, %ecx
-; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLDQ-NEXT: shrl $24, %eax
+; AVX512VLDQ-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLDQ-NEXT: shrl $3, %eax
+; AVX512VLDQ-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLDQ-NEXT: shrl $28, %eax
 ; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLDQ-NEXT: retq
@@ -1059,21 +1039,11 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
 ; AVX512VLBW-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
 ; AVX512VLBW-NEXT: kmovd %k2, %eax
 ; AVX512VLBW-NEXT: movzbl %al, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl %ecx
-; AVX512VLBW-NEXT: andl $-43, %ecx
-; AVX512VLBW-NEXT: subl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: andl $858993459, %ecx ## imm = 0x33333333
-; AVX512VLBW-NEXT: shrl $2, %eax
-; AVX512VLBW-NEXT: andl $858993459, %eax ## imm = 0x33333333
-; AVX512VLBW-NEXT: addl %ecx, %eax
-; AVX512VLBW-NEXT: movl %eax, %ecx
-; AVX512VLBW-NEXT: shrl $4, %ecx
-; AVX512VLBW-NEXT: addl %eax, %ecx
-; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
-; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
-; AVX512VLBW-NEXT: shrl $24, %eax
+; AVX512VLBW-NEXT: imull $134480385, %eax, %eax ## imm = 0x8040201
+; AVX512VLBW-NEXT: shrl $3, %eax
+; AVX512VLBW-NEXT: andl $286331153, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: imull $286331153, %eax, %eax ## imm = 0x11111111
+; AVX512VLBW-NEXT: shrl $28, %eax
 ; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512VLBW-NEXT: retq
 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
