[X86] Add i8 CTPOP lowering using i32 MUL #79989

RKSimon · 2024-01-30T11:52:28Z

This is the first basic proposal in #79823 - we can investigate improving support for other widths if we can find further use cases.

llvmbot · 2024-01-31T16:42:34Z

@llvm/pr-subscribers-llvm-selectiondag

Author: Simon Pilgrim (RKSimon)

Changes

Fixes #79823

Full diff: https://github.com/llvm/llvm-project/pull/79989.diff

5 Files Affected:

(modified) llvm/include/llvm/CodeGen/TargetLowering.h (+6)
(modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+18)
(modified) llvm/lib/Target/X86/X86ISelLowering.h (+4)
(modified) llvm/test/CodeGen/X86/ctpop-combine.ll (+7-14)
(modified) llvm/test/CodeGen/X86/popcnt.ll (+21-37)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d39094aa7fed7..f4e9d7baf82b6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3258,6 +3258,12 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if CTPOP/CTTZ/CTLZ/PARITY expansions should try to use integer
+  /// multiplies should the input value be suitable.
+  virtual bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const {
+    return false;
+  }
+
   // Should we fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x))
   // A) where y has a single bit set?
   virtual bool shouldFoldSelectWithSingleBitTest(EVT VT,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5828822e062b1..f224930bb5bb9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8639,6 +8639,24 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
   if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
     return SDValue();
 
+  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+  if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32) &&
+      isOperationLegal(ISD::AND, MVT::i32) &&
+      isOperationLegal(ISD::SRL, MVT::i32) &&
+      isOperationLegal(ISD::MUL, MVT::i32)) {
+    SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
+    Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
+                     DAG.getConstant(0x08040201U, dl, MVT::i32));
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(3, MVT::i32, dl));
+    Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(28, MVT::i32, dl));
+    return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+  }
+
   // This is the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   SDValue Mask55 =
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 32745400a38b7..c87e29dc46db9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1174,6 +1174,10 @@ namespace llvm {
 
     bool shouldSplatInsEltVarIndex(EVT VT) const override;
 
+    bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const override {
+      return CntVT.isScalarInteger() && isOperationLegal(ISD::MUL, MulVT);
+    }
+
     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
       // Converting to sat variants holds little benefit on X86 as we will just
       // need to saturate the value back using fp arithmatic.
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index fba44218e0572..73152e9f909cf 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: test4:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; NO-POPCOUNT-NEXT:    andb $127, %cl
-; NO-POPCOUNT-NEXT:    shrb %dil
-; NO-POPCOUNT-NEXT:    andb $21, %dil
-; NO-POPCOUNT-NEXT:    subb %dil, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    andb $51, %al
-; NO-POPCOUNT-NEXT:    shrb $2, %cl
-; NO-POPCOUNT-NEXT:    andb $51, %cl
-; NO-POPCOUNT-NEXT:    addb %al, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    shrb $4, %al
-; NO-POPCOUNT-NEXT:    addb %cl, %al
-; NO-POPCOUNT-NEXT:    andb $15, %al
+; NO-POPCOUNT-NEXT:    andl $127, %edi
+; NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
+; NO-POPCOUNT-NEXT:    shrl $3, %eax
+; NO-POPCOUNT-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    shrl $28, %eax
+; NO-POPCOUNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NO-POPCOUNT-NEXT:    retq
   %x2 = and i8 %x, 127
   %count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index a9d77fd2c0a61..c8d060dfee182 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -10,37 +10,24 @@
 define i8 @cnt8(i8 %x) nounwind readnone {
 ; X86-LABEL: cnt8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb %al
-; X86-NEXT:    andb $85, %al
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $51, %al
-; X86-NEXT:    shrb $2, %cl
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    addb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $4, %al
-; X86-NEXT:    addb %cl, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X86-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X86-NEXT:    shrl $28, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb %al
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    andb $51, %cl
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    addb %dil, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %cl, %al
-; X64-NEXT:    andb $15, %al
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NEXT:    shrl $28, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt8:
@@ -59,16 +46,13 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ;
 ; X64-NDD-LABEL: cnt8:
 ; X64-NDD:       # %bb.0:
-; X64-NDD-NEXT:    shrb %dil, %al
-; X64-NDD-NEXT:    andb $85, %al
-; X64-NDD-NEXT:    subb %al, %dil, %al
-; X64-NDD-NEXT:    andb $51, %al, %cl
-; X64-NDD-NEXT:    shrb $2, %al
-; X64-NDD-NEXT:    andb $51, %al
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    shrb $4, %al, %cl
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    andb $15, %al
+; X64-NDD-NEXT:    movzbl %dil, %eax
+; X64-NDD-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NDD-NEXT:    shrl $3, %eax
+; X64-NDD-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    shrl $28, %eax
+; X64-NDD-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NDD-NEXT:    retq
   %cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
   ret i8 %cnt

llvmbot · 2024-01-31T16:42:34Z

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Fixes #79823

Full diff: https://github.com/llvm/llvm-project/pull/79989.diff

5 Files Affected:

(modified) llvm/include/llvm/CodeGen/TargetLowering.h (+6)
(modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+18)
(modified) llvm/lib/Target/X86/X86ISelLowering.h (+4)
(modified) llvm/test/CodeGen/X86/ctpop-combine.ll (+7-14)
(modified) llvm/test/CodeGen/X86/popcnt.ll (+21-37)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d39094aa7fed7..f4e9d7baf82b6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3258,6 +3258,12 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Return true if CTPOP/CTTZ/CTLZ/PARITY expansions should try to use integer
+  /// multiplies should the input value be suitable.
+  virtual bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const {
+    return false;
+  }
+
   // Should we fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x))
   // A) where y has a single bit set?
   virtual bool shouldFoldSelectWithSingleBitTest(EVT VT,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5828822e062b1..f224930bb5bb9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8639,6 +8639,24 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
   if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
     return SDValue();
 
+  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+  if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32) &&
+      isOperationLegal(ISD::AND, MVT::i32) &&
+      isOperationLegal(ISD::SRL, MVT::i32) &&
+      isOperationLegal(ISD::MUL, MVT::i32)) {
+    SDValue Mask11 = DAG.getConstant(0x11111111U, dl, MVT::i32);
+    Op = DAG.getZExtOrTrunc(Op, dl, MVT::i32);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op,
+                     DAG.getConstant(0x08040201U, dl, MVT::i32));
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(3, MVT::i32, dl));
+    Op = DAG.getNode(ISD::AND, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::MUL, dl, MVT::i32, Op, Mask11);
+    Op = DAG.getNode(ISD::SRL, dl, MVT::i32, Op,
+                     DAG.getShiftAmountConstant(28, MVT::i32, dl));
+    return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+  }
+
   // This is the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   SDValue Mask55 =
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 32745400a38b7..c87e29dc46db9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1174,6 +1174,10 @@ namespace llvm {
 
     bool shouldSplatInsEltVarIndex(EVT VT) const override;
 
+    bool shouldAllowMultiplyInBitCounts(EVT CntVT, EVT MulVT) const override {
+      return CntVT.isScalarInteger() && isOperationLegal(ISD::MUL, MulVT);
+    }
+
     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
       // Converting to sat variants holds little benefit on X86 as we will just
       // need to saturate the value back using fp arithmatic.
diff --git a/llvm/test/CodeGen/X86/ctpop-combine.ll b/llvm/test/CodeGen/X86/ctpop-combine.ll
index fba44218e0572..73152e9f909cf 100644
--- a/llvm/test/CodeGen/X86/ctpop-combine.ll
+++ b/llvm/test/CodeGen/X86/ctpop-combine.ll
@@ -88,20 +88,13 @@ define i8 @test4(i8 %x) nounwind readnone {
 ;
 ; NO-POPCOUNT-LABEL: test4:
 ; NO-POPCOUNT:       # %bb.0:
-; NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; NO-POPCOUNT-NEXT:    andb $127, %cl
-; NO-POPCOUNT-NEXT:    shrb %dil
-; NO-POPCOUNT-NEXT:    andb $21, %dil
-; NO-POPCOUNT-NEXT:    subb %dil, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    andb $51, %al
-; NO-POPCOUNT-NEXT:    shrb $2, %cl
-; NO-POPCOUNT-NEXT:    andb $51, %cl
-; NO-POPCOUNT-NEXT:    addb %al, %cl
-; NO-POPCOUNT-NEXT:    movl %ecx, %eax
-; NO-POPCOUNT-NEXT:    shrb $4, %al
-; NO-POPCOUNT-NEXT:    addb %cl, %al
-; NO-POPCOUNT-NEXT:    andb $15, %al
+; NO-POPCOUNT-NEXT:    andl $127, %edi
+; NO-POPCOUNT-NEXT:    imull $134480385, %edi, %eax # imm = 0x8040201
+; NO-POPCOUNT-NEXT:    shrl $3, %eax
+; NO-POPCOUNT-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; NO-POPCOUNT-NEXT:    shrl $28, %eax
+; NO-POPCOUNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NO-POPCOUNT-NEXT:    retq
   %x2 = and i8 %x, 127
   %count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index a9d77fd2c0a61..c8d060dfee182 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -10,37 +10,24 @@
 define i8 @cnt8(i8 %x) nounwind readnone {
 ; X86-LABEL: cnt8:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb %al
-; X86-NEXT:    andb $85, %al
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andb $51, %al
-; X86-NEXT:    shrb $2, %cl
-; X86-NEXT:    andb $51, %cl
-; X86-NEXT:    addb %al, %cl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    shrb $4, %al
-; X86-NEXT:    addb %cl, %al
-; X86-NEXT:    andb $15, %al
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X86-NEXT:    shrl $3, %eax
+; X86-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X86-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X86-NEXT:    shrl $28, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cnt8:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrb %al
-; X64-NEXT:    andb $85, %al
-; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    andb $51, %cl
-; X64-NEXT:    shrb $2, %dil
-; X64-NEXT:    andb $51, %dil
-; X64-NEXT:    addb %dil, %cl
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    shrb $4, %al
-; X64-NEXT:    addb %cl, %al
-; X64-NEXT:    andb $15, %al
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NEXT:    shrl $3, %eax
+; X64-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NEXT:    shrl $28, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-POPCNT-LABEL: cnt8:
@@ -59,16 +46,13 @@ define i8 @cnt8(i8 %x) nounwind readnone {
 ;
 ; X64-NDD-LABEL: cnt8:
 ; X64-NDD:       # %bb.0:
-; X64-NDD-NEXT:    shrb %dil, %al
-; X64-NDD-NEXT:    andb $85, %al
-; X64-NDD-NEXT:    subb %al, %dil, %al
-; X64-NDD-NEXT:    andb $51, %al, %cl
-; X64-NDD-NEXT:    shrb $2, %al
-; X64-NDD-NEXT:    andb $51, %al
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    shrb $4, %al, %cl
-; X64-NDD-NEXT:    addb %cl, %al
-; X64-NDD-NEXT:    andb $15, %al
+; X64-NDD-NEXT:    movzbl %dil, %eax
+; X64-NDD-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NDD-NEXT:    shrl $3, %eax
+; X64-NDD-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    shrl $28, %eax
+; X64-NDD-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NDD-NEXT:    retq
   %cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
   ret i8 %cnt

phoebewang

LGTM.

phoebewang · 2024-02-01T06:06:27Z

llvm/test/CodeGen/X86/popcnt.ll

+; X64-NDD-NEXT:    movzbl %dil, %eax
+; X64-NDD-NEXT:    imull $134480385, %eax, %eax # imm = 0x8040201
+; X64-NDD-NEXT:    shrl $3, %eax
+; X64-NDD-NEXT:    andl $286331153, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    imull $286331153, %eax, %eax # imm = 0x11111111
+; X64-NDD-NEXT:    shrl $28, %eax
+; X64-NDD-NEXT:    # kill: def $al killed $al killed $eax


Any chance to merge the three together?

phoebewang · 2024-02-01T06:08:35Z

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

+                     DAG.getShiftAmountConstant(28, MVT::i32, dl));
+    return DAG.getZExtOrTrunc(Op, dl, MVT::i8);
+  }
+
  // This is the "best" algorithm from


So it is not the "best" algorithm now :)

The algorithm is not the "best" in all cases, although it does work for the common cases of counting bits in 32-bit and 64-bit integers. :)

arsenm · 2024-02-01T06:22:26Z

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

@@ -8639,6 +8639,24 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
  if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
    return SDValue();

+  // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
+  if (VT == MVT::i8 && shouldAllowMultiplyInBitCounts(MVT::i8, MVT::i32) &&


Do we really need yet another extremely specific target hook? I would expect the isOperationLegal checks to be the base implementation if so.

So far only x86 (and maybe arm) have benefited from this

Would checking the immediate cost be able to distinguish targets where this is profitable?

I've started going down that route, but it's not looking great - I'm very tempted to just make this X86 only for now.

Fixes llvm#79823

RKSimon · 2024-02-01T17:55:00Z

Limited this to X86

github-actions · 2024-02-01T17:56:47Z

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:

git-clang-format --diff ea2984287d91b96f5e2cc0aa66d146d6dbd1d1bb 618e28224775186af3e0d219eebeb8fae1c8acc8 -- llvm/lib/Target/X86/X86ISelLowering.cpp

View the diff from clang-format here.

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index de2df5c036..8315745f57 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -427,7 +427,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // on the dest that popcntl hasn't had since Cannon Lake.
     setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
   } else {
-    setOperationAction(ISD::CTPOP          , MVT::i8   , Custom);
+    setOperationAction(ISD::CTPOP, MVT::i8, Custom);
     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
     if (Subtarget.is64Bit())

phoebewang

LGTM.

…less active bits Extend #79989 slightly to use KnownBits on the CTPOP input - this should make it easier to add additional cases identified in #79823

This is the first basic proposal in llvm#79823 - we can investigate improving support for other widths if we can find further use cases.

…less active bits Extend llvm#79989 slightly to use KnownBits on the CTPOP input - this should make it easier to add additional cases identified in llvm#79823

Updated to match #79989 / 9410019

RKSimon force-pushed the ctpop8 branch 3 times, most recently from a5f3837 to 64594e7 Compare January 31, 2024 16:34

RKSimon marked this pull request as ready for review January 31, 2024 16:42

llvmbot added the backend:X86 and llvm:SelectionDAG (SelectionDAGISel as well) labels Jan 31, 2024

RKSimon requested review from nikic, phoebewang, topperc and goldsteinn January 31, 2024 16:42

phoebewang approved these changes Feb 1, 2024

View reviewed changes

arsenm reviewed Feb 1, 2024

View reviewed changes

RKSimon force-pushed the ctpop8 branch 2 times, most recently from 65b452e to a23c221 Compare February 1, 2024 12:29

[X86] Add i8 CTPOP lowering using i32 MUL

618e282

Fixes llvm#79823

RKSimon force-pushed the ctpop8 branch from a23c221 to 618e282 Compare February 1, 2024 17:54

RKSimon changed the title ~~[DAG] Add generic i8 CTPOP lowering using i32 MUL~~ [X86] Add i8 CTPOP lowering using i32 MUL Feb 1, 2024

RKSimon requested a review from phoebewang February 1, 2024 17:54

phoebewang approved these changes Feb 2, 2024

View reviewed changes

RKSimon merged commit 9410019 into llvm:main Feb 2, 2024

RKSimon deleted the ctpop8 branch February 2, 2024 10:41

RKSimon added a commit that referenced this pull request Feb 21, 2024

[CostModel][X86] Fix expanded CTPOP i8 costs

a0869b1

Updated to match #79989 / 9410019

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[X86] Add i8 CTPOP lowering using i32 MUL #79989

[X86] Add i8 CTPOP lowering using i32 MUL #79989

RKSimon commented Jan 30, 2024 •

edited

Loading

Uh oh!

llvmbot commented Jan 31, 2024

Uh oh!

llvmbot commented Jan 31, 2024

Uh oh!

phoebewang left a comment

Uh oh!

phoebewang Feb 1, 2024

Uh oh!

phoebewang Feb 1, 2024

Uh oh!

Explorer09 Feb 3, 2024

Uh oh!

arsenm Feb 1, 2024

Uh oh!

RKSimon Feb 1, 2024

Uh oh!

nikic Feb 1, 2024

Uh oh!

RKSimon Feb 1, 2024

Uh oh!

RKSimon commented Feb 1, 2024

Uh oh!

github-actions bot commented Feb 1, 2024

Uh oh!

phoebewang left a comment

Uh oh!

Uh oh!

[X86] Add i8 CTPOP lowering using i32 MUL #79989

[X86] Add i8 CTPOP lowering using i32 MUL #79989

Conversation

RKSimon commented Jan 30, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jan 31, 2024

Uh oh!

llvmbot commented Jan 31, 2024

Uh oh!

phoebewang left a comment

Choose a reason for hiding this comment

Uh oh!

phoebewang Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

phoebewang Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

Explorer09 Feb 3, 2024

Choose a reason for hiding this comment

Uh oh!

arsenm Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

RKSimon Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

nikic Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

RKSimon Feb 1, 2024

Choose a reason for hiding this comment

Uh oh!

RKSimon commented Feb 1, 2024

Uh oh!

github-actions bot commented Feb 1, 2024

Uh oh!

phoebewang left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

RKSimon commented Jan 30, 2024 •

edited

Loading