[X86] Improve helper for simplifying demanded bits of compares

goldsteinn · goldsteinn · commit c67e83a2dc07 · 2024-03-20T16:15:37.000-05:00
We currently only handle a single case for `pcmpgt`. This patch
extends that to work for `cmpp` and handles comparitors more
generically.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41346,6 +41346,156 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Simplify a decomposed (sext (setcc)). Assumes prior check that
+// bitwidth(sext)==bitwidth(setcc operands).
+static SDValue simplifySExtOfDecomposedSetCCImpl(
+    SelectionDAG &DAG, const SDLoc &DL, ISD::CondCode CC, SDValue Op0,
+    SDValue Op1, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, bool AllowNOT, unsigned Depth) {
+  // Possible TODO: We could handle any power of two demanded bit + unsigned
+  // comparison. There are no x86 specific comparisons that are unsigned so its
+  // unneeded.
+  if (!OriginalDemandedBits.isSignMask())
+    return SDValue();
+
+  EVT OpVT = Op0.getValueType();
+  // We need need nofpclass(nan inf nzero) to handle floats.
+  auto hasOkayFPFlags = [](SDValue Op) {
+    return Op.getOpcode() == ISD::SINT_TO_FP ||
+           Op.getOpcode() == ISD::UINT_TO_FP ||
+           (Op->getFlags().hasNoNaNs() && Op->getFlags().hasNoInfs() &&
+            Op->getFlags().hasNoSignedZeros());
+  };
+
+  if (OpVT.isFloatingPoint() && !hasOkayFPFlags(Op0))
+    return SDValue();
+
+  auto ValsEq = [OpVT](const APInt &V0, APInt V1) -> bool {
+    if (OpVT.isFloatingPoint()) {
+      const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(OpVT);
+      return V0.eq(APFloat(Sem, V1).bitcastToAPInt());
+    }
+    return V0.eq(V1);
+  };
+
+  // Assume we canonicalized constants to Op1. That isn't always true but we
+  // call this function twice with inverted CC/Operands so its fine either way.
+  APInt Op1C;
+  unsigned ValWidth = OriginalDemandedBits.getBitWidth();
+  if (ISD::isConstantSplatVectorAllZeros(Op1.getNode())) {
+    Op1C = APInt::getZero(ValWidth);
+  } else if (ISD::isConstantSplatVectorAllOnes(Op1.getNode())) {
+    Op1C = APInt::getAllOnes(ValWidth);
+  } else if (auto *C = dyn_cast<ConstantFPSDNode>(Op1)) {
+    Op1C = C->getValueAPF().bitcastToAPInt();
+  } else if (auto *C = dyn_cast<ConstantSDNode>(Op1)) {
+    Op1C = C->getAPIntValue();
+  } else if (ISD::isConstantSplatVector(Op1.getNode(), Op1C)) {
+    // isConstantSplatVector sets `Op1C`.
+  } else {
+    return SDValue();
+  }
+
+  bool Not = false;
+  bool Okay = false;
+  assert(OriginalDemandedBits.getBitWidth() == Op1C.getBitWidth() &&
+         "Invalid constant operand");
+
+  switch (CC) {
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    // signbit(sext(x s< 0)) == signbit(x)
+    // signbit(sext(x s>= 0)) == signbit(~x)
+    Okay = ValsEq(Op1C, APInt::getZero(ValWidth));
+    // For float ops we need to ensure Op0 is de-norm. Otherwise DAZ can break
+    // this fold.
+    // NB: We only need de-norm check here, for the rest of the constants any
+    // relationship with a de-norm value and zero will be identical.
+    if (Okay && OpVT.isFloatingPoint()) {
+      // Values from integers are always normal.
+      if (Op0.getOpcode() == ISD::SINT_TO_FP ||
+          Op0.getOpcode() == ISD::UINT_TO_FP)
+        break;
+
+      // See if we can prove normal with known bits.
+      KnownBits Op0Known =
+          DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth);
+      // Negative/positive doesn't matter.
+      Op0Known.One.clearSignBit();
+      Op0Known.Zero.clearSignBit();
+
+      // Get min normal value.
+      const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(OpVT);
+      KnownBits MinNormal = KnownBits::makeConstant(
+          APFloat::getSmallestNormalized(Sem).bitcastToAPInt());
+      // Are we above de-norm range?
+      std::optional<bool> Op0Normal = KnownBits::uge(Op0Known, MinNormal);
+      Okay = Op0Normal.value_or(false);
+    }
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    // signbit(sext(x s<= -1)) == signbit(x)
+    // signbit(sext(x s> -1)) == signbit(~x)
+    Okay = ValsEq(Op1C, APInt::getAllOnes(ValWidth));
+    break;
+  case ISD::SETULT:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETUGE:
+    // signbit(sext(x u>= SIGNED_MIN)) == signbit(x)
+    // signbit(sext(x u< SIGNED_MIN)) == signbit(~x)
+    Okay = !OpVT.isFloatingPoint() && ValsEq(Op1C, OriginalDemandedBits);
+    break;
+  case ISD::SETULE:
+    Not = true;
+    [[fallthrough]];
+  case ISD::SETUGT:
+    // signbit(sext(x u> SIGNED_MAX)) == signbit(x)
+    // signbit(sext(x u<= SIGNED_MAX)) == signbit(~x)
+    Okay = !OpVT.isFloatingPoint() && ValsEq(Op1C, OriginalDemandedBits - 1);
+    break;
+  default:
+    break;
+  }
+
+  Okay &= Not ? AllowNOT : true;
+  if (!Okay)
+    return SDValue();
+
+  if (!Not)
+    return Op0;
+
+  if (!OpVT.isFloatingPoint())
+    return DAG.getNOT(DL, Op0, OpVT);
+
+  // Possible TODO: We could use `fneg` to do not.
+  return SDValue();
+}
+
+static SDValue simplifySExtOfDecomposedSetCC(SelectionDAG &DAG, const SDLoc &DL,
+                                             ISD::CondCode CC, SDValue Op0,
+                                             SDValue Op1,
+                                             const APInt &OriginalDemandedBits,
+                                             const APInt &OriginalDemandedElts,
+                                             bool AllowNOT, unsigned Depth) {
+  if (SDValue R = simplifySExtOfDecomposedSetCCImpl(
+          DAG, DL, CC, Op0, Op1, OriginalDemandedBits, OriginalDemandedElts,
+          AllowNOT, Depth))
+    return R;
+  return simplifySExtOfDecomposedSetCCImpl(
+      DAG, DL, ISD::getSetCCSwappedOperands(CC), Op1, Op0, OriginalDemandedBits,
+      OriginalDemandedElts, AllowNOT, Depth);
+}
+
 // Simplify variable target shuffle masks based on the demanded elements.
 // TODO: Handle DemandedBits in mask indices as well?
 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
@@ -42525,13 +42675,26 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
     }
     break;
   }
-  case X86ISD::PCMPGT:
-    // icmp sgt(0, R) == ashr(R, BitWidth-1).
-    // iff we only need the sign bit then we can use R directly.
-    if (OriginalDemandedBits.isSignMask() &&
-        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
-      return TLO.CombineTo(Op, Op.getOperand(1));
+  case X86ISD::PCMPGT: {
+    SDLoc DL(Op);
+    if (SDValue R = simplifySExtOfDecomposedSetCC(
+            TLO.DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1),
+            OriginalDemandedBits, OriginalDemandedElts,
+            /*AllowNOT*/ true, Depth))
+      return TLO.CombineTo(Op, R);
+    break;
+  }
+  case X86ISD::CMPP: {
+    SDLoc DL(Op);
+    ISD::CondCode CC = X86::getCondForCMPPImm(
+        cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+    if (SDValue R = simplifySExtOfDecomposedSetCC(
+            TLO.DAG, DL, CC, Op.getOperand(0), Op.getOperand(1),
+            OriginalDemandedBits, OriginalDemandedElts,
+            !(TLO.LegalOperations() && TLO.LegalTypes()), Depth))
+      return TLO.CombineTo(Op, R);
     break;
+  }
   case X86ISD::MOVMSK: {
     SDValue Src = Op.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
@@ -42715,13 +42878,25 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
     if (DemandedBits.isSignMask())
       return Op.getOperand(0);
     break;
-  case X86ISD::PCMPGT:
-    // icmp sgt(0, R) == ashr(R, BitWidth-1).
-    // iff we only need the sign bit then we can use R directly.
-    if (DemandedBits.isSignMask() &&
-        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
-      return Op.getOperand(1);
+  case X86ISD::PCMPGT: {
+    SDLoc DL(Op);
+    if (SDValue R = simplifySExtOfDecomposedSetCC(
+            DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1),
+            DemandedBits, DemandedElts, /*AllowNOT*/ false, Depth))
+      return R;
+    break;
+  }
+  case X86ISD::CMPP: {
+    SDLoc DL(Op);
+    ISD::CondCode CC = X86::getCondForCMPPImm(
+        cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+    if (SDValue R = simplifySExtOfDecomposedSetCC(DAG, DL, CC, Op.getOperand(0),
+                                                  Op.getOperand(1),
+                                                  DemandedBits, DemandedElts,
+                                                  /*AllowNOT*/ false, Depth))
+      return R;
     break;
+  }
   case X86ISD::BLENDV: {
     // BLENDV: Cond (MSB) ? LHS : RHS
     SDValue Cond = Op.getOperand(0);
@@ -48397,7 +48572,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
 
   // We do not split for SSE at all, but we need to split vectors for AVX1 and
   // AVX2.
-  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && 
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
     SDValue LoX, HiX;
     std::tie(LoX, HiX) = splitVector(X, DAG, DL);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3360,6 +3360,46 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
   }
 }
 
+ISD::CondCode X86::getCondForCMPPImm(unsigned Imm) {
+  assert(Imm <= 0x1f && "Invalid CMPP Imm");
+  switch (Imm & 0xf) {
+  default:
+    llvm_unreachable("Invalid CMPP Imm");
+  case 0:
+    return ISD::SETOEQ;
+  case 1:
+    return ISD::SETOLT;
+  case 2:
+    return ISD::SETOLE;
+  case 3:
+    return ISD::SETUO;
+  case 4:
+    return ISD::SETUNE;
+  case 5:
+    return ISD::SETUGE;
+  case 6:
+    return ISD::SETUGT;
+  case 7:
+    return ISD::SETO;
+  case 8:
+    return ISD::SETUEQ;
+  case 9:
+    return ISD::SETULT;
+  case 10:
+    return ISD::SETULE;
+  case 11:
+    return ISD::SETFALSE;
+  case 12:
+    return ISD::SETONE;
+  case 13:
+    return ISD::SETOGE;
+  case 14:
+    return ISD::SETOGT;
+  case 15:
+    return ISD::SETTRUE;
+  }
+}
+
 /// Get the VPCMP immediate if the operands are swapped.
 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
   switch (Imm) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -72,6 +72,9 @@ CondCode GetOppositeBranchCondition(CondCode CC);
 /// Get the VPCMP immediate for the given condition.
 unsigned getVPCMPImmForCond(ISD::CondCode CC);
 
+/// Get the CondCode from a CMPP immediate.
+ISD::CondCode getCondForCMPPImm(unsigned Imm);
+
 /// Get the VPCMP immediate if the opcodes are swapped.
 unsigned getSwappedVPCMPImm(unsigned Imm);
 
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -175,8 +175,6 @@ define i32 @testpsnzc_256_signbit(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movl %edi, %eax
 ; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vcmpltps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vtestps %ymm1, %ymm0
 ; AVX-NEXT:    cmovnel %esi, %eax
 ; AVX-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -862,8 +862,6 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vcmpltps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vxorps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm1
 ; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
@@ -1063,8 +1061,6 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
-; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT:    vcmpltps %ymm5, %ymm2, %ymm2
 ; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vpsrad $31, %xmm6, %xmm2
 ; AVX1-NEXT:    vpsrad $31, %xmm4, %xmm4
@@ -1073,21 +1069,20 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
 ; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm7, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm7, %ymm8
-; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm6, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm6
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm7
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
 ; AVX1-NEXT:    vcvtdq2ps %ymm3, %ymm3
-; AVX1-NEXT:    vcmpltps %ymm5, %ymm3, %ymm3
 ; AVX1-NEXT:    vxorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT:    vpsrad $31, %xmm7, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm6, %xmm3
 ; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm8, %ymm1
+; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm7, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: v16i32:
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -1957,35 +1957,32 @@ define <4 x i64> @PR52504(<4 x i16> %t3) {
 ; SSE42-LABEL: PR52504:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovsxwq %xmm1, %xmm2
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm3
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm0, %xmm0
-; SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT:    por %xmm3, %xmm0
-; SSE42-NEXT:    pcmpgtq %xmm2, %xmm1
-; SSE42-NEXT:    por %xmm2, %xmm1
+; SSE42-NEXT:    pmovsxwq %xmm1, %xmm1
+; SSE42-NEXT:    pmovsxwq %xmm0, %xmm2
+; SSE42-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE42-NEXT:    movdqa %xmm2, %xmm0
+; SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
+; SSE42-NEXT:    movdqa %xmm1, %xmm0
+; SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
 ; AVX1-LABEL: PR52504:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vblendvpd %xmm0, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vblendvpd %xmm1, %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR52504:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: PR52504:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll