
Commit 4f2e183

[X86] combineOr - don't demand operand elements if the other operand element is 'allones'
If either operand has an element with all bits set, then we don't need the equivalent element from the other operand, as all bits of that result element are guaranteed to be set anyway.
1 parent a9d8114 commit 4f2e183
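
To illustrate the rule outside the DAG combiner, here is a minimal, standalone C++ sketch (plain C++, not the LLVM APInt/TargetLowering API; demandedEltsForOr is a hypothetical helper invented for this example): for an OR, any lane where the constant operand is all-ones forces that lane of the result to all-ones, so the matching lane of the other operand is not demanded.

#include <cstdint>
#include <cstdio>
#include <vector>

// Returns a bitmask with bit I set iff lane I of the constant OR operand is
// NOT all-ones, i.e. iff the other operand's lane I still affects the result.
uint32_t demandedEltsForOr(const std::vector<uint64_t> &ConstLanes,
                           uint64_t AllOnes) {
  uint32_t Demanded = 0;
  for (size_t I = 0; I != ConstLanes.size(); ++I)
    if (ConstLanes[I] != AllOnes)
      Demanded |= 1u << I;
  return Demanded;
}

int main() {
  // v4i32 constant <-1, 0, -1, 0>: lanes 0 and 2 are all-ones, so only
  // lanes 1 and 3 of the other OR operand are demanded -> prints 0xa.
  std::printf("%#x\n",
              demandedEltsForOr({0xFFFFFFFFu, 0, 0xFFFFFFFFu, 0}, 0xFFFFFFFFu));
  return 0;
}

In the patch itself the equivalent mask is built with APInt::getZero/setBit and handed to TLI.SimplifyDemandedVectorElts, which can then simplify or drop the computation feeding the undemanded lanes.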

3 files changed: +38 −20 lines changed


llvm/lib/Target/X86/X86ISelLowering.cpp
Lines changed: 27 additions & 2 deletions

@@ -46647,6 +46647,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // If this is SSE1 only convert to FOR to avoid scalarization.
   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
@@ -46663,7 +46664,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
   SmallVector<APInt, 2> SrcPartials;
   if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
       SrcOps.size() == 1) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
     EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
     SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
@@ -46724,11 +46724,36 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Attempt to recursively combine an OR of shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+    // Attempt to recursively combine an OR of shuffles.
     SDValue Op(N, 0);
     if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
       return Res;
+
+    // If either operand is a constant mask, then only the elements that aren't
+    // allones are actually demanded by the other operand.
+    auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
+      APInt UndefElts;
+      SmallVector<APInt> EltBits;
+      int NumElts = VT.getVectorNumElements();
+      int EltSizeInBits = VT.getScalarSizeInBits();
+      if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
+        return false;
+
+      APInt DemandedElts = APInt::getZero(NumElts);
+      for (int I = 0; I != NumElts; ++I)
+        if (!EltBits[I].isAllOnes())
+          DemandedElts.setBit(I);
+
+      APInt KnownUndef, KnownZero;
+      return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef,
+                                            KnownZero, DCI);
+    };
+    if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        DCI.AddToWorklist(N);
+      return SDValue(N, 0);
+    }
   }
 
   // We should fold "masked merge" patterns when `andn` is not available.
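
The test updates below show the fold in action. In vshift-6.ll, for example, one operand of the final OR is the constant mask [255,255,255,255,255,255,255,0,255,...]: every lane except lane 7 is all-ones, so only lane 7 of the other operand is demanded, and the movdqa/pandn sequence that previously materialized that operand simplifies away, leaving a single por against a constant-pool entry.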

llvm/test/CodeGen/X86/setcc-lowering.ll
Lines changed: 5 additions & 6 deletions

@@ -9,12 +9,11 @@
 define <8 x i16> @pr25080(<8 x i32> %a) {
 ; AVX-LABEL: pr25080:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq

llvm/test/CodeGen/X86/vshift-6.ll
Lines changed: 6 additions & 12 deletions

@@ -30,12 +30,9 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movb %al, (%ecx)
-; X86-NEXT:    movd %eax, %xmm0
-; X86-NEXT:    psllq $56, %xmm0
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X86-NEXT:    movdqa %xmm2, %xmm1
-; X86-NEXT:    pandn %xmm0, %xmm1
-; X86-NEXT:    por %xmm2, %xmm1
+; X86-NEXT:    movd %eax, %xmm1
+; X86-NEXT:    psllq $56, %xmm1
+; X86-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
 ; X86-NEXT:    pcmpeqd %xmm3, %xmm3
 ; X86-NEXT:    psllw $5, %xmm1
 ; X86-NEXT:    pxor %xmm2, %xmm2
@@ -65,12 +62,9 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X64-LABEL: do_not_crash:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movb %r9b, (%rdi)
-; X64-NEXT:    movd %r9d, %xmm0
-; X64-NEXT:    psllq $56, %xmm0
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; X64-NEXT:    movdqa %xmm2, %xmm1
-; X64-NEXT:    pandn %xmm0, %xmm1
-; X64-NEXT:    por %xmm2, %xmm1
+; X64-NEXT:    movd %r9d, %xmm1
+; X64-NEXT:    psllq $56, %xmm1
+; X64-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-NEXT:    pcmpeqd %xmm2, %xmm2
 ; X64-NEXT:    psllw $5, %xmm1
 ; X64-NEXT:    pxor %xmm3, %xmm3
