
[LLVM][SelectionDAG] Don't legalise splat constants until required. #143571

Draft pull request: wants to merge 2 commits into base: main
136 changes: 68 additions & 68 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1679,81 +1679,81 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
Elt = ConstantInt::get(*getContext(), Elt->getValue());

// In some cases the vector type is legal but the element type is illegal and
// needs to be promoted, for example v8i8 on ARM. In this case, promote the
// inserted value (the type does not need to match the vector element type).
// Any extra bits introduced will be truncated away.
if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypePromoteInteger) {
EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
APInt NewVal;
if (TLI->isSExtCheaperThanZExt(VT.getScalarType(), EltVT))
NewVal = Elt->getValue().sextOrTrunc(EltVT.getSizeInBits());
else
NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
Elt = ConstantInt::get(*getContext(), NewVal);
}
// In other cases the element type is illegal and needs to be expanded, for
// example v2i64 on MIPS32. In this case, find the nearest legal type, split
// the value into n parts and use a vector type with n-times the elements.
// Then bitcast to the type requested.
// Legalizing constants too early makes the DAGCombiner's job harder so we
// only legalize if the DAG tells us we must produce legal types.
else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypeExpandInteger) {
const APInt &NewVal = Elt->getValue();
EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();

// For scalable vectors, try to use a SPLAT_VECTOR_PARTS node.
if (VT.isScalableVector() ||
TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) {
assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 &&
"Can only handle an even split!");
unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;

SmallVector<SDValue, 2> ScalarParts;
for (unsigned i = 0; i != Parts; ++i)
ScalarParts.push_back(getConstant(
NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
ViaEltVT, isT, isO));

return getNode(ISD::SPLAT_VECTOR_PARTS, DL, VT, ScalarParts);
}
// thus when necessary we "legalise" the constant here so as to simplify the
// job of calling this function. NOTE: Only legalize when necessary so that
// we don't make DAGCombiner's job harder.
if (NewNodesMustHaveLegalTypes && VT.isVector()) {
// Promote the inserted value (the type does not need to match the vector
// element type). Any extra bits introduced will be truncated away.
if (TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypePromoteInteger) {
EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
APInt NewVal;
if (TLI->isSExtCheaperThanZExt(VT.getScalarType(), EltVT))
NewVal = Elt->getValue().sextOrTrunc(EltVT.getSizeInBits());
else
NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
Elt = ConstantInt::get(*getContext(), NewVal);
}
// For expansion we find the nearest legal type, split the value into n
// parts and use a vector type with n-times the elements. Then bitcast to
// the type requested.
else if (TLI->getTypeAction(*getContext(), EltVT) ==
TargetLowering::TypeExpandInteger) {
const APInt &NewVal = Elt->getValue();
EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();

// For scalable vectors, try to use a SPLAT_VECTOR_PARTS node.
if (VT.isScalableVector() ||
TLI->isOperationLegal(ISD::SPLAT_VECTOR, VT)) {
assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 &&
"Can only handle an even split!");
unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;

SmallVector<SDValue, 2> ScalarParts;
for (unsigned i = 0; i != Parts; ++i)
ScalarParts.push_back(getConstant(
NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
ViaEltVT, isT, isO));

return getNode(ISD::SPLAT_VECTOR_PARTS, DL, VT, ScalarParts);
}

unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);

// Check the temporary vector is the correct size. If this fails then
// getTypeToTransformTo() probably returned a type whose size (in bits)
// isn't a power-of-2 factor of the requested type size.
assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
// Check the temporary vector is the correct size. If this fails then
// getTypeToTransformTo() probably returned a type whose size (in bits)
// isn't a power-of-2 factor of the requested type size.
assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());

SmallVector<SDValue, 2> EltParts;
for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i)
EltParts.push_back(getConstant(
NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
ViaEltVT, isT, isO));
SmallVector<SDValue, 2> EltParts;
for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i)
EltParts.push_back(getConstant(
NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
ViaEltVT, isT, isO));

// EltParts is currently in little endian order. If we actually want
// big-endian order then reverse it now.
if (getDataLayout().isBigEndian())
std::reverse(EltParts.begin(), EltParts.end());
// EltParts is currently in little endian order. If we actually want
// big-endian order then reverse it now.
if (getDataLayout().isBigEndian())
std::reverse(EltParts.begin(), EltParts.end());

// The elements must be reversed when the element order is different
// to the endianness of the elements (because the BITCAST is itself a
// vector shuffle in this situation). However, we do not need any code to
// perform this reversal because getConstant() is producing a vector
// splat.
// This situation occurs in MIPS MSA.
// The elements must be reversed when the element order is different
// to the endianness of the elements (because the BITCAST is itself a
// vector shuffle in this situation). However, we do not need any code to
// perform this reversal because getConstant() is producing a vector
// splat.
// This situation occurs in MIPS MSA.

SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
llvm::append_range(Ops, EltParts);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
llvm::append_range(Ops, EltParts);

SDValue V =
getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
return V;
SDValue V =
getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
return V;
}
}

assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
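For context, here is a rough sketch of the behavioural difference the restructuring above makes. It is not part of the patch and assumes an AArch64-style target where v8i8 is a legal vector type but the i8 element type is promoted to i32, with DAG being a SelectionDAG for such a target:

```cpp
// Hypothetical illustration only.
SDLoc DL;
SDValue Splat = DAG.getConstant(1, DL, MVT::v8i8);
// Previously the element type was promoted unconditionally, so even before
// type legalisation the splat was built as a BUILD_VECTOR<v8i8> whose
// operands were Constant:i32<1> (the extra bits being truncated away later).
// With this change the operands remain Constant:i8<1> until
// NewNodesMustHaveLegalTypes is set, at which point the promote/expand
// handling above runs as before; DAGCombiner sees the un-legalised form in
// the meantime.
```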
22 changes: 2 additions & 20 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1791,26 +1791,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (const Constant *C = dyn_cast<Constant>(V)) {
EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);

if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
SDLoc DL = getCurSDLoc();

// DAG.getConstant() may attempt to legalise the vector constant which can
// significantly change the combines applied to the DAG. To reduce the
// divergence when enabling ConstantInt based vectors we try to construct
// the DAG in the same way as shufflevector based splats. TODO: The
// divergence sometimes leads to better optimisations. Ideally we should
// prevent DAG.getConstant() from legalising too early but there are some
// degradations preventing this.
if (VT.isScalableVector())
return DAG.getNode(
ISD::SPLAT_VECTOR, DL, VT,
DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
if (VT.isFixedLengthVector())
return DAG.getSplatBuildVector(
VT, DL,
DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
return DAG.getConstant(*CI, DL, VT);
}
if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
return DAG.getConstant(*CI, getCurSDLoc(), VT);

if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
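The block removed here was itself a workaround for that early legalisation, so it becomes redundant once getConstant() defers the work. A rough sketch of the intended equivalence, assuming VT is a scalable vector type and CI is the ConstantInt being visited (hypothetical, not part of the patch):

```cpp
SDLoc DL = getCurSDLoc();
// What the removed workaround built by hand for scalable vector types:
SDValue Workaround = DAG.getNode(
    ISD::SPLAT_VECTOR, DL, VT,
    DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
// What the simplified code now relies on: getConstant() itself produces an
// equivalent un-legalised splat, since it no longer legalises the element
// type until the DAG requires legal types.
SDValue Direct = DAG.getConstant(*CI, DL, VT);
```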
9 changes: 7 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45540,6 +45540,10 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS)))))
return SDValue();

// WIP: Fixes one of the failures but triggers more.
//if (isBitwiseNot(Op))
// return SDValue();

// If either operand was bitcast from DstVT, then perform logic with DstVT (at
// least one of the getBitcast() will fold away).
if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
@@ -48138,8 +48142,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Check if the first operand is all zeros and Cond type is vXi1.
// If this an avx512 target we can improve the use of zero masking by
// swapping the operands and inverting the condition.
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
Cond.hasOneUse() && Subtarget.hasAVX512() &&
CondVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -8,9 +8,9 @@ define <16 x i8> @div16xi8(<16 x i8> %x) {
; CHECK-SD-NEXT: movi v1.16b, #41
; CHECK-SD-NEXT: smull2 v2.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: uzp2 v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: sshr v0.16b, v0.16b, #2
; CHECK-SD-NEXT: usra v0.16b, v0.16b, #7
; CHECK-SD-NEXT: uzp2 v1.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: sshr v0.16b, v1.16b, #2
; CHECK-SD-NEXT: usra v0.16b, v1.16b, #7
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: div16xi8:
@@ -78,9 +78,9 @@ define <8 x i16> @div8xi16(<8 x i16> %x) {
; CHECK-SD-NEXT: smull2 v2.4s, v0.8h, v1.8h
; CHECK-SD-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; CHECK-SD-NEXT: add v0.8h, v1.8h, v0.8h
; CHECK-SD-NEXT: sshr v0.8h, v0.8h, #12
; CHECK-SD-NEXT: usra v0.8h, v0.8h, #15
; CHECK-SD-NEXT: add v1.8h, v1.8h, v0.8h
; CHECK-SD-NEXT: sshr v0.8h, v1.8h, #12
; CHECK-SD-NEXT: usra v0.8h, v1.8h, #15
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: div8xi16:
66 changes: 33 additions & 33 deletions llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -14,10 +14,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; CHECK-NEXT: mla v1.4h, v0.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
; CHECK-NEXT: adrp x8, .LCPI0_3
; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3]
; CHECK-NEXT: usra v1.4h, v1.4h, #15
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: sshl v2.4h, v1.4h, v2.4h
; CHECK-NEXT: usra v2.4h, v1.4h, #15
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_3]
; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
ret <4 x i16> %1
@@ -27,14 +27,14 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; CHECK-LABEL: fold_srem_vec_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #44151 // =0xac77
; CHECK-NEXT: movi v2.4h, #95
; CHECK-NEXT: movi v3.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v0.4h
; CHECK-NEXT: sshr v1.4h, v1.4h, #6
; CHECK-NEXT: usra v1.4h, v1.4h, #15
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: sshr v2.4h, v1.4h, #6
; CHECK-NEXT: usra v2.4h, v1.4h, #15
; CHECK-NEXT: mls v0.4h, v2.4h, v3.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
ret <4 x i16> %1
@@ -46,15 +46,15 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; CHECK-LABEL: combine_srem_sdiv:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #44151 // =0xac77
; CHECK-NEXT: movi v2.4h, #95
; CHECK-NEXT: movi v3.4h, #95
; CHECK-NEXT: dup v1.4h, w8
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v0.4h
; CHECK-NEXT: sshr v1.4h, v1.4h, #6
; CHECK-NEXT: usra v1.4h, v1.4h, #15
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: sshr v2.4h, v1.4h, #6
; CHECK-NEXT: usra v2.4h, v1.4h, #15
; CHECK-NEXT: mls v0.4h, v2.4h, v3.4h
; CHECK-NEXT: add v0.4h, v0.4h, v2.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
%2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -74,10 +74,10 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v0.4h
; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_2]
; CHECK-NEXT: usra v1.4h, v1.4h, #15
; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h
; CHECK-NEXT: sshl v2.4h, v1.4h, v2.4h
; CHECK-NEXT: usra v2.4h, v1.4h, #15
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_2]
; CHECK-NEXT: mls v0.4h, v2.4h, v1.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
ret <4 x i16> %1
@@ -91,14 +91,14 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; CHECK-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: adrp x8, .LCPI4_1
; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: and v2.8b, v0.8b, v2.8b
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_1]
; CHECK-NEXT: adrp x8, .LCPI4_2
; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ushr v2.4h, v1.4h, #15
; CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h
; CHECK-NEXT: mov v2.h[0], wzr
; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI4_2]
@@ -118,12 +118,12 @@ define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) {
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_0]
; CHECK-NEXT: adrp x8, .LCPI5_2
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI5_2]
; CHECK-NEXT: adrp x8, .LCPI5_3
; CHECK-NEXT: shrn v1.4h, v1.4s, #16
; CHECK-NEXT: mla v1.4h, v0.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_2]
; CHECK-NEXT: adrp x8, .LCPI5_3
; CHECK-NEXT: sshl v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ushr v2.4h, v1.4h, #15
; CHECK-NEXT: sshl v1.4h, v1.4h, v3.4h
; CHECK-NEXT: mov v2.h[0], wzr
; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_3]
@@ -181,13 +181,13 @@ define <16 x i8> @fold_srem_v16i8(<16 x i8> %x) {
; CHECK-LABEL: fold_srem_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.16b, #103
; CHECK-NEXT: movi v3.16b, #10
; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
; CHECK-NEXT: smull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: movi v2.16b, #10
; CHECK-NEXT: sshr v1.16b, v1.16b, #2
; CHECK-NEXT: usra v1.16b, v1.16b, #7
; CHECK-NEXT: mls v0.16b, v1.16b, v2.16b
; CHECK-NEXT: sshr v2.16b, v1.16b, #2
; CHECK-NEXT: usra v2.16b, v1.16b, #7
; CHECK-NEXT: mls v0.16b, v2.16b, v3.16b
; CHECK-NEXT: ret
%1 = srem <16 x i8> %x, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
ret <16 x i8> %1
@@ -199,8 +199,8 @@ define <8 x i8> @fold_srem_v8i8(<8 x i8> %x) {
; CHECK-NEXT: movi v1.8b, #103
; CHECK-NEXT: movi v2.8b, #10
; CHECK-NEXT: smull v1.8h, v0.8b, v1.8b
; CHECK-NEXT: shrn v1.8b, v1.8h, #8
; CHECK-NEXT: sshr v1.8b, v1.8b, #2
; CHECK-NEXT: sshr v1.8h, v1.8h, #10
; CHECK-NEXT: xtn v1.8b, v1.8h
; CHECK-NEXT: usra v1.8b, v1.8b, #7
; CHECK-NEXT: mls v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
@@ -212,14 +212,14 @@ define <8 x i16> @fold_srem_v8i16(<8 x i16> %x) {
; CHECK-LABEL: fold_srem_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #26215 // =0x6667
; CHECK-NEXT: movi v3.8h, #10
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT: movi v2.8h, #10
; CHECK-NEXT: sshr v1.8h, v1.8h, #2
; CHECK-NEXT: usra v1.8h, v1.8h, #15
; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
; CHECK-NEXT: sshr v2.8h, v1.8h, #2
; CHECK-NEXT: usra v2.8h, v1.8h, #15
; CHECK-NEXT: mls v0.8h, v2.8h, v3.8h
; CHECK-NEXT: ret
%1 = srem <8 x i16> %x, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
ret <8 x i16> %1
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -356,9 +356,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
; CHECK-LABEL: v16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.16b, #1
; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
ret <16 x i1> %z