llvm
diff --git a/‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Lines changed: 119 additions & 45 deletions b/‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Lines changed: 119 additions & 45 deletions
diff --git a/‎llvm/lib/Target/AArch64/AArch64InstrInfo.td
Lines changed: 6 additions & 10 deletions b/‎llvm/lib/Target/AArch64/AArch64InstrInfo.td
Lines changed: 6 additions & 10 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
Lines changed: 8 additions & 13 deletions b/‎llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
Lines changed: 8 additions & 13 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
Lines changed: 2 additions & 3 deletions b/‎llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
Lines changed: 2 additions & 3 deletions
@@ -11369,54 +11369,115 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   return true;
 }
 
-// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
-// v4i32s. This is really a truncate, which we can construct out of (legal)
-// concats and truncate nodes.
-static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
-  if (V.getValueType() != MVT::v16i8)
-    return SDValue();
-  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
-
-  for (unsigned X = 0; X < 4; X++) {
-    // Check the first item in each group is an extract from lane 0 of a v4i32
-    // or v4i16.
-    SDValue BaseExt = V.getOperand(X * 4);
-    if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
-         BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
-        !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
-        BaseExt.getConstantOperandVal(1) != 0)
+// Detect BUILD_VECTORs like a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3
+// that extract every lane, in order, of a run of legal 64/128-bit vectors.
+// These are truncates, which we can construct out of (legal) concats and
+// truncate nodes. Returns an empty SDValue if V does not match.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V,
+                                                  SelectionDAG &DAG) {
+  EVT BVTy = V.getValueType();
+  if (BVTy != MVT::v16i8 && BVTy != MVT::v8i16 && BVTy != MVT::v8i8 &&
+      BVTy != MVT::v4i16)
+    return SDValue();
+
+  // Only handle truncating BVs.
+  if (V.getOperand(0).getValueType().getSizeInBits() ==
+      BVTy.getScalarSizeInBits())
+    return SDValue();
+
+  SmallVector<SDValue, 4> Sources;
+  uint64_t LastIdx = 0;
+  // Check for sequential indices e.g. i=0, i+1, ..., i=0, i+1, ...
+  for (SDValue Extr : V->ops()) {
+    // Do not rely on the caller having vetted the operands.
+    if (Extr.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Extr.getOperand(1)))
       return SDValue();
-    SDValue Base = BaseExt.getOperand(0);
-    // And check the other items are extracts from the same vector.
-    for (unsigned Y = 1; Y < 4; Y++) {
-      SDValue Ext = V.getOperand(X * 4 + Y);
-      if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-          Ext.getOperand(0) != Base ||
-          !isa<ConstantSDNode>(Ext.getOperand(1)) ||
-          Ext.getConstantOperandVal(1) != Y)
+
+    SDValue SourceVec = Extr.getOperand(0);
+    EVT SourceVecTy = SourceVec.getValueType();
+    // Only legal 64-bit (concatenated as-is) and 128-bit (truncated first)
+    // fixed-length sources are handled by the reconstruction below.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(SourceVecTy) ||
+        (!SourceVecTy.is64BitVector() && !SourceVecTy.is128BitVector()))
+      return SDValue();
+
+    uint64_t CurIdx = Extr.getConstantOperandVal(1);
+    if (CurIdx == 0) {
+      // Index 0 starts a new source (repeats are allowed); the previous one
+      // must have had all of its lanes used by the BV.
+      if (!Sources.empty() &&
+          Sources.back().getValueType().getVectorNumElements() != LastIdx + 1)
         return SDValue();
-    }
+      Sources.push_back(SourceVec);
+    } else if (Sources.empty() || SourceVec != Sources.back() ||
+               CurIdx != LastIdx + 1) {
+      // Any other lane must be the next lane of the current source.
+      return SDValue();
+    }
+
+    LastIdx = CurIdx;
   }
 
-  // Turn the buildvector into a series of truncates and concates, which will
-  // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
-  // concat together to produce 2 v8i16. These are both truncated and concat
-  // together.
+  // Check if all lanes of the last source are used by the BV.
+  if (Sources.back().getValueType().getVectorNumElements() != LastIdx + 1)
+    return SDValue();
+  // Sources are merged pairwise, so there must be a power of two of them (and
+  // at least two).
+  if (Sources.size() < 2 || (Sources.size() & (Sources.size() - 1)) != 0)
+    return SDValue();
+
+  // At this point we know that we have a truncating BV of extract_vector_elt.
+  // We can just truncate and concat them.
   SDLoc DL(V);
-  SDValue Trunc[4] = {
-      V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
-      V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
-  for (SDValue &V : Trunc)
-    if (V.getValueType() == MVT::v4i32)
-      V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
-  SDValue Concat0 =
-      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
-  SDValue Concat1 =
-      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
-  SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
-  SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
+  LLVMContext &Ctx = *DAG.getContext();
+  // Narrow a 128-bit vector to half-width elements; leave 64-bit ones as-is.
+  auto Narrow = [&](SDValue Vec) {
+    EVT VecTy = Vec.getValueType();
+    if (!VecTy.is128BitVector())
+      return Vec;
+    EVT HalfTy = VecTy.changeVectorElementType(
+        VecTy.getVectorElementType().getHalfSizedIntegerVT(Ctx));
+    return DAG.getNode(ISD::TRUNCATE, DL, HalfTy, Vec);
+  };
+
+  // Merge neighbouring sources until only the final pair is left.
+  while (Sources.size() > 2) {
+    for (unsigned i = 0; i < Sources.size(); i += 2) {
+      SDValue V1 = Narrow(Sources[i]);
+      SDValue V2 = Narrow(Sources[i + 1]);
+      EVT VT1 = V1.getValueType();
+      // Mismatched halves cannot be concatenated.
+      if (VT1 != V2.getValueType())
+        return SDValue();
+      Sources[i / 2] =
+          DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                      VT1.getDoubleNumVectorElementsVT(Ctx), V1, V2);
+    }
+    Sources.resize(Sources.size() / 2);
+  }
+
+  SDValue V1 = Narrow(Sources[0]);
+  SDValue V2 = Narrow(Sources[1]);
+  EVT VT1 = V1.getValueType();
+  if (VT1 != V2.getValueType())
+    return SDValue();
+  if (VT1.getDoubleNumVectorElementsVT(Ctx) == BVTy)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, BVTy, V1, V2);
+
+  // We might not have the final type in some cases e.g. <4i32, 4i32> -> 8i8. Do
+  // a final truncating shuffle instead of a concat + trunc. Shuffle the halves
+  // themselves rather than the operands of a CONCAT_VECTORS node, because
+  // getNode may fold such a node into something else.
+  unsigned CastOpc = DAG.getDataLayout().isLittleEndian() ? ISD::BITCAST
+                                                          : AArch64ISD::NVCAST;
+  V1 = DAG.getNode(CastOpc, DL, BVTy, V1);
+  V2 = DAG.getNode(CastOpc, DL, BVTy, V2);
+
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i < BVTy.getVectorNumElements() * 2; i += 2)
+    MaskVec.push_back(i);
+  return DAG.getVectorShuffle(BVTy, DL, V1, V2, MaskVec);
 }
 
 /// Check if a vector shuffle corresponds to a DUP instructions with a larger
@@ -13305,8 +13356,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
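-  // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
-  // v4i32s. This is really a truncate, which we can construct out of (legal)
-  // concats and truncate nodes.
+  // Detect patterns like a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 that
+  // use every lane of their source vectors. This is really a truncate, which
+  // we can construct out of (legal) concats and truncate nodes.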
-  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
-    return M;
+  if (AllLanesExtractElt)
+    if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+      return M;
 
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
@@ -19096,6 +19148,28 @@ static SDValue performBuildVectorCombine(SDNode *N,
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
+  //    BUILD_VECTOR (extract_elt(Assert[S|Z]ext(x)))
+  // => BUILD_VECTOR (extract_elt(x))
+  SmallVector<SDValue, 8> Ops;
+  bool ExtractExtended = false;
+  for (SDValue Extr : N->ops()) {
+    if (Extr.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      ExtractExtended = false;
+      break;
+    }
+    SDValue ExtractBase = Extr.getOperand(0);
+    if (ExtractBase.getOpcode() == ISD::AssertSext ||
+        ExtractBase.getOpcode() == ISD::AssertZext) {
+      ExtractExtended = true;
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                Extr.getValueType(), ExtractBase.getOperand(0),
+                                Extr.getOperand(1)));
+    } else
+      Ops.push_back(Extr);
+  }
+  if (ExtractExtended)
+    return DAG.getBuildVector(VT, DL, Ops);
+
   // A build vector of two extracted elements is equivalent to an
   // extract subvector where the inner vector is any-extended to the
   // extract_vector_elt VT.
 
@@ -7595,17 +7595,13 @@ defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
 defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
                 TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
 
-def VImm0080:         PatLeaf<(AArch64movi_shift (i32 128), (i32 0))>;
-def VImm00008000:     PatLeaf<(AArch64movi_shift (i32 128), (i32 8))>;
-def VImm0000000080000000: PatLeaf<(AArch64NvCast (v2f64 (fneg (AArch64NvCast (v4i32 (AArch64movi_shift (i32 128), (i32 24)))))))>;
-
 // RADDHN patterns for when RSHRN shifts by half the size of the vector element
-def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))),
+def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))),
           (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
-def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))),
+def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))),
           (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
 let AddedComplexity = 5 in
-def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))),
+def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))),
           (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
 def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
           (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
@@ -7617,20 +7613,20 @@ def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
 // RADDHN2 patterns for when RSHRN shifts by half the size of the vector element
 def : Pat<(v16i8 (concat_vectors
                  (v8i8 V64:$Vd),
-                 (v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))))),
+                 (v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))))),
           (RADDHNv8i16_v16i8
                  (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
                  (v8i16 (MOVIv2d_ns (i32 0))))>;
 def : Pat<(v8i16 (concat_vectors
                  (v4i16 V64:$Vd),
-                 (v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))))),
+                 (v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))))),
           (RADDHNv4i32_v8i16
                  (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
                  (v4i32 (MOVIv2d_ns (i32 0))))>;
 let AddedComplexity = 5 in
 def : Pat<(v4i32 (concat_vectors
                  (v2i32 V64:$Vd),
-                 (v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))))),
+                 (v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))))),
           (RADDHNv2i64_v4i32
                  (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
                  (v2i64 (MOVIv2d_ns (i32 0))))>;
 
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x double>, ptr %ptr
   %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -72,9 +68,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
 
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs.4s v1, v1
 ; CHECK-NEXT:    fcvtzs.4s v0, v0
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    uzp1.8b v0, v0, v1
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
 entry: