Remove nxv4i64 case

SamTebbs33 · SamTebbs33 · commit f6c58393ddc0 · 2024-08-29T11:21:13.000+01:00
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21795,35 +21795,6 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   if (A.getValueType() != B.getValueType())
     return SDValue();
 
-  // The fully-reduced type. Should be a vector of i32 or i64
-  EVT FullType = N->getValueType(0);
-  // The type that is extended to the wide type. Should be an i8 or i16
-  EVT ExtendedType = A.getValueType();
-  // The wide type with four times as many elements as the reduced type. Should
-  // be a vector of i32 or i64, the same as the fully-reduced type
-  EVT WideType = MulOp.getValueType();
-  if (WideType.getScalarSizeInBits() != FullType.getScalarSizeInBits())
-    return SDValue();
-  // Dot products operate on chunks of four elements so there must be four times
-  // as many elements in the wide type
-  if (WideType.getVectorMinNumElements() / FullType.getVectorMinNumElements() !=
-      4)
-    return SDValue();
-  switch (FullType.getScalarSizeInBits()) {
-  case 32:
-    if (ExtendedType.getScalarSizeInBits() != 8)
-      return SDValue();
-    break;
-  case 64:
-    // i8 to i64 can be done with an extended i32 dot product
-    if (ExtendedType.getScalarSizeInBits() != 8 &&
-        ExtendedType.getScalarSizeInBits() != 16)
-      return SDValue();
-    break;
-  default:
-    return SDValue();
-  }
-
   unsigned DotIntrinsicId = Intrinsic::not_intrinsic;
 
   if (IsSExt)
@@ -21834,33 +21805,31 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   assert(DotIntrinsicId != Intrinsic::not_intrinsic &&
          "Unexpected dot product case encountered.");
 
-  EVT Type = NarrowOp.getValueType();
+  auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
 
-  // 8 bit input to 64 bit output can be done by doing a 32 bit dot product
-  // and extending the output
-  bool Extend = A->getValueType(0).getScalarSizeInBits() == 8 &&
-                Type.getScalarSizeInBits() == 64;
-  SDValue Accumulator = NarrowOp;
-  if (Extend) {
-    Type =
-        Type.changeVectorElementType(EVT::getIntegerVT(*DAG.getContext(), 32));
-    // The accumulator is of the wider type so we insert a 0 accumulator and
-    // add the proper one after extending
-    Accumulator = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv4i32,
-                              DAG.getConstant(0, DL, MVT::i32));
-  }
+  // The fully-reduced type. Should be a vector of i32 or i64
+  EVT ReducedType = N->getValueType(0);
+  // The type that is extended to the wide type. Should be a vector of i8 or i16
+  EVT ExtendedType = A.getValueType();
+  // The wide type with four times as many elements as the reduced type. Should
+  // be a vector of i32 or i64, the same as the fully-reduced type
+  EVT WideType = MulOp.getValueType();
+  if (WideType.getScalarSizeInBits() != ReducedType.getScalarSizeInBits())
+    return SDValue();
 
-  auto IntrinsicId = DAG.getConstant(DotIntrinsicId, DL, MVT::i64);
-  auto DotProduct = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Type,
-                                {IntrinsicId, Accumulator, A, B});
-  if (Extend) {
-    auto Extended = DAG.getNode(IsZExt ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
-                                DL, NarrowOp.getValueType(), {DotProduct});
-    auto AccAdd = DAG.getNode(ISD::ADD, DL, NarrowOp.getValueType(),
-                              {NarrowOp, Extended});
-    DotProduct = AccAdd;
-  }
-  return DotProduct;
+  // Dot products operate on chunks of four elements so there must be four times
+  // as many elements in the wide type
+  if (WideType == MVT::nxv16i32 && ReducedType == MVT::nxv4i32 &&
+      ExtendedType == MVT::nxv16i8)
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv4i32,
+                       {IntrinsicId, NarrowOp, A, B});
+
+  if (WideType == MVT::nxv8i64 && ReducedType == MVT::nxv2i64 &&
+      ExtendedType == MVT::nxv8i16)
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i64,
+                       {IntrinsicId, NarrowOp, A, B});
+
+  return SDValue();
 }
 
 static SDValue performIntrinsicCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/partial-reduce-dot-product.ll
@@ -61,78 +61,6 @@ entry:
   ret <vscale x 2 x i64> %partial.reduce
 }
 
-define <vscale x 4 x i64> @dotp_8to64(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: dotp_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    udot z2.s, z0.b, z1.b
-; CHECK-NEXT:    uunpklo z0.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z2.s
-; CHECK-NEXT:    ret
-entry:
-  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
-  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
-  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(
-  <vscale x 4 x i64> zeroinitializer, <vscale x 16 x i64> %mult)
-  ret <vscale x 4 x i64> %partial.reduce
-}
-
-define <vscale x 4 x i64> @dotp_sext_8to64(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: dotp_sext_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #0 // =0x0
-; CHECK-NEXT:    sdot z2.s, z0.b, z1.b
-; CHECK-NEXT:    sunpklo z0.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z2.s
-; CHECK-NEXT:    ret
-entry:
-  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
-  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
-  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(
-  <vscale x 4 x i64> zeroinitializer, <vscale x 16 x i64> %mult)
-  ret <vscale x 4 x i64> %partial.reduce
-}
-
-define <vscale x 4 x i64> @dotp_8to64_accumulator(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i64> %acc) {
-; CHECK-LABEL: dotp_8to64_accumulator:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEXT:    udot z4.s, z0.b, z1.b
-; CHECK-NEXT:    uunpklo z0.d, z4.s
-; CHECK-NEXT:    uunpkhi z1.d, z4.s
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEXT:    ret
-entry:
-  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
-  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
-  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(
-  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
-  ret <vscale x 4 x i64> %partial.reduce
-}
-
-define <vscale x 4 x i64> @dotp_sext_8to64_accumulator(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 4 x i64> %acc) {
-; CHECK-LABEL: dotp_sext_8to64_accumulator:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEXT:    sdot z4.s, z0.b, z1.b
-; CHECK-NEXT:    sunpklo z0.d, z4.s
-; CHECK-NEXT:    sunpkhi z1.d, z4.s
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEXT:    ret
-entry:
-  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
-  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
-  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
-  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(
-  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
-  ret <vscale x 4 x i64> %partial.reduce
-}
-
 define <vscale x 4 x i32> @not_dotp(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
 ; CHECK-LABEL: not_dotp:
 ; CHECK:       // %bb.0: // %entry