Skip to content

Commit 7923fdc

Browse files
SamTebbs33 authored and tmsri committed
[AArch64][NEON][SVE] Lower mixed sign/zero extended partial reductions to usdot (llvm#107566)
This PR adds lowering for partial reductions of a mix of sign/zero extended inputs to the usdot intrinsic.
1 parent def587d commit 7923fdc

File tree

6 files changed

+304
-39
lines changed

6 files changed

+304
-39
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2701,6 +2701,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
27012701
MAKE_CASE(AArch64ISD::SADDLV)
27022702
MAKE_CASE(AArch64ISD::SDOT)
27032703
MAKE_CASE(AArch64ISD::UDOT)
2704+
MAKE_CASE(AArch64ISD::USDOT)
27042705
MAKE_CASE(AArch64ISD::SMINV)
27052706
MAKE_CASE(AArch64ISD::UMINV)
27062707
MAKE_CASE(AArch64ISD::SMAXV)
@@ -6114,6 +6115,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
61146115
return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
61156116
Op.getOperand(2), Op.getOperand(3));
61166117
}
6118+
case Intrinsic::aarch64_neon_usdot:
6119+
case Intrinsic::aarch64_sve_usdot: {
6120+
return DAG.getNode(AArch64ISD::USDOT, dl, Op.getValueType(),
6121+
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6122+
}
61176123
case Intrinsic::get_active_lane_mask: {
61186124
SDValue ID =
61196125
DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
@@ -21849,37 +21855,50 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
2184921855

2185021856
auto ExtA = MulOp->getOperand(0);
2185121857
auto ExtB = MulOp->getOperand(1);
21852-
bool IsSExt = ExtA->getOpcode() == ISD::SIGN_EXTEND;
21853-
bool IsZExt = ExtA->getOpcode() == ISD::ZERO_EXTEND;
21854-
if (ExtA->getOpcode() != ExtB->getOpcode() || (!IsSExt && !IsZExt))
21858+
21859+
if (!ISD::isExtOpcode(ExtA->getOpcode()) ||
21860+
!ISD::isExtOpcode(ExtB->getOpcode()))
2185521861
return SDValue();
21862+
bool AIsSigned = ExtA->getOpcode() == ISD::SIGN_EXTEND;
21863+
bool BIsSigned = ExtB->getOpcode() == ISD::SIGN_EXTEND;
2185621864

2185721865
auto A = ExtA->getOperand(0);
2185821866
auto B = ExtB->getOperand(0);
2185921867
if (A.getValueType() != B.getValueType())
2186021868
return SDValue();
2186121869

21862-
unsigned Opcode = 0;
21863-
21864-
if (IsSExt)
21865-
Opcode = AArch64ISD::SDOT;
21866-
else if (IsZExt)
21867-
Opcode = AArch64ISD::UDOT;
21868-
21869-
assert(Opcode != 0 && "Unexpected dot product case encountered.");
21870-
2187121870
EVT ReducedType = N->getValueType(0);
2187221871
EVT MulSrcType = A.getValueType();
2187321872

2187421873
// Dot products operate on chunks of four elements so there must be four times
2187521874
// as many elements in the wide type
21876-
if ((ReducedType == MVT::nxv4i32 && MulSrcType == MVT::nxv16i8) ||
21877-
(ReducedType == MVT::nxv2i64 && MulSrcType == MVT::nxv8i16) ||
21878-
(ReducedType == MVT::v4i32 && MulSrcType == MVT::v16i8) ||
21879-
(ReducedType == MVT::v2i32 && MulSrcType == MVT::v8i8))
21880-
return DAG.getNode(Opcode, DL, ReducedType, NarrowOp, A, B);
21875+
if (!(ReducedType == MVT::nxv4i32 && MulSrcType == MVT::nxv16i8) &&
21876+
!(ReducedType == MVT::nxv2i64 && MulSrcType == MVT::nxv8i16) &&
21877+
!(ReducedType == MVT::v4i32 && MulSrcType == MVT::v16i8) &&
21878+
!(ReducedType == MVT::v2i32 && MulSrcType == MVT::v8i8))
21879+
return SDValue();
2188121880

21882-
return SDValue();
21881+
// If the extensions are mixed, we should lower it to a usdot instead
21882+
unsigned Opcode = 0;
21883+
if (AIsSigned != BIsSigned) {
21884+
if (!Subtarget->hasMatMulInt8())
21885+
return SDValue();
21886+
21887+
bool Scalable = N->getValueType(0).isScalableVT();
21888+
// There's no nxv2i64 version of usdot
21889+
if (Scalable && ReducedType != MVT::nxv4i32)
21890+
return SDValue();
21891+
21892+
Opcode = AArch64ISD::USDOT;
21893+
// USDOT expects the signed operand to be last
21894+
if (!BIsSigned)
21895+
std::swap(A, B);
21896+
} else if (AIsSigned)
21897+
Opcode = AArch64ISD::SDOT;
21898+
else
21899+
Opcode = AArch64ISD::UDOT;
21900+
21901+
return DAG.getNode(Opcode, DL, ReducedType, NarrowOp, A, B);
2188321902
}
2188421903

2188521904
static SDValue performIntrinsicCombine(SDNode *N,

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,9 +280,10 @@ enum NodeType : unsigned {
280280
SADDLP,
281281
UADDLP,
282282

283-
// udot/sdot instructions
283+
// udot/sdot/usdot instructions
284284
UDOT,
285285
SDOT,
286+
USDOT,
286287

287288
// Vector across-lanes min/max
288289
// Only the lower result lane is defined.

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,7 @@ def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
857857

858858
def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>;
859859
def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>;
860+
def AArch64usdot : SDNode<"AArch64ISD::USDOT", SDT_AArch64Dot>;
860861

861862
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
862863
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
@@ -1419,8 +1420,8 @@ let Predicates = [HasMatMulInt8] in {
14191420
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
14201421
def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
14211422
def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
1422-
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
1423-
defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
1423+
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", AArch64usdot>;
1424+
defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", AArch64usdot>;
14241425

14251426
// sudot lane has a pattern where usdot is expected (there is no sudot).
14261427
// The second operand is used in the dup operation to repeat the indexed
@@ -1432,7 +1433,7 @@ class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
14321433
lhs_kind, rhs_kind, RegType, AccumType,
14331434
InputType, null_frag> {
14341435
let Pattern = [(set (AccumType RegType:$dst),
1435-
(AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
1436+
(AccumType (AArch64usdot (AccumType RegType:$Rd),
14361437
(InputType (bitconvert (AccumType
14371438
(AArch64duplane32 (v4i32 V128:$Rm),
14381439
VectorIndexS:$idx)))),

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3433,7 +3433,7 @@ let Predicates = [HasSVE, HasMatMulInt8] in {
34333433
} // End HasSVE, HasMatMulInt8
34343434

34353435
let Predicates = [HasSVEorSME, HasMatMulInt8] in {
3436-
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
3436+
defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", AArch64usdot>;
34373437
defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
34383438
defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
34393439
} // End HasSVEorSME, HasMatMulInt8

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 112 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT
3-
; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NODOT
2+
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
3+
; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
4+
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
45

56
define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
67
; CHECK-DOT-LABEL: udot:
@@ -102,7 +103,115 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
102103
ret <2 x i32> %partial.reduce
103104
}
104105

105-
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
106+
define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
107+
; CHECK-NOI8MM-LABEL: usdot:
108+
; CHECK-NOI8MM: // %bb.0:
109+
; CHECK-NOI8MM-NEXT: ushll v3.8h, v1.8b, #0
110+
; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0
111+
; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
112+
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
113+
; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
114+
; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h
115+
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
116+
; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h
117+
; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s
118+
; CHECK-NOI8MM-NEXT: ret
119+
;
120+
; CHECK-I8MM-LABEL: usdot:
121+
; CHECK-I8MM: // %bb.0:
122+
; CHECK-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b
123+
; CHECK-I8MM-NEXT: ret
124+
%u.wide = zext <16 x i8> %u to <16 x i32>
125+
%s.wide = sext <16 x i8> %s to <16 x i32>
126+
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
127+
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
128+
ret <4 x i32> %partial.reduce
129+
}
130+
131+
define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
132+
; CHECK-NOI8MM-LABEL: usdot_narrow:
133+
; CHECK-NOI8MM: // %bb.0:
134+
; CHECK-NOI8MM-NEXT: ushll v1.8h, v1.8b, #0
135+
; CHECK-NOI8MM-NEXT: sshll v2.8h, v2.8b, #0
136+
; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0
137+
; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h
138+
; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h
139+
; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8
140+
; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8
141+
; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
142+
; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8
143+
; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8
144+
; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h
145+
; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
146+
; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s
147+
; CHECK-NOI8MM-NEXT: ret
148+
;
149+
; CHECK-I8MM-LABEL: usdot_narrow:
150+
; CHECK-I8MM: // %bb.0:
151+
; CHECK-I8MM-NEXT: usdot v0.2s, v1.8b, v2.8b
152+
; CHECK-I8MM-NEXT: ret
153+
%u.wide = zext <8 x i8> %u to <8 x i32>
154+
%s.wide = sext <8 x i8> %s to <8 x i32>
155+
%mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
156+
%partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult)
157+
ret <2 x i32> %partial.reduce
158+
}
159+
160+
define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
161+
; CHECK-NOI8MM-LABEL: sudot:
162+
; CHECK-NOI8MM: // %bb.0:
163+
; CHECK-NOI8MM-NEXT: sshll v3.8h, v1.8b, #0
164+
; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0
165+
; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
166+
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
167+
; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
168+
; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h
169+
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
170+
; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h
171+
; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s
172+
; CHECK-NOI8MM-NEXT: ret
173+
;
174+
; CHECK-I8MM-LABEL: sudot:
175+
; CHECK-I8MM: // %bb.0:
176+
; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b
177+
; CHECK-I8MM-NEXT: ret
178+
%u.wide = sext <16 x i8> %u to <16 x i32>
179+
%s.wide = zext <16 x i8> %s to <16 x i32>
180+
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
181+
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
182+
ret <4 x i32> %partial.reduce
183+
}
184+
185+
define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
186+
; CHECK-NOI8MM-LABEL: sudot_narrow:
187+
; CHECK-NOI8MM: // %bb.0:
188+
; CHECK-NOI8MM-NEXT: sshll v1.8h, v1.8b, #0
189+
; CHECK-NOI8MM-NEXT: ushll v2.8h, v2.8b, #0
190+
; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0
191+
; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h
192+
; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h
193+
; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8
194+
; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8
195+
; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
196+
; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8
197+
; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8
198+
; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h
199+
; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
200+
; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s
201+
; CHECK-NOI8MM-NEXT: ret
202+
;
203+
; CHECK-I8MM-LABEL: sudot_narrow:
204+
; CHECK-I8MM: // %bb.0:
205+
; CHECK-I8MM-NEXT: usdot v0.2s, v2.8b, v1.8b
206+
; CHECK-I8MM-NEXT: ret
207+
%u.wide = sext <8 x i8> %u to <8 x i32>
208+
%s.wide = zext <8 x i8> %s to <8 x i32>
209+
%mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
210+
%partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<2 x i32> %acc, <8 x i32> %mult)
211+
ret <2 x i32> %partial.reduce
212+
}
213+
214+
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
106215
; CHECK-LABEL: not_udot:
107216
; CHECK: // %bb.0:
108217
; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b

0 commit comments

Comments
 (0)