
Commit 167747a

davemgreen authored and fhahn committed
[AArch64] Sink splat shuffles to lane index intrinsics
This teaches AArch64TargetLowering::shouldSinkOperands to sink splat shuffles to certain neon intrinsics, so that they can make use of the lane variants of the instructions that are available.

Differential Revision: https://reviews.llvm.org/D112994

(cherry-picked from 760d4d0)
1 parent 3016fd1 · commit 167747a
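In effect, when one of these intrinsics has a splat-shuffle operand (a shufflevector whose mask repeats a single lane), the operand is now reported to CodeGenPrepare for sinking into the intrinsic's block, where instruction selection can fold it into the by-element (lane-indexed) form of the instruction instead of keeping a separate dup. A minimal sketch of the kind of IR that benefits (function and value names are illustrative, not taken from the commit):

declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)

; The splat of lane 3 is loop-invariant, so it is defined outside the loop.
; Without sinking it is selected as a standalone dup before the loop; with
; %splat sunk into %loop, ISel can select "sqdmulh v2.4s, v2.4s, v1.s[3]".
define <4 x i32> @sqdmulh_by_lane(<4 x i32> %x, <4 x i32>* %p, i32 %n) {
entry:
  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
  %v = load <4 x i32>, <4 x i32>* %p
  %mul = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %v, <4 x i32> %splat)
  %acc.next = add <4 x i32> %acc, %mul
  %iv.next = add i32 %iv, 1
  %done = icmp eq i32 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret <4 x i32> %acc.next
}

The sinksplat.ll changes below show exactly this effect on the generated assembly.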

3 files changed: 63 additions, 19 deletions


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 23 additions & 5 deletions
@@ -12107,6 +12107,12 @@ static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
   return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
 }
 
+static bool isSplatShuffle(Value *V) {
+  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
+    return is_splat(Shuf->getShuffleMask());
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -12117,12 +12123,24 @@ bool AArch64TargetLowering::shouldSinkOperands(
 
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
     switch (II->getIntrinsicID()) {
+    case Intrinsic::aarch64_neon_smull:
     case Intrinsic::aarch64_neon_umull:
-      if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
-        return false;
-      Ops.push_back(&II->getOperandUse(0));
-      Ops.push_back(&II->getOperandUse(1));
-      return true;
+      if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
+        Ops.push_back(&II->getOperandUse(0));
+        Ops.push_back(&II->getOperandUse(1));
+        return true;
+      }
+      LLVM_FALLTHROUGH;
+
+    case Intrinsic::aarch64_neon_sqdmull:
+    case Intrinsic::aarch64_neon_sqdmulh:
+    case Intrinsic::aarch64_neon_sqrdmulh:
+      // Sink splats for index lane variants
+      if (isSplatShuffle(II->getOperand(0)))
+        Ops.push_back(&II->getOperandUse(0));
+      if (isSplatShuffle(II->getOperand(1)))
+        Ops.push_back(&II->getOperandUse(1));
+      return !Ops.empty();
 
     case Intrinsic::aarch64_neon_pmull64:
       if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
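Note the LLVM_FALLTHROUGH: for smull/umull the pre-existing extract-shuffle (high-half) pattern is tried first, and only when it does not match do the operands fall through to the splat checks shared with sqdmull/sqdmulh/sqrdmulh. Each operand is tested independently and the hook returns !Ops.empty(), so a single splatted operand is enough to trigger sinking. A hedged IR sketch of that single-operand case (hypothetical names; only operand 1 is a splat):

declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

; %b.splat is defined in a different block from the call, so CodeGenPrepare
; consults shouldSinkOperands. Only the second operand is a splat shuffle;
; sinking it lets ISel select the lane form "umull v0.4s, v0.4h, v1.h[0]".
define <4 x i32> @umull_one_splat(<4 x i16> %a, <4 x i16> %b) {
entry:
  %b.splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
  br label %use

use:
  %r = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b.splat)
  ret <4 x i32> %r
}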

llvm/test/CodeGen/AArch64/sinksplat.ll

Lines changed: 6 additions & 12 deletions
@@ -7,12 +7,11 @@ define <4 x i32> @smull(<4 x i16> %x, <4 x i16> *%y) {
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB0_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    b.eq .LBB0_1
 ; CHECK-NEXT:    // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -40,12 +39,11 @@ define <4 x i32> @umull(<4 x i16> %x, <4 x i16> *%y) {
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB1_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    b.eq .LBB1_1
 ; CHECK-NEXT:    // %bb.2: // %l2
 ; CHECK-NEXT:    ret
@@ -73,12 +71,11 @@ define <4 x i32> @sqadd(<4 x i32> %x, <4 x i32> *%y) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB2_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    sqadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB2_1
 ; CHECK-NEXT:    // %bb.2: // %l2
@@ -107,12 +104,11 @@ define <4 x i32> @sqsub(<4 x i32> %x, <4 x i32> *%y) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB3_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqrdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    sqsub v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB3_1
 ; CHECK-NEXT:    // %bb.2: // %l2
@@ -141,12 +137,11 @@ define <4 x i32> @sqdmulh(<4 x i32> %x, <4 x i32> *%y) {
 ; CHECK-NEXT:    mov v1.16b, v0.16b
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4s, v1.s[3]
 ; CHECK-NEXT:  .LBB4_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr q2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqdmulh v2.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sqdmulh v2.4s, v2.4s, v1.s[3]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB4_1
 ; CHECK-NEXT:    // %bb.2: // %l2
@@ -175,12 +170,11 @@ define <4 x i32> @sqdmull(<4 x i16> %x, <4 x i16> *%y) {
 ; CHECK-NEXT:    fmov d1, d0
 ; CHECK-NEXT:    mov w8, #1
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    dup v1.4h, v1.h[3]
 ; CHECK-NEXT:  .LBB5_1: // %l1
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr d2, [x0]
 ; CHECK-NEXT:    subs w8, w8, #1
-; CHECK-NEXT:    sqdmull v2.4s, v2.4h, v1.4h
+; CHECK-NEXT:    sqdmull v2.4s, v2.4h, v1.h[3]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    b.eq .LBB5_1
 ; CHECK-NEXT:    // %bb.2: // %l2

llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll

Lines changed: 34 additions & 2 deletions
@@ -150,6 +150,38 @@ if.else:
   ret <8 x i16> %vmull1
 }
 
+; The masks used are suitable for umull, sink shufflevector to users.
+define <8 x i16> @sink_shufflevector_smull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_smull(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL0]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT:    ret <8 x i16> [[VMULL1]]
+;
+entry:
+  %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  ret <8 x i16> %vmull0
+
+if.else:
+  %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  ret <8 x i16> %vmull1
+}
+
 ; Both exts and their shufflevector operands can be sunk.
 define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: @sink_shufflevector_ext_subadd(
@@ -271,8 +303,8 @@ if.else:
 }
 
 
-; Function Attrs: nounwind readnone
-declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
 
 ; The insertelement should be inserted before shufflevector, otherwise 'does not dominate all uses' error will occur.
 define <4 x i32> @sink_insertelement(i16 %e, i8 %f) {
