[ARM] Perform lane interleaving from reductions.

davemgreen · davemgreen · commit f634096f6b7b · 2023-02-07T14:12:51.000Z
We have a pass for MVE to perform lane interleaving to make use of top/bottom instructions, that adds shuffles before extends and after truncates. This extends it to also start from add reductions, where the order of lanes does not matter so the shuffle is not needed. We need to be careful about not breaking the form of existing reductions, but otherwise can save some instructions and awkward extends. Differential Revision: https://reviews.llvm.org/D143396
diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
@@ -154,7 +154,6 @@ static bool isProfitableToInterleave(SmallSetVector<Instruction *, 4> &Exts,
 static bool tryInterleave(Instruction *Start,
                           SmallPtrSetImpl<Instruction *> &Visited) {
   LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n");
-  auto *VT = cast<FixedVectorType>(Start->getType());
 
   if (!isa<Instruction>(Start->getOperand(0)))
     return false;
@@ -165,6 +164,7 @@ static bool tryInterleave(Instruction *Start,
   Worklist.push_back(cast<Instruction>(Start->getOperand(0)));
 
   SmallSetVector<Instruction *, 4> Truncs;
+  SmallSetVector<Instruction *, 4> Reducts;
   SmallSetVector<Instruction *, 4> Exts;
   SmallSetVector<Use *, 4> OtherLeafs;
   SmallSetVector<Instruction *, 4> Ops;
@@ -198,6 +198,13 @@ static bool tryInterleave(Instruction *Start,
       if (!II)
         return false;
 
+      if (II->getIntrinsicID() == Intrinsic::vector_reduce_add) {
+        if (!Reducts.insert(I))
+          continue;
+        Visited.insert(I);
+        break;
+      }
+
       switch (II->getIntrinsicID()) {
       case Intrinsic::abs:
       case Intrinsic::smin:
@@ -267,21 +274,32 @@ static bool tryInterleave(Instruction *Start,
     return false;
 
   LLVM_DEBUG({
-    dbgs() << "Found group:\n  Exts:";
+    dbgs() << "Found group:\n  Exts:\n";
     for (auto *I : Exts)
       dbgs() << "  " << *I << "\n";
-    dbgs() << "  Ops:";
+    dbgs() << "  Ops:\n";
     for (auto *I : Ops)
       dbgs() << "  " << *I << "\n";
-    dbgs() << "  OtherLeafs:";
+    dbgs() << "  OtherLeafs:\n";
     for (auto *I : OtherLeafs)
       dbgs() << "  " << *I->get() << " of " << *I->getUser() << "\n";
-    dbgs() << "Truncs:";
+    dbgs() << "  Truncs:\n";
     for (auto *I : Truncs)
       dbgs() << "  " << *I << "\n";
+    dbgs() << "  Reducts:\n";
+    for (auto *I : Reducts)
+      dbgs() << "  " << *I << "\n";
   });
 
-  assert(!Truncs.empty() && "Expected some truncs");
+  assert((!Truncs.empty() || !Reducts.empty()) &&
+         "Expected some truncs or reductions");
+  if (Truncs.empty() && Exts.empty())
+    return false;
+
+  auto *VT = !Truncs.empty()
+                 ? cast<FixedVectorType>(Truncs[0]->getType())
+                 : cast<FixedVectorType>(Exts[0]->getOperand(0)->getType());
+  LLVM_DEBUG(dbgs() << "Using VT:" << *VT << "\n");
 
   // Check types
   unsigned NumElts = VT->getNumElements();
@@ -311,6 +329,14 @@ static bool tryInterleave(Instruction *Start,
   // Check that it looks beneficial
   if (!isProfitableToInterleave(Exts, Truncs))
     return false;
+  if (!Reducts.empty() && (Ops.empty() || all_of(Ops, [](Instruction *I) {
+                             return I->getOpcode() == Instruction::Mul ||
+                                    I->getOpcode() == Instruction::Select ||
+                                    I->getOpcode() == Instruction::ICmp;
+                           }))) {
+    LLVM_DEBUG(dbgs() << "Reduction does not look profitable\n");
+    return false;
+  }
 
   // Create new shuffles around the extends / truncs / other leaves.
   IRBuilder<> Builder(Start);
@@ -367,6 +393,14 @@ static bool tryInterleave(Instruction *Start,
   return true;
 }
 
+// Add reductions are fairly common and associative, meaning we can start the
+// interleaving from them and don't need to emit a shuffle.
+static bool isAddReduction(Instruction &I) {
+  if (auto *II = dyn_cast<IntrinsicInst>(&I))
+    return II->getIntrinsicID() == Intrinsic::vector_reduce_add;
+  return false;
+}
+
 bool MVELaneInterleaving::runOnFunction(Function &F) {
   if (!EnableInterleave)
     return false;
@@ -380,8 +414,10 @@ bool MVELaneInterleaving::runOnFunction(Function &F) {
 
   SmallPtrSet<Instruction *, 16> Visited;
   for (Instruction &I : reverse(instructions(F))) {
-    if (I.getType()->isVectorTy() &&
-        (isa<TruncInst>(I) || isa<FPTruncInst>(I)) && !Visited.count(&I))
+    if (((I.getType()->isVectorTy() &&
+          (isa<TruncInst>(I) || isa<FPTruncInst>(I))) ||
+         isAddReduction(I)) &&
+        !Visited.count(&I))
       Changed |= tryInterleave(&I, Visited);
   }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
@@ -4,23 +4,12 @@
 define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_mul(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: reduce_v16i16_shift_mul:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    add r0, sp, #16
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
-; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
-; CHECK-NEXT:    vldrb.u16 q2, [r1]
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vmullt.u8 q2, q0, q1
+; CHECK-NEXT:    vmullb.u8 q0, q0, q1
+; CHECK-NEXT:    vshr.s16 q2, q2, #14
 ; CHECK-NEXT:    vshr.s16 q0, q0, #14
-; CHECK-NEXT:    vmul.i16 q1, q2, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
-; CHECK-NEXT:    vshr.s16 q1, q1, #14
-; CHECK-NEXT:    vaddva.u16 r0, q1
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vaddv.u16 r0, q2
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>
@@ -50,23 +39,16 @@ entry:
 define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_sub(<16 x i8> %s0, <16 x i8> %s1) {
 ; CHECK-LABEL: reduce_v16i16_shift_sub:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    add r0, sp, #16
-; CHECK-NEXT:    mov r1, sp
-; CHECK-NEXT:    vstrw.32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
-; CHECK-NEXT:    vldrb.u16 q1, [r1, #8]
-; CHECK-NEXT:    vldrb.u16 q2, [r1]
-; CHECK-NEXT:    vsub.i16 q0, q1, q0
-; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vmovlt.u8 q2, q1
+; CHECK-NEXT:    vmovlt.u8 q3, q0
+; CHECK-NEXT:    vsub.i16 q2, q3, q2
+; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vshr.s16 q2, q2, #14
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vaddv.u16 r0, q2
 ; CHECK-NEXT:    vshr.s16 q0, q0, #14
-; CHECK-NEXT:    vsub.i16 q1, q2, q1
-; CHECK-NEXT:    vaddv.u16 r0, q0
-; CHECK-NEXT:    vshr.s16 q1, q1, #14
-; CHECK-NEXT:    vaddva.u16 r0, q1
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vaddva.u16 r0, q0
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <16 x i8> %s0 to <16 x i16>
@@ -190,17 +172,15 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef
 ; CHECK-NEXT:  .LBB4_8: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrh.s32 q2, [r5], #16
-; CHECK-NEXT:    vldrh.s32 q1, [r4], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r4], #16
+; CHECK-NEXT:    vldrh.u16 q2, [r5], #16
 ; CHECK-NEXT:    rsb.w r1, r12, #0
-; CHECK-NEXT:    vmul.i32 q1, q2, q1
-; CHECK-NEXT:    vldrh.s32 q2, [r4, #-8]
-; CHECK-NEXT:    vldrh.s32 q3, [r5, #-8]
+; CHECK-NEXT:    vmullb.s16 q3, q2, q1
+; CHECK-NEXT:    vmullt.s16 q1, q2, q1
+; CHECK-NEXT:    vshl.s32 q3, r1
 ; CHECK-NEXT:    vshl.s32 q1, r1
+; CHECK-NEXT:    vaddva.u32 r6, q3
 ; CHECK-NEXT:    vaddva.u32 r6, q1
-; CHECK-NEXT:    vmul.i32 q2, q3, q2
-; CHECK-NEXT:    vshl.s32 q2, r1
-; CHECK-NEXT:    vaddva.u32 r6, q2
 ; CHECK-NEXT:    le lr, .LBB4_8
 ; CHECK-NEXT:  @ %bb.9: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1