[RISCV][SelectionDAG] Lower shuffles as bitrotates with vror.vi when possible

lukel97 · lukel97 · commit a61c4a0ef6f9 · 2023-08-30T11:01:47.000+01:00
Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can reinterpret it as a shuffle of v2i32 where the two i32s are bit rotated, and lower it as a vror.vi (if legal with zvbb enabled). We also need to make sure that the larger element type is a valid SEW, hence the tests for zve32x. X86 already did this, so I've extracted the logic for it and put it inside ShuffleVectorSDNode so it could be reused by RISC-V. I originally tried to add this as a generic combine in DAGCombiner.cpp, but it ended up causing worse codegen on X86 and PPC. Reviewed By: reames, pengfei Differential Revision: https://reviews.llvm.org/D157417
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
@@ -2444,6 +2444,21 @@ class ShuffleVectorInst : public Instruction {
     return isInterleaveMask(Mask, Factor, NumInputElts, StartIndexes);
   }
 
+  /// Checks if the shuffle is a bit rotation of the first operand across
+  /// multiple subelements, e.g:
+  ///
+  /// shuffle <8 x i8> %a, <8 x i8> poison, <8 x i32> <1, 0, 3, 2, 5, 4, 7, 6>
+  ///
+  /// could be expressed as
+  ///
+  /// rotl <4 x i16> %a, 8
+  ///
+  /// If it can be expressed as a rotation, returns the number of subelements to
+  /// group by in NumSubElts and the number of bits to rotate left in RotateAmt.
+  static bool isBitRotateMask(ArrayRef<int> Mask, unsigned EltSizeInBits,
+                              unsigned MinSubElts, unsigned MaxSubElts,
+                              unsigned &NumSubElts, unsigned &RotateAmt);
+
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::ShuffleVector;
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
@@ -2833,6 +2833,45 @@ bool ShuffleVectorInst::isInterleaveMask(
   return true;
 }
 
+/// Try to lower a vector shuffle as a bit rotation.
+///
+/// Look for a repeated rotation pattern in each sub group.
+/// Returns an element-wise left bit rotation amount or -1 if failed.
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
+  int NumElts = Mask.size();
+  assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
+
+  int RotateAmt = -1;
+  for (int i = 0; i != NumElts; i += NumSubElts) {
+    for (int j = 0; j != NumSubElts; ++j) {
+      int M = Mask[i + j];
+      if (M < 0)
+        continue;
+      if (M < i || M >= i + NumSubElts)
+        return -1;
+      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
+      if (0 <= RotateAmt && Offset != RotateAmt)
+        return -1;
+      RotateAmt = Offset;
+    }
+  }
+  return RotateAmt;
+}
+
+bool ShuffleVectorInst::isBitRotateMask(
+    ArrayRef<int> Mask, unsigned EltSizeInBits, unsigned MinSubElts,
+    unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt) {
+  for (NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+    int EltRotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+    if (EltRotateAmt < 0)
+      continue;
+    RotateAmt = EltRotateAmt * EltSizeInBits;
+    return true;
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 //                             InsertValueInst Class
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4260,6 +4260,51 @@ static SDValue lowerBitreverseShuffle(ShuffleVectorSDNode *SVN,
   return Res;
 }
 
+// Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can
+// reinterpret it as a shuffle of v2i32 where the two i32s are bit rotated, and
+// lower it as a vror.vi (if legal with zvbb enabled).
+static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
+                                           SelectionDAG &DAG,
+                                           const RISCVSubtarget &Subtarget) {
+  SDLoc DL(SVN);
+
+  EVT VT = SVN->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumSubElts, RotateAmt;
+  if (!ShuffleVectorInst::isBitRotateMask(SVN->getMask(), EltSizeInBits, 2,
+                                          NumElts, NumSubElts, RotateAmt))
+    return SDValue();
+  MVT RotateVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits * NumSubElts),
+                                  NumElts / NumSubElts);
+
+  // We might have a RotateVT that isn't legal, e.g. v4i64 on zve32x.
+  if (!Subtarget.getTargetLowering()->isOperationLegalOrCustom(ISD::ROTL,
+                                                               RotateVT))
+    return SDValue();
+
+  // If we just create the shift amount with
+  //
+  // DAG.getConstant(RotateAmt, DL, RotateVT)
+  //
+  // then for e64 we get a weird bitcasted build_vector on RV32 that we're
+  // unable to detect as a splat during pattern matching. So directly lower it
+  // to a vmv.v.x which gets matched to vror.vi.
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, RotateVT, Subtarget);
+  SDValue VL =
+      getDefaultVLOps(RotateVT, ContainerVT, DL, DAG, Subtarget).second;
+  SDValue RotateAmtSplat = DAG.getNode(
+      RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
+      DAG.getConstant(RotateAmt, DL, Subtarget.getXLenVT()), VL);
+  RotateAmtSplat =
+      convertFromScalableVector(RotateVT, RotateAmtSplat, DAG, Subtarget);
+
+  SDValue Rotate =
+      DAG.getNode(ISD::ROTL, DL, RotateVT,
+                  DAG.getBitcast(RotateVT, SVN->getOperand(0)), RotateAmtSplat);
+  return DAG.getBitcast(VT, Rotate);
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -4270,6 +4315,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   unsigned NumElts = VT.getVectorNumElements();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
+  // Lower to a vror.vi of a larger element type if possible. Do this before we
+  // promote i1s to i8s.
+  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+    return V;
+
   if (VT.getVectorElementType() == MVT::i1) {
     if (SDValue V = lowerBitreverseShuffle(SVN, DAG, Subtarget))
       return V;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10984,31 +10984,6 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
 }
 
-/// Try to lower a vector shuffle as a bit rotation.
-///
-/// Look for a repeated rotation pattern in each sub group.
-/// Returns a ISD::ROTL element rotation amount or -1 if failed.
-static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
-  int NumElts = Mask.size();
-  assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
-
-  int RotateAmt = -1;
-  for (int i = 0; i != NumElts; i += NumSubElts) {
-    for (int j = 0; j != NumSubElts; ++j) {
-      int M = Mask[i + j];
-      if (M < 0)
-        continue;
-      if (!isInRange(M, i, i + NumSubElts))
-        return -1;
-      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
-      if (0 <= RotateAmt && Offset != RotateAmt)
-        return -1;
-      RotateAmt = Offset;
-    }
-  }
-  return RotateAmt;
-}
-
 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
                                    const X86Subtarget &Subtarget,
                                    ArrayRef<int> Mask) {
@@ -11018,18 +10993,14 @@ static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
   int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
   int MaxSubElts = 64 / EltSizeInBits;
-  for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
-    int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
-    if (RotateAmt < 0)
-      continue;
-
-    int NumElts = Mask.size();
-    MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
-    RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
-    return RotateAmt * EltSizeInBits;
-  }
-
-  return -1;
+  unsigned RotateAmt, NumSubElts;
+  if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
+                                          MaxSubElts, NumSubElts, RotateAmt))
+    return -1;
+  unsigned NumElts = Mask.size();
+  MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+  RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+  return RotateAmt;
 }
 
 /// Lower shuffle using X86ISD::VROTLI rotations.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -169,13 +169,19 @@ define <1 x i8> @reverse_v1i8(<1 x i8> %a) {
 }
 
 define <2 x i8> @reverse_v2i8(<2 x i8> %a) {
-; CHECK-LABEL: reverse_v2i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; NO-ZVBB-LABEL: reverse_v2i8:
+; NO-ZVBB:       # %bb.0:
+; NO-ZVBB-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; NO-ZVBB-NEXT:    vslidedown.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vslideup.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vmv1r.v v8, v9
+; NO-ZVBB-NEXT:    ret
+;
+; ZVBB-LABEL: reverse_v2i8:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVBB-NEXT:    vror.vi v8, v8, 8
+; ZVBB-NEXT:    ret
   %res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
   ret <2 x i8> %res
 }
@@ -258,13 +264,19 @@ define <1 x i16> @reverse_v1i16(<1 x i16> %a) {
 }
 
 define <2 x i16> @reverse_v2i16(<2 x i16> %a) {
-; CHECK-LABEL: reverse_v2i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; NO-ZVBB-LABEL: reverse_v2i16:
+; NO-ZVBB:       # %bb.0:
+; NO-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO-ZVBB-NEXT:    vslidedown.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vslideup.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vmv1r.v v8, v9
+; NO-ZVBB-NEXT:    ret
+;
+; ZVBB-LABEL: reverse_v2i16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; ZVBB-NEXT:    vror.vi v8, v8, 16
+; ZVBB-NEXT:    ret
   %res = call <2 x i16> @llvm.experimental.vector.reverse.v2i16(<2 x i16> %a)
   ret <2 x i16> %res
 }
@@ -332,13 +344,19 @@ define <1 x i32> @reverse_v1i32(<1 x i32> %a) {
 }
 
 define <2 x i32> @reverse_v2i32(<2 x i32> %a) {
-; CHECK-LABEL: reverse_v2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; NO-ZVBB-LABEL: reverse_v2i32:
+; NO-ZVBB:       # %bb.0:
+; NO-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO-ZVBB-NEXT:    vslidedown.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vslideup.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vmv1r.v v8, v9
+; NO-ZVBB-NEXT:    ret
+;
+; ZVBB-LABEL: reverse_v2i32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVBB-NEXT:    vror.vi v8, v8, 32
+; ZVBB-NEXT:    ret
   %res = call <2 x i32> @llvm.experimental.vector.reverse.v2i32(<2 x i32> %a)
   ret <2 x i32> %res
 }
@@ -572,13 +590,19 @@ define <1 x half> @reverse_v1f16(<1 x half> %a) {
 }
 
 define <2 x half> @reverse_v2f16(<2 x half> %a) {
-; CHECK-LABEL: reverse_v2f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; NO-ZVBB-LABEL: reverse_v2f16:
+; NO-ZVBB:       # %bb.0:
+; NO-ZVBB-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; NO-ZVBB-NEXT:    vslidedown.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vslideup.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vmv1r.v v8, v9
+; NO-ZVBB-NEXT:    ret
+;
+; ZVBB-LABEL: reverse_v2f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; ZVBB-NEXT:    vror.vi v8, v8, 16
+; ZVBB-NEXT:    ret
   %res = call <2 x half> @llvm.experimental.vector.reverse.v2f16(<2 x half> %a)
   ret <2 x half> %res
 }
@@ -646,13 +670,19 @@ define <1 x float> @reverse_v1f32(<1 x float> %a) {
 }
 
 define <2 x float> @reverse_v2f32(<2 x float> %a) {
-; CHECK-LABEL: reverse_v2f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; NO-ZVBB-LABEL: reverse_v2f32:
+; NO-ZVBB:       # %bb.0:
+; NO-ZVBB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; NO-ZVBB-NEXT:    vslidedown.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vslideup.vi v9, v8, 1
+; NO-ZVBB-NEXT:    vmv1r.v v8, v9
+; NO-ZVBB-NEXT:    ret
+;
+; ZVBB-LABEL: reverse_v2f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; ZVBB-NEXT:    vror.vi v8, v8, 32
+; ZVBB-NEXT:    ret
   %res = call <2 x float> @llvm.experimental.vector.reverse.v2f32(<2 x float> %a)
   ret <2 x float> %res
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll