[RISCV] Lower shuffles as rotates without zvbb

lukel97 · lukel97 · commit 6098d7d5f653 · 2023-09-04T09:35:12.000+01:00
Now that the codegen for the expanded ISD::ROTL sequence has been improved, it's probably profitable to lower a shuffle that's a rotate to the vsll+vsrl+vor sequence to avoid a vrgather where possible, even if we don't have the vror instruction. This patch relaxes the restriction on ISD::ROTL being legal in lowerVECTOR_SHUFFLEAsRotate, so that only the rotate type itself has to be legal. It also attempts the lowering twice: once, only if zvbb is enabled, before any of the interleave/deinterleave/vmerge lowerings, and a second time unconditionally just before it falls back to the vrgather. This way it doesn't interfere with any of the above patterns that may be more profitable than the expanded ISD::ROTL sequence. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D159353
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4230,8 +4230,8 @@ static SDValue lowerBitreverseShuffle(ShuffleVectorSDNode *SVN,
 }
 
 // Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can
-// reinterpret it as a shuffle of v2i32 where the two i32s are bit rotated, and
-// lower it as a vror.vi (if legal with zvbb enabled).
+// reinterpret it as a v2i32 and rotate it right by 8 instead. We can lower this
+// as a vror.vi if we have zvbb, or otherwise as a vsll, vsrl and vor.
 static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
                                            SelectionDAG &DAG,
                                            const RISCVSubtarget &Subtarget) {
@@ -4248,8 +4248,7 @@ static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
                                   NumElts / NumSubElts);
 
   // We might have a RotateVT that isn't legal, e.g. v4i64 on zve32x.
-  if (!Subtarget.getTargetLowering()->isOperationLegalOrCustom(ISD::ROTL,
-                                                               RotateVT))
+  if (!Subtarget.getTargetLowering()->isTypeLegal(RotateVT))
     return SDValue();
 
   SDValue Op = DAG.getBitcast(RotateVT, SVN->getOperand(0));
@@ -4276,12 +4275,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   unsigned NumElts = VT.getVectorNumElements();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
-  // Lower to a vror.vi of a larger element type if possible. Do this before we
-  // promote i1s to i8s.
-  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
-    return V;
-
   if (VT.getVectorElementType() == MVT::i1) {
+    // Lower to a vror.vi of a larger element type if possible before we promote
+    // i1s to i8s.
+    if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+      return V;
     if (SDValue V = lowerBitreverseShuffle(SVN, DAG, Subtarget))
       return V;
 
@@ -4384,6 +4382,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
           lowerVECTOR_SHUFFLEAsVSlidedown(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  // A bitrotate will be one instruction on zvbb, so try to lower to it first if
+  // available.
+  if (Subtarget.hasStdExtZvbb())
+    if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+      return V;
+
   // Lower rotations to a SLIDEDOWN and a SLIDEUP. One of the source vectors may
   // be undef which can be handled with a single SLIDEDOWN/UP.
   int LoSrc, HiSrc;
@@ -4510,6 +4514,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   if (IsSelect)
     return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
 
+  // We might be able to express the shuffle as a bitrotate. But even if we
+  // don't have zvbb and have to expand, the expanded sequence of approx. 2
+  // shifts and a vor will have a higher throughput than a vrgather.
+  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+    return V;
+
   if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
     // On such a large vector we're unable to use i8 as the index type.
     // FIXME: We could promote the index to i16 and use vrgatherei16, but that
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -787,23 +787,13 @@ define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
 ; This interleaves the first 2 elements of a vector in opposite order. With
 ; undefs for the remaining elements. We use to miscompile this.
 define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) {
-; V128-LABEL: unary_interleave_10uu_v4i8:
-; V128:       # %bb.0:
-; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; V128-NEXT:    vid.v v9
-; V128-NEXT:    vrsub.vi v10, v9, 1
-; V128-NEXT:    vrgather.vv v9, v8, v10
-; V128-NEXT:    vmv1r.v v8, v9
-; V128-NEXT:    ret
-;
-; V512-LABEL: unary_interleave_10uu_v4i8:
-; V512:       # %bb.0:
-; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; V512-NEXT:    vid.v v9
-; V512-NEXT:    vrsub.vi v10, v9, 1
-; V512-NEXT:    vrgather.vv v9, v8, v10
-; V512-NEXT:    vmv1r.v v8, v9
-; V512-NEXT:    ret
+; CHECK-LABEL: unary_interleave_10uu_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 8
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    ret
   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
   ret <4 x i8> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll