
Commit 6b3cbf2

[RISCV] Decompose locally repeating shuffles (without exact VLEN) (#125735)
High LMUL shuffles are expensive on typical SIMD implementations. Without exact vector length knowledge, we struggle to map elements within the vector to registers within the vector register group. However, there are some patterns where we can perform a vector length agnostic (VLA) shuffle by leveraging knowledge of the pattern being performed, even without the ability to map individual elements to registers. An existing in-tree example is vector reverse. This patch introduces another such case. Specifically, if we have a shuffle where a local rearrangement of elements happens within each 128b (really zvlNb) chunk, and the same pattern is applied to every chunk, we can decompose a high LMUL shuffle into a linear number of m1 shuffles. We take advantage of the fact that the tail of the operation is undefined, and repeat the pattern for all elements in the source register group - not just the ones the fixed vector type covers. This is an optimization for typical SIMD vrgather designs, but could be a pessimization on hardware for which vrgather's execution cost is not independent of the runtime VL.
1 parent fc4210f commit 6b3cbf2
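
To make the "locally repeating" condition concrete, here is a minimal standalone sketch of the check described above. It mirrors the predicate added by this patch, but uses plain C++ containers rather than the in-tree ArrayRef and llvm::enumerate helpers; the span of 4 and the example masks are invented for illustration.

#include <cassert>
#include <cstddef>
#include <vector>

// Returns true if every element only moves within its own span of `Span`
// elements, and every span applies the same rearrangement as the first one.
// -1 denotes an undef mask element.
static bool isLocalRepeatingShuffle(const std::vector<int> &Mask, int Span) {
  for (std::size_t I = 0; I < Mask.size(); ++I) {
    int M = Mask[I];
    if (M == -1)
      continue;
    int ChunkLo = static_cast<int>(I) - (static_cast<int>(I) % Span);
    int ChunkHi = ChunkLo + Span;
    if (M < ChunkLo || M >= ChunkHi || M - ChunkLo != Mask[I % Span])
      return false;
  }
  return true;
}

int main() {
  // Each 4-element chunk performs the same "swap adjacent pairs" pattern.
  assert(isLocalRepeatingShuffle({1, 0, 3, 2, 5, 4, 7, 6}, 4));
  // Element 4 reads index 0, which lies outside its own chunk, so this mask
  // is not locally repeating.
  assert(!isLocalRepeatingShuffle({1, 0, 3, 2, 0, 4, 7, 6}, 4));
  return 0;
}

The first mask rearranges both 4-element chunks identically, so a single m1 index vector can be reused for every register in the group; the second pulls an element across a chunk boundary and would fall back to the ordinary whole-group vrgather.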

File tree

3 files changed: +236 -75 lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 52 additions & 4 deletions
@@ -5325,6 +5325,21 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
   return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
 }
 
+/// Is this mask local (i.e. elements only move within their local span), and
+/// repeating (that is, the same rearrangement is being done within each span)?
+static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
+  // TODO: Could improve the case where undef elements exist in the first span.
+  for (auto [I, M] : enumerate(Mask)) {
+    if (M == -1)
+      continue;
+    int ChunkLo = I - (I % Span);
+    int ChunkHi = ChunkLo + Span;
+    if (M < ChunkLo || M >= ChunkHi || M - ChunkLo != Mask[I % Span])
+      return false;
+  }
+  return true;
+}
+
 /// Try to widen element type to get a new mask value for a better permutation
 /// sequence. This doesn't try to inspect the widened mask for profitability;
 /// we speculate the widened form is equal or better. This has the effect of
@@ -5686,10 +5701,43 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                       : DAG.getUNDEF(XLenVT));
     }
     SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
-                                         Subtarget);
-    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    LHSIndices =
+        convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);
+
+    SDValue Gather;
+    // If we have a locally repeating mask, then we can reuse the first register
+    // in the index register group for all registers within the source register
+    // group. TODO: This generalizes to m2, and m4. Also, this is currently
+    // picking up cases with a fully undef tail which could be more directly
+    // handled with fewer redundant vrgathers
+    const MVT M1VT = getLMUL1VT(ContainerVT);
+    auto VLMAX = RISCVTargetLowering::computeVLMAXBounds(M1VT, Subtarget).first;
+    if (ContainerVT.bitsGT(M1VT) && isLocalRepeatingShuffle(Mask, VLMAX)) {
+      EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
+      SDValue SubIndex =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                      DAG.getVectorIdxConstant(0, DL));
+      auto [InnerTrueMask, InnerVL] =
+          getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
+      int N = ContainerVT.getVectorMinNumElements() /
+              M1VT.getVectorMinNumElements();
+      assert(isPowerOf2_32(N) && N <= 8);
+      Gather = DAG.getUNDEF(ContainerVT);
+      for (int i = 0; i < N; i++) {
+        SDValue SubIdx =
+            DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL);
+        SDValue SubV1 =
+            DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1, SubIdx);
+        SDValue SubVec =
+            DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                        DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+        Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
+                             SubVec, SubIdx);
+      }
+    } else {
+      Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                           DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    }
     return convertFromScalableVector(VT, Gather, DAG, Subtarget);
   }
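
As a rough scalar model of the decomposition (not SelectionDAG code; the element counts and data values below are invented for illustration), the following sketch checks that, for a locally repeating mask, gathering each m1-sized chunk with the shared index vector of the first span reproduces a single gather across the whole register group:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const int ElemsPerReg = 4; // stand-in for the m1 VLMAX in this sketch
  std::vector<int> Src = {10, 11, 12, 13, 20, 21, 22, 23};
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6}; // locally repeating

  // Reference: one gather across the whole register group (the old lowering).
  std::vector<int> Whole(Src.size());
  for (std::size_t I = 0; I < Mask.size(); ++I)
    Whole[I] = Src[Mask[I]];

  // Decomposition: every ElemsPerReg-sized chunk reuses the indices of the
  // first span, applied to its own chunk of the source (the new lowering).
  std::vector<int> Decomposed(Src.size());
  for (std::size_t Reg = 0; Reg < Src.size() / ElemsPerReg; ++Reg)
    for (int I = 0; I < ElemsPerReg; ++I)
      Decomposed[Reg * ElemsPerReg + I] = Src[Reg * ElemsPerReg + Mask[I]];

  assert(Whole == Decomposed);
  return 0;
}

This is why the lowering above can extract a single SubIndex register from the index group and apply it to every sub-register of V1: any elements produced beyond the fixed vector's length land in the undefined tail, so repeating the pattern across the whole register group is safe.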

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll

Lines changed: 72 additions & 59 deletions
@@ -874,27 +874,30 @@ define <16 x i8> @reverse_v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: reverse_v32i8_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    lui a1, 16
-; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, -1
+; CHECK-NEXT:    vrsub.vx v10, v10, a2
+; CHECK-NEXT:    lui a2, 16
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v8, v12
-; CHECK-NEXT:    vrgatherei16.vv v14, v9, v12
+; CHECK-NEXT:    vrgatherei16.vv v15, v8, v10
+; CHECK-NEXT:    vrgatherei16.vv v14, v12, v10
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    vrsub.vi v10, v10, 15
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v17, v13, v10
+; CHECK-NEXT:    vrgather.vv v16, v9, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.s.x v0, a2
 ; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
-; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    vrsub.vi v12, v8, 15
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v14, a0
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i8> %res
@@ -943,23 +946,25 @@ define <8 x i16> @reverse_v8i16_2(<4 x i16> %a, <4 x i16> %b) {
 define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: reverse_v16i16_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vrsub.vi v10, v10, 7
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v13, v12, v10
+; CHECK-NEXT:    vrgather.vv v12, v9, v10
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v11, v9
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
-; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    li a1, 255
 ; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vrsub.vi v14, v8, 7
+; CHECK-NEXT:    vrgather.vv v15, v8, v9
+; CHECK-NEXT:    vrgather.vv v14, v10, v9
 ; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vrgather.vv v8, v10, v14, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i16> %res
@@ -1024,24 +1029,27 @@ define <4 x i32> @reverse_v4i32_2(<2 x i32> %a, < 2 x i32> %b) {
 define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: reverse_v8i32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v11, v9
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vrsub.vi v10, v10, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v15, v11, v10
+; CHECK-NEXT:    vrgatherei16.vv v14, v9, v10
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    vrsub.vi v14, v8, 3
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vrsub.vx v10, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v10
+; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v14, v0.t
+; CHECK-NEXT:    vmerge.vvm v8, v8, v14, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %res
@@ -1197,23 +1205,25 @@ define <8 x half> @reverse_v8f16_2(<4 x half> %a, <4 x half> %b) {
 define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: reverse_v16f16_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vrsub.vi v10, v10, 7
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v13, v12, v10
+; CHECK-NEXT:    vrgather.vv v12, v9, v10
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v11, v9
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
-; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    li a1, 255
 ; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vrsub.vi v14, v8, 7
+; CHECK-NEXT:    vrgather.vv v15, v8, v9
+; CHECK-NEXT:    vrgather.vv v14, v10, v9
 ; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vrgather.vv v8, v10, v14, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x half> %a, <8 x half> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x half> %res
@@ -1269,24 +1279,27 @@ define <4 x float> @reverse_v4f32_2(<2 x float> %a, <2 x float> %b) {
 define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: reverse_v8f32_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vrsub.vx v9, v9, a1
-; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v11, v9
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vrsub.vi v10, v10, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v15, v11, v10
+; CHECK-NEXT:    vrgatherei16.vv v14, v9, v10
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    vrsub.vi v14, v8, 3
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
+; CHECK-NEXT:    vrsub.vx v10, v12, a1
+; CHECK-NEXT:    vrgather.vv v13, v8, v10
+; CHECK-NEXT:    vrgather.vv v12, v9, v10
+; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v14, v0.t
+; CHECK-NEXT:    vmerge.vvm v8, v8, v14, v0
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %res
