
[RISCV] Narrow indices to e16 for LMUL > 1 when lowering vector_reverse #104427

Merged (2 commits) on Aug 15, 2024
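Summary of the change, as reflected in the diff below: when lowering vector_reverse, if the element type is wider than 16 bits and the source occupies more than one vector register (LMUL > 1), the reversed index vector is now built at SEW=16 and the gather is emitted as vrgatherei16.vv instead of vrgather.vv. The vid.v/vrsub.vx index computation then runs at a smaller LMUL, which reduces register pressure; this is valid because the largest possible VLMAX (65536 at zvl65536b) still fits in 16 bits, as the new assert documents.

A minimal reproducer, lifted from the updated test file (the llc invocation and the declare line are assumptions added for self-containment; the test's actual RUN lines are not shown in this diff):

; llc -mtriple=riscv64 -mattr=+v reverse.ll
define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) {
  %res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
  ret <vscale x 4 x i32> %res
}
declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)

With this patch the m2 source above is reversed using e16/m1 indices, switching to e32/m2 only for the final vrgatherei16.vv; previously the entire sequence ran at e32/m2 with vrgather.vv (see the updated CHECK lines in named-vector-shuffle-reverse.ll).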
11 changes: 9 additions & 2 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10321,8 +10321,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,

// If this is SEW=8 and VLMAX is potentially more than 256, we need
// to use vrgatherei16.vv.
// TODO: It's also possible to use vrgatherei16.vv for other types to
// decrease register width for the index calculation.
if (MaxVLMAX > 256 && EltSize == 8) {
// If this is LMUL=8, we have to split before can use vrgatherei16.vv.
// Reverse each half, then reassemble them in reverse order.
@@ -10348,6 +10346,15 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
}

// At LMUL > 1, do the index computation in 16 bits to reduce register
// pressure.
if (IntVT.getScalarType().bitsGT(MVT::i16) &&
IntVT.bitsGT(getLMUL1VT(IntVT))) {
assert(isUInt<16>(MaxVLMAX - 1)); // Largest VLMAX is 65536 @ zvl65536b
GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
IntVT = IntVT.changeVectorElementType(MVT::i16);
}

MVT XLenVT = Subtarget.getXLenVT();
auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);

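A worked instance of the new condition (not part of the patch, derived from the test changes below): for a <vscale x 8 x i64> source, IntVT is nxv8i64, whose scalar type is wider than i16 and which is larger than the LMUL1 type nxv1i64, so IntVT is narrowed to nxv8i16. The vid.v/vrsub.vx index computation therefore runs at e16/m2 instead of e64/m8, and only the final vrgatherei16.vv needs the full e64/m8 vtype, which is exactly the pattern in the updated reverse_nxv8i64 CHECK lines below.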
84 changes: 50 additions & 34 deletions llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -1285,10 +1285,11 @@ define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vrsub.vx v12, v10, a0
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
@@ -1300,10 +1301,11 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vrsub.vx v16, v12, a0
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%res = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
@@ -1316,10 +1318,11 @@ define <vscale x 16 x i32> @reverse_nxv16i32(<vscale x 16 x i32> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vrsub.vx v24, v16, a0
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%res = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> %a)
@@ -1348,10 +1351,11 @@ define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vrsub.vx v12, v10, a0
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
@@ -1364,10 +1368,11 @@ define <vscale x 4 x i64> @reverse_nxv4i64(<vscale x 4 x i64> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vrsub.vx v16, v12, a0
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%res = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> %a)
@@ -1379,10 +1384,11 @@ define <vscale x 8 x i64> @reverse_nxv8i64(<vscale x 8 x i64> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vrsub.vx v24, v16, a0
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%res = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> %a)
@@ -1526,10 +1532,11 @@ define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vrsub.vx v12, v10, a0
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
%res = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
@@ -1541,10 +1548,11 @@ define <vscale x 8 x float> @reverse_nxv8f32(<vscale x 8 x float> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vrsub.vx v16, v12, a0
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%res = call <vscale x 8 x float> @llvm.vector.reverse.nxv8f32(<vscale x 8 x float> %a)
@@ -1557,10 +1565,11 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vrsub.vx v24, v16, a0
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%res = call <vscale x 16 x float> @llvm.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
@@ -1589,10 +1598,11 @@ define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
; CHECK-NEXT: vid.v v10
; CHECK-NEXT: vrsub.vx v12, v10, a0
; CHECK-NEXT: vrgather.vv v10, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
; CHECK-NEXT: vmv.v.v v8, v10
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @llvm.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
@@ -1605,10 +1615,11 @@ define <vscale x 4 x double> @reverse_nxv4f64(<vscale x 4 x double> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vrsub.vx v16, v12, a0
; CHECK-NEXT: vrgather.vv v12, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%res = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %a)
@@ -1620,10 +1631,11 @@ define <vscale x 8 x double> @reverse_nxv8f64(<vscale x 8 x double> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vrsub.vx v24, v16, a0
; CHECK-NEXT: vrgather.vv v16, v8, v24
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT: vrgatherei16.vv v16, v8, v24
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%res = call <vscale x 8 x double> @llvm.vector.reverse.nxv8f64(<vscale x 8 x double> %a)
@@ -1638,10 +1650,11 @@ define <vscale x 3 x i64> @reverse_nxv3i64(<vscale x 3 x i64> %a) {
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 1
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vrsub.vx v12, v12, a0
; CHECK-NEXT: vrgather.vv v16, v8, v12
; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma
; CHECK-NEXT: vrgatherei16.vv v16, v8, v12
; CHECK-NEXT: vmv1r.v v8, v17
; CHECK-NEXT: vmv1r.v v9, v18
; CHECK-NEXT: vmv1r.v v10, v19
@@ -1655,10 +1668,11 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: addi a0, a0, -1
; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vrsub.vx v16, v16, a0
; CHECK-NEXT: vrgather.vv v24, v8, v16
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; CHECK-NEXT: vrgatherei16.vv v24, v8, v16
; CHECK-NEXT: vmv2r.v v8, v26
; CHECK-NEXT: vmv2r.v v10, v28
; CHECK-NEXT: vmv2r.v v12, v30
@@ -1684,12 +1698,13 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
; RV32-NEXT: andi sp, sp, -64
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: addi a1, a0, -1
; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
; RV32-NEXT: vid.v v24
; RV32-NEXT: vrsub.vx v24, v24, a1
; RV32-NEXT: vrgather.vv v0, v16, v24
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT: vrgatherei16.vv v0, v16, v24
; RV32-NEXT: vmv4r.v v16, v4
; RV32-NEXT: vrgather.vv v0, v8, v24
; RV32-NEXT: vrgatherei16.vv v0, v8, v24
; RV32-NEXT: vmv4r.v v20, v0
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: addi a1, sp, 64
@@ -1720,12 +1735,13 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
; RV64-NEXT: andi sp, sp, -64
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: addi a1, a0, -1
; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; RV64-NEXT: vsetvli a2, zero, e16, m2, ta, ma
; RV64-NEXT: vid.v v24
; RV64-NEXT: vrsub.vx v24, v24, a1
; RV64-NEXT: vrgather.vv v0, v16, v24
; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT: vrgatherei16.vv v0, v16, v24
; RV64-NEXT: vmv4r.v v16, v4
; RV64-NEXT: vrgather.vv v0, v8, v24
; RV64-NEXT: vrgatherei16.vv v0, v8, v24
; RV64-NEXT: vmv4r.v v20, v0
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: addi a1, sp, 64