[RISCV] Fold vector shift of sext/zext to widening multiply #121563
Conversation
pfusik commented on Jan 3, 2025
(shl (sext X), C) -> (vwmulsu X, 1u << C)
(shl (zext X), C) -> (vwmulu X, 1u << C)
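Not part of the patch, but as a quick sanity check of the scalar identity behind both folds, here is a minimal standalone C++ sketch for the 8-to-16-bit case. It mirrors what vwmulu.vx/vwmulsu.vx compute element-wise; the shift amounts start at 2 because C <= 1 is left to vwadd/vwaddu (see the combine below) and stay below the narrow element width.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Scalar model of the folds for i8 -> i16:
  //   (shl (zext x), C) == (vwmulu  x, 1u << C)
  //   (shl (sext x), C) == (vwmulsu x, 1u << C)
  // The multiplier 1u << C is always an *unsigned* narrow operand, which is
  // why the sign-extended case maps to vwmulsu (signed * unsigned).
  for (int x = -128; x <= 127; ++x) {
    for (unsigned c = 2; c < 8; ++c) { // C <= 1 is handled by vwadd/vwaddu
      uint16_t mul = uint16_t(1u << c);

      uint16_t zextX = uint8_t(x);
      assert(uint16_t(zextX << c) == uint16_t(zextX * mul));

      int16_t sextX = int8_t(x);
      assert(uint16_t(uint16_t(sextX) << c) == uint16_t(int32_t(sextX) * mul));
    }
  }
  return 0;
}
```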
@llvm/pr-subscribers-backend-risc-v

Author: Piotr Fusik (pfusik)

Changes
Patch is 115.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121563.diff

7 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 04dd23d9cdaa20..955a15393ca8a1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17341,6 +17341,78 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG,
return DAG.getZExtOrTrunc(Pop, DL, VT);
}
+static SDValue combineSHL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ // (shl (zext x), y) -> (vwsll x, y)
+ if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ return V;
+
+ // (shl (sext x), C) -> (vwmulsu x, 1u << C)
+ // (shl (zext x), C) -> (vwmulu x, 1u << C)
+
+ SDValue LHS = N->getOperand(0);
+ if (!LHS.hasOneUse())
+ return SDValue();
+ unsigned Opcode;
+ switch (LHS.getOpcode()) {
+ case ISD::SIGN_EXTEND:
+ Opcode = RISCVISD::VWMULSU_VL;
+ break;
+ case ISD::ZERO_EXTEND:
+ Opcode = RISCVISD::VWMULU_VL;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue RHS = N->getOperand(1);
+ APInt ShAmt;
+ if (!ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
+ return SDValue();
+
+ // Better foldings:
+ // (shl (sext x), 1) -> (vwadd x, x)
+ // (shl (zext x), 1) -> (vwaddu x, x)
+ uint64_t ShAmtInt = ShAmt.getZExtValue();
+ if (ShAmtInt <= 1)
+ return SDValue();
+
+ SDValue NarrowOp = LHS.getOperand(0);
+ EVT NarrowVT = NarrowOp.getValueType();
+ uint64_t NarrowBits = NarrowVT.getScalarSizeInBits();
+ if (ShAmtInt >= NarrowBits)
+ return SDValue();
+ EVT VT = N->getValueType(0);
+ if (NarrowBits * 2 != VT.getScalarSizeInBits())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ SDValue Passthru, Mask, VL;
+ switch (N->getOpcode()) {
+ case ISD::SHL:
+ if (!VT.isScalableVector())
+ return SDValue();
+ Passthru = DAG.getUNDEF(VT);
+ std::tie(Mask, VL) =
+ getDefaultScalableVLOps(VT.getSimpleVT(), DL, DAG, Subtarget);
+ break;
+ case RISCVISD::SHL_VL:
+ Passthru = N->getOperand(2);
+ Mask = N->getOperand(3);
+ VL = N->getOperand(4);
+ break;
+ default:
+ llvm_unreachable("Expected SHL");
+ }
+ return DAG.getNode(Opcode, DL, VT, NarrowOp,
+ DAG.getConstant(1ULL << ShAmtInt, SDLoc(RHS), NarrowVT),
+ Passthru, Mask, VL);
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -17970,7 +18042,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
case RISCVISD::SHL_VL:
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineSHL(N, DCI, Subtarget))
return V;
[[fallthrough]];
case RISCVISD::SRA_VL:
@@ -17995,7 +18067,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL:
case ISD::SHL: {
if (N->getOpcode() == ISD::SHL) {
- if (SDValue V = combineOp_VLToVWOp_VL(N, DCI, Subtarget))
+ if (SDValue V = combineSHL(N, DCI, Subtarget))
return V;
}
SDValue ShAmt = N->getOperand(1);
diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 9ee2324f615dd8..0fad09f27007c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -775,11 +775,11 @@ define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i8_nxv8i32(ptr %base, <vscal
define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v12, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
@@ -791,10 +791,11 @@ define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i8_nxv8i32(ptr %base, <vscal
define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -815,10 +816,11 @@ define <vscale x 8 x i32> @mgather_baseidx_nxv8i16_nxv8i32(ptr %base, <vscale x
define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -840,10 +842,11 @@ define <vscale x 8 x i32> @mgather_baseidx_sext_nxv8i16_nxv8i32(ptr %base, <vsca
define <vscale x 8 x i32> @mgather_baseidx_zext_nxv8i16_nxv8i32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i32> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vzext.vf2 v16, v8
-; CHECK-NEXT: vsll.vi v8, v16, 2
-; CHECK-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vluxei32.v v12, (a0), v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
@@ -863,10 +866,9 @@ define <vscale x 8 x i32> @mgather_baseidx_nxv8i32(ptr %base, <vscale x 8 x i32>
;
; RV64-LABEL: mgather_baseidx_nxv8i32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, mu
+; RV64-NEXT: vwmulsu.vx v16, v8, a1
; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
@@ -1034,11 +1036,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i8_nxv8i64(ptr %base, <vscal
define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei16.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v16, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
@@ -1050,11 +1052,11 @@ define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i8_nxv8i64(ptr %base, <vscal
define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -1074,11 +1076,11 @@ define <vscale x 8 x i64> @mgather_baseidx_nxv8i16_nxv8i64(ptr %base, <vscale x
define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -1099,11 +1101,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i16_nxv8i64(ptr %base, <vsca
define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i16_nxv8i64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x i64> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vsll.vi v8, v12, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei32.v v16, (a0), v12, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
@@ -1124,10 +1126,11 @@ define <vscale x 8 x i64> @mgather_baseidx_nxv8i32_nxv8i64(ptr %base, <vscale x
;
; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%ptrs = getelementptr inbounds i64, ptr %base, <vscale x 8 x i32> %idxs
@@ -1147,10 +1150,11 @@ define <vscale x 8 x i64> @mgather_baseidx_sext_nxv8i32_nxv8i64(ptr %base, <vsca
;
; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = sext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -1171,10 +1175,11 @@ define <vscale x 8 x i64> @mgather_baseidx_zext_nxv8i32_nxv8i64(ptr %base, <vsca
;
; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vzext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%eidxs = zext <vscale x 8 x i32> %idxs to <vscale x 8 x i64>
@@ -1845,11 +1850,11 @@ define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i8_nxv8f32(ptr %base, <vsc
define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 2
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
-; CHECK-NEXT: vluxei16.v v12, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v12, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i32>
@@ -1861,10 +1866,11 @@ define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i8_nxv8f32(ptr %base, <vsc
define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -1885,10 +1891,11 @@ define <vscale x 8 x float> @mgather_baseidx_nxv8i16_nxv8f32(ptr %base, <vscale
define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vsext.vf2 v16, v8
-; RV32-NEXT: vsll.vi v8, v16, 2
-; RV32-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; RV32-NEXT: li a1, 4
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v16, v8, a1
+; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV32-NEXT: vluxei32.v v12, (a0), v16, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
@@ -1910,10 +1917,11 @@ define <vscale x 8 x float> @mgather_baseidx_sext_nxv8i16_nxv8f32(ptr %base, <vs
define <vscale x 8 x float> @mgather_baseidx_zext_nxv8i16_nxv8f32(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x float> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; CHECK-NEXT: vzext.vf2 v16, v8
-; CHECK-NEXT: vsll.vi v8, v16, 2
-; CHECK-NEXT: vluxei32.v v12, (a0), v8, v0.t
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v16, v8, a1
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT: vluxei32.v v12, (a0), v16, v0.t
; CHECK-NEXT: vmv.v.v v8, v12
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i32>
@@ -1933,10 +1941,9 @@ define <vscale x 8 x float> @mgather_baseidx_nxv8f32(ptr %base, <vscale x 8 x i3
;
; RV64-LABEL: mgather_baseidx_nxv8f32:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT: vsext.vf2 v16, v8
-; RV64-NEXT: vsll.vi v16, v16, 2
-; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu
+; RV64-NEXT: li a1, 4
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, mu
+; RV64-NEXT: vwmulsu.vx v16, v8, a1
; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t
; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
@@ -2104,11 +2111,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i8_nxv8f64(ptr %base, <vs
define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vscale x 8 x i8> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vsll.vi v8, v10, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vwmulu.vx v10, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei16.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei16.v v16, (a0), v10, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i8> %idxs to <vscale x 8 x i64>
@@ -2120,11 +2127,11 @@ define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i8_nxv8f64(ptr %base, <vs
define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -2144,11 +2151,11 @@ define <vscale x 8 x double> @mgather_baseidx_nxv8i16_nxv8f64(ptr %base, <vscale
define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT: vsext.vf2 v12, v8
-; RV32-NEXT: vsll.vi v8, v12, 3
+; RV32-NEXT: li a1, 8
+; RV32-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; RV32-NEXT: vwmulsu.vx v12, v8, a1
; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v12, v0.t
; RV32-NEXT: vmv.v.v v8, v16
; RV32-NEXT: ret
;
@@ -2169,11 +2176,11 @@ define <vscale x 8 x double> @mgather_baseidx_sext_nxv8i16_nxv8f64(ptr %base, <v
define <vscale x 8 x double> @mgather_baseidx_zext_nxv8i16_nxv8f64(ptr %base, <vscale x 8 x i16> %idxs, <vscale x 8 x i1> %m, <vscale x 8 x double> %passthru) {
; CHECK-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vzext.vf2 v12, v8
-; CHECK-NEXT: vsll.vi v8, v12, 3
+; CHECK-NEXT: li a1, 8
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vwmulu.vx v12, v8, a1
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT: vluxei32.v v16, (a0), v8, v0.t
+; CHECK-NEXT: vluxei32.v v16, (a0), v12, v0.t
; CHECK-NEXT: vmv.v.v v8, v16
; CHECK-NEXT: ret
%eidxs = zext <vscale x 8 x i16> %idxs to <vscale x 8 x i64>
@@ -2194,10 +2201,11 @@ define <vscale x 8 x double> @mgather_baseidx_nxv8i32_nxv8f64(ptr %base, <vscale
;
; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8f64:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu
-; RV64-NEXT: vsext.vf2 v24, v8
-; RV64-NEXT: vsll.vi v8, v24, 3
-; RV64-NEXT: vluxei64.v v16, (a0), v8, v0.t
+; RV64-NEXT: li a1, 8
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vwmulsu.vx v24, v8, a1
+; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT: vluxei64.v v16, (a0), v24, v0.t
; RV64-NEXT: vmv.v.v v8, v16
; RV64-NEXT: ret
%ptrs = getelementptr inb...
[truncated]
In the absence of Zvbb vwsll.vi, it can be profitable to use a widening multiply instead of a sign/zero extension followed by a left shift.
This is the case on the BPI-F3: each of vsext.vf2, vsll.vi and vwmul[s]u.vx has a throughput of 2*LMUL cycles: https://camel-cdr.github.io/rvv-bench-results/bpi_f3/
I confirmed this transform improves some benchmarks on a BPI-F3 board.
Looking at https://camel-cdr.github.io/rvv-bench-results/canmv_k230/, it should also apply there.
I don't know whether this is profitable for other RVV CPUs. Please advise if this should be restricted to certain CPUs and which ones.
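For concreteness, a tiny back-of-the-envelope sketch of that claim (my own model, not a measurement): if each of the three instructions costs roughly 2*LMUL cycles as the benchmark page reports, and we treat that figure uniformly, the fold trades two such instructions for one.

```cpp
#include <cstdio>

int main() {
  // Assumed per-instruction cost, taken from the quoted BPI-F3 numbers:
  // ~2*LMUL cycles each for vsext.vf2/vzext.vf2, vsll.vi and vwmul[s]u.vx.
  // This deliberately ignores that the extend and the widening multiply run
  // at the narrow LMUL while the shift runs at the widened LMUL.
  auto cycles = [](int lmul) { return 2 * lmul; };
  const int lmuls[] = {1, 2, 4, 8};
  for (int lmul : lmuls) {
    int extPlusShift = cycles(lmul) + cycles(lmul); // vzext/vsext + vsll
    int wideningMul = cycles(lmul);                 // single vwmulu/vwmulsu
    std::printf("LMUL=%d: ext+shift ~%d cycles, widening multiply ~%d cycles\n",
                lmul, extPlusShift, wideningMul);
  }
  return 0;
}
```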
|
||
SDValue LHS = N->getOperand(0); | ||
if (!LHS.hasOneUse()) | ||
return SDValue(); |
For now, this only handles single-use sext/zext. I can rewrite it to be part of combineOp_VLToVWOp_VL so that it handles multi-use too.
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma
; CHECK-RV32-NEXT: vzext.vf2 v10, v8
; CHECK-RV32-NEXT: vsll.vi v8, v10, 2, v0.t
The transform did not apply on RV32 here:
17373 if (!ISD::isConstantSplatVector(RHS.getNode(), ShAmt))
(gdb)
17374 return SDValue();
(gdb) call RHS->dump()
t20: nxv2i64 = splat_vector_parts Constant:i32<2>, Constant:i32<0>
I'm guessing it's an effect of type legalization. Please advise on how to fix.
splat_vector_parts is used to splat an e64 element on rv32, i.e. when EEW > XLEN. You could manually handle it in a follow-up PR; I don't think it's critical for this PR.
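For readers unfamiliar with the node, the sketch below (plain standalone C++, not LLVM code) only illustrates what the dumped splat_vector_parts encodes: the 64-bit splat value carried as two 32-bit operands on rv32 (assumed low part first, then high part), which is why ISD::isConstantSplatVector does not match it.

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // The dumped node was:
  //   t20: nxv2i64 = splat_vector_parts Constant:i32<2>, Constant:i32<0>
  // On rv32 a 64-bit element cannot be carried as one scalar operand, so the
  // splat value arrives as two 32-bit parts (assumed here to be low part
  // first, then high part). It still denotes a constant splat of 2; it just
  // isn't in the form ISD::isConstantSplatVector recognizes.
  uint32_t Lo = 2, Hi = 0;                       // the two dumped operands
  uint64_t SplatVal = (uint64_t(Hi) << 32) | Lo; // the 64-bit shift amount
  assert(SplatVal == 2);
  return 0;
}
```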
rv32 RVV is rare, I suppose?
I'd prefer to do it right away, so that the test in vwsll-sdnode.ll doesn't disappear. Handling it here prioritizes vwmulu over vwsll -- I'll debug it.
Done. combineOp_VLToVWOp_VL handles RISCVISD::VMV_V_X_VL in AfterLegalizeDAG instead of ISD::SPLAT_VECTOR_PARTS in AfterLegalizeTypes. I do the same, so that vwsll has priority over vwmulu.
; CHECK-NEXT: vzext.vf2 v10, v8
; CHECK-NEXT: vsll.vi v8, v10, 2
; CHECK-NEXT: ret
;
This is the same problem as in the vwsll-vp.ll file - see my comment there.
Makes sense to me. I guess multiplies might not have the same number of available execution ports, but it's still fewer vector ops at the end of the day.
    return SDValue();

  SDValue NarrowOp = LHS.getOperand(0);
  EVT NarrowVT = NarrowOp.getValueType();
It's better to use MVT + getSimpleValueType explicitly since this is past type legalization
Why? I use EVT for DAG.getConstant.
NarrowVT.getScalarSizeInBits() generates less code if NarrowVT is MVT.
Done.
  if (!VT.isScalableVector())
    return SDValue();
It might be worthwhile to leave a TODO to handle fixed length vectors later. You would need to use the ContainerVT = getContainerForFixedLengthVector(...) pattern that we use elsewhere.
TODO added
Done
; RV32ZVE32F-NEXT: andi a3, t0, 1
; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1
; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma
; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8
vsext+vsll+vadd folded into vwmaccus.
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT: vzext.vf2 v10, v8
; RV32-NEXT: vsll.vi v8, v10, 2
Not folded on RV32. I will investigate.
Fixed by handling VSEXT_VL/VZEXT_VL.
; CHECK-ZVBB-RV64: # %bb.0:
; CHECK-ZVBB-RV64-NEXT: li a0, 4
; CHECK-ZVBB-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-ZVBB-RV64-NEXT: vwmulu.vx v10, v8, a0
Pessimized. I will investigate.
Fixed by delaying this transform.
; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; CHECK-ZVBB-NEXT: vwsll.vi v10, v8, 2
; CHECK-ZVBB-NEXT: vwmulu.vx v10, v8, a0
Pessimized.
Fixed by delaying this transform.
I think I addressed all the comments and issues. Please review.
✅ With the latest revision this PR passed the C/C++ code formatter.
LGTM
I've heard that this transform won't be profitable on other CPUs, so I added a commit that enables it on BPI's SpacemiT X60 only. Based on https://camel-cdr.github.io/rvv-bench-results/canmv_k230/, it should perhaps cover the K230 too, but I don't have one to confirm. The sole presence of Zvbb doesn't preclude this transform, because …
It's probably profitable on SiFive x280. The vzext/vsext can produce DLEN bits per cycle, where DLEN = VLEN/2. The latency until the first DLEN is ready is 4. The shift also produces DLEN bits per cycle; the latency until the first DLEN is ready is 8. The widening multiply produces DLEN*2 bits per cycle; the first 2 DLENs complete in 8 cycles.
I might have the latency wrong for shifts on x280. The scheduler model says 4, but I have other docs internally that say 8. I'm confirming.
For my micro-architecture, this should be at most a very minor regression, and possibly a slight improvement. From our perspective, I'm 100% fine with this being enabled by default without a tuning flag.
@@ -17341,6 +17341,85 @@ static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG,
  return DAG.getZExtOrTrunc(Pop, DL, VT);
}

static SDValue combineSHL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
The naming convention in this file would have this routine named performSHLCombine
Renamed, thanks!
@@ -1417,6 +1417,9 @@ def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "V
def TuneVXRMPipelineFlush : SubtargetFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
                                             "true", "VXRM writes causes pipeline flush">;

def TuneCheapVWMul : SubtargetFeature<"cheap-vwmul", "HasCheapVWMul", "true",
I'm not sure the tune flag here is warranted. I would bias towards not having it unless we know some particular core is unprofitable.
…T X60" This reverts commit 63f902d.
I reverted restricting the transform to SpacemiT X60.
LGTM