[RISCV] Lower SEW<=32 vector_deinterleave(2) via vunzip2{a,b} #136463
Conversation
This is a continuation from 22d5890c and adds the necessary logic to handle SEW!=64 profitably. The interesting case is handling e.g. a single m1 source which has been split via extract_subvector into two operands, and forming that back into a single m1 operation - instead of letting the vslidedown-by-vlenb/constant sequence be generated. This is analogous to the getSingleShuffleSrc handling for vnsrl, and we can share a bunch of code.
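To make the fractional LMUL case concrete, here is a minimal IR reproducer in the style of the tests below; the RUN line and its -mattr string are assumptions modeled on the ZIP configuration (the actual RUN lines sit in the truncated part of the patch):

; RUN: llc -mtriple=riscv64 -mattr=+v,+experimental-xrivosvizip -verify-machineinstrs < %s

; The mf2 results come from an m1 input: the DAG-level deinterleave takes the
; two halves of %vec as extract_subvector operands, and FoldConcatVector
; re-forms the single m1 source so the lowering can emit one m1
; ri.vunzip2a/ri.vunzip2b pair instead of a vslidedown sequence.
define {<vscale x 2 x i16>, <vscale x 2 x i16>} @deinterleave_mf2(<vscale x 4 x i16> %vec) {
  %res = call {<vscale x 2 x i16>, <vscale x 2 x i16>} @llvm.vector.deinterleave2.nxv4i16(<vscale x 4 x i16> %vec)
  ret {<vscale x 2 x i16>, <vscale x 2 x i16>} %res
}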
@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes: This is a continuation from 22d5890c and adds the necessary logic to handle SEW!=64 profitably. The interesting case is handling e.g. a single m1 source which has been split via extract_subvector into two operands, and forming that back into a single m1 operation - instead of letting the vslidedown-by-vlenb/constant sequence be generated. This is analogous to the getSingleShuffleSrc handling for vnsrl, and we can share a bunch of code.

Patch is 29.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136463.diff

2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 98fba9e86e88a..7cf0d2db42ba1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4569,12 +4569,13 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT,
VL);
}
-// Can this shuffle be performed on exactly one (possibly larger) input?
-static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) {
-
- if (V2.isUndef())
- return V1;
-
+/// If concat_vector(V1,V2) could be folded away to some existing
+/// vector source, return it. Note that the source may be larger
+/// than the requested concat_vector (i.e. an extract_subvector
+/// might be required).
+static SDValue FoldConcatVector(SDValue V1, SDValue V2) {
+ EVT VT = V1.getValueType();
+ assert(VT == V2.getValueType() && "precondition");
// Both inputs must be extracts.
if (V1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
V2.getOpcode() != ISD::EXTRACT_SUBVECTOR)
@@ -4582,23 +4583,34 @@ static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) {
// Extracting from the same source.
SDValue Src = V1.getOperand(0);
- if (Src != V2.getOperand(0))
- return SDValue();
-
- // Src needs to have twice the number of elements.
- unsigned NumElts = VT.getVectorNumElements();
- if (!Src.getValueType().isFixedLengthVector() ||
- Src.getValueType().getVectorNumElements() != (NumElts * 2))
+ if (Src != V2.getOperand(0) ||
+ VT.isScalableVector() != Src.getValueType().isScalableVector())
return SDValue();
// The extracts must extract the two halves of the source.
if (V1.getConstantOperandVal(1) != 0 ||
- V2.getConstantOperandVal(1) != NumElts)
+ V2.getConstantOperandVal(1) != VT.getVectorMinNumElements())
return SDValue();
return Src;
}
+// Can this shuffle be performed on exactly one (possibly larger) input?
+static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) {
+
+ if (V2.isUndef())
+ return V1;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ // Src needs to have twice the number of elements.
+ // TODO: Update shuffle lowering to add the extract subvector
+ if (SDValue Src = FoldConcatVector(V1, V2);
+ Src && Src.getValueType().getVectorNumElements() == (NumElts * 2))
+ return Src;
+
+ return SDValue();
+}
+
/// Is this shuffle interleaving contiguous elements from one vector into the
/// even elements and contiguous elements from another vector into the odd
/// elements. \p EvenSrc will contain the element that should be in the first
@@ -11510,12 +11522,27 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
return DAG.getMergeValues(Res, DL);
}
- // TODO: Remove the e64 restriction once the fractional LMUL lowering
- // is improved to always beat the vnsrl lowering below.
- if (Subtarget.hasVendorXRivosVizip() && Factor == 2 &&
- VecVT.getVectorElementType().getSizeInBits() == 64) {
+ if (Subtarget.hasVendorXRivosVizip() && Factor == 2) {
+ MVT VT = Op->getSimpleValueType(0);
SDValue V1 = Op->getOperand(0);
SDValue V2 = Op->getOperand(1);
+
+ // For fractional LMUL, check if we can use a higher LMUL
+ // instruction to avoid a vslidedown.
+ if (SDValue Src = FoldConcatVector(V1, V2);
+ Src && getLMUL1VT(VT).bitsGT(VT)) {
+ EVT NewVT = VT.getDoubleNumVectorElementsVT();
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+ Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewVT, Src, ZeroIdx);
+ SDValue Even = lowerVZIP(RISCVISD::RI_VUNZIP2A_VL, Src,
+ DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+ SDValue Odd = lowerVZIP(RISCVISD::RI_VUNZIP2B_VL, Src,
+ DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+ Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Even, ZeroIdx);
+ Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Odd, ZeroIdx);
+ return DAG.getMergeValues({Even, Odd}, DL);
+ }
+
SDValue Even =
lowerVZIP(RISCVISD::RI_VUNZIP2A_VL, V1, V2, DL, DAG, Subtarget);
SDValue Odd =
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index b0b2390b1de37..8a71cd0826672 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -6,62 +6,106 @@
; Integers
define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vx v0, v0, a0
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vnsrl.wi v14, v8, 8
-; CHECK-NEXT: vmsne.vi v0, v12, 0
-; CHECK-NEXT: vmsne.vi v8, v14, 0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vmv.v.i v10, 0
+; V-NEXT: csrr a0, vlenb
+; V-NEXT: vmerge.vim v8, v10, 1, v0
+; V-NEXT: srli a0, a0, 2
+; V-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; V-NEXT: vslidedown.vx v0, v0, a0
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vmerge.vim v10, v10, 1, v0
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wi v14, v8, 8
+; V-NEXT: vmsne.vi v0, v12, 0
+; V-NEXT: vmsne.vi v8, v14, 0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: vmv.v.i v8, 0
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: vmerge.vim v10, v8, 1, v0
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; ZIP-NEXT: vslidedown.vx v0, v0, a0
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: vmerge.vim v8, v8, 1, v0
+; ZIP-NEXT: ri.vunzip2a.vv v12, v10, v8
+; ZIP-NEXT: ri.vunzip2b.vv v14, v10, v8
+; ZIP-NEXT: vmsne.vi v0, v12, 0
+; ZIP-NEXT: vmsne.vi v8, v14, 0
+; ZIP-NEXT: ret
%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
}
define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vnsrl.wi v14, v8, 8
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: vmv.v.v v10, v14
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wi v14, v8, 8
+; V-NEXT: vmv.v.v v8, v12
+; V-NEXT: vmv.v.v v10, v14
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
+; ZIP-NEXT: ret
%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
}
define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vnsrl.wi v14, v8, 16
-; CHECK-NEXT: vmv.v.v v8, v12
-; CHECK-NEXT: vmv.v.v v10, v14
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT: vnsrl.wi v12, v8, 0
+; V-NEXT: vnsrl.wi v14, v8, 16
+; V-NEXT: vmv.v.v v8, v12
+; V-NEXT: vmv.v.v v10, v14
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
+; ZIP-NEXT: ret
%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
}
define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv8i32(<vscale x 8 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT: vnsrl.wx v12, v8, a0
-; CHECK-NEXT: vnsrl.wi v14, v8, 0
-; CHECK-NEXT: vmv.v.v v8, v14
-; CHECK-NEXT: vmv.v.v v10, v12
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; V: # %bb.0:
+; V-NEXT: li a0, 32
+; V-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; V-NEXT: vnsrl.wx v12, v8, a0
+; V-NEXT: vnsrl.wi v14, v8, 0
+; V-NEXT: vmv.v.v v8, v14
+; V-NEXT: vmv.v.v v10, v12
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT: ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT: vmv.v.v v8, v12
+; ZIP-NEXT: vmv.v.v v10, v14
+; ZIP-NEXT: ret
%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
}
@@ -122,69 +166,122 @@ ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmv.v.i v24, 0
-; CHECK-NEXT: vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v24, v24, 1, v0
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vnsrl.wi v8, v16, 0
-; CHECK-NEXT: vnsrl.wi v0, v16, 8
-; CHECK-NEXT: vnsrl.wi v12, v24, 0
-; CHECK-NEXT: vnsrl.wi v4, v24, 8
-; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmsne.vi v16, v8, 0
-; CHECK-NEXT: vmsne.vi v8, v0, 0
-; CHECK-NEXT: vmv1r.v v0, v16
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; V-NEXT: vmv.v.i v24, 0
+; V-NEXT: vmerge.vim v16, v24, 1, v0
+; V-NEXT: vmv1r.v v0, v8
+; V-NEXT: vmerge.vim v24, v24, 1, v0
+; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; V-NEXT: vnsrl.wi v8, v16, 0
+; V-NEXT: vnsrl.wi v0, v16, 8
+; V-NEXT: vnsrl.wi v12, v24, 0
+; V-NEXT: vnsrl.wi v4, v24, 8
+; V-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; V-NEXT: vmsne.vi v16, v8, 0
+; V-NEXT: vmsne.vi v8, v0, 0
+; V-NEXT: vmv1r.v v0, v16
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZIP-NEXT: vmv1r.v v9, v0
+; ZIP-NEXT: vmv1r.v v0, v8
+; ZIP-NEXT: vmv.v.i v24, 0
+; ZIP-NEXT: vmerge.vim v16, v24, 1, v0
+; ZIP-NEXT: vmv1r.v v0, v9
+; ZIP-NEXT: vmerge.vim v24, v24, 1, v0
+; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v12, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v8, v24, v28
+; ZIP-NEXT: ri.vunzip2b.vv v0, v24, v28
+; ZIP-NEXT: vsetvli a0, zero, e8, m8, ta, ma
+; ZIP-NEXT: vmsne.vi v16, v8, 0
+; ZIP-NEXT: vmsne.vi v8, v0, 0
+; ZIP-NEXT: vmv1r.v v0, v16
+; ZIP-NEXT: ret
%retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
}
define {<vscale x 64 x i8>, <vscale x 64 x i8>} @vector_deinterleave_nxv64i8_nxv128i8(<vscale x 128 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv64i8_nxv128i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vnsrl.wi v8, v24, 0
-; CHECK-NEXT: vnsrl.wi v0, v24, 8
-; CHECK-NEXT: vnsrl.wi v12, v16, 0
-; CHECK-NEXT: vnsrl.wi v4, v16, 8
-; CHECK-NEXT: vmv8r.v v16, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv64i8_nxv128i8:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vnsrl.wi v8, v24, 0
+; V-NEXT: vnsrl.wi v0, v24, 8
+; V-NEXT: vnsrl.wi v12, v16, 0
+; V-NEXT: vnsrl.wi v4, v16, 8
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv64i8_nxv128i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%retval = call {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8> %vec)
ret {<vscale x 64 x i8>, <vscale x 64 x i8>} %retval
}
define {<vscale x 32 x i16>, <vscale x 32 x i16>} @vector_deinterleave_nxv32i16_nxv64i16(<vscale x 64 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv32i16_nxv64i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v8
-; CHECK-NEXT: vnsrl.wi v8, v24, 0
-; CHECK-NEXT: vnsrl.wi v0, v24, 16
-; CHECK-NEXT: vnsrl.wi v12, v16, 0
-; CHECK-NEXT: vnsrl.wi v4, v16, 16
-; CHECK-NEXT: vmv8r.v v16, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv32i16_nxv64i16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vnsrl.wi v8, v24, 0
+; V-NEXT: vnsrl.wi v0, v24, 16
+; V-NEXT: vnsrl.wi v12, v16, 0
+; V-NEXT: vnsrl.wi v4, v16, 16
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv32i16_nxv64i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%retval = call {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16> %vec)
ret {<vscale x 32 x i16>, <vscale x 32 x i16>} %retval
}
define {<vscale x 16 x i32>, <vscale x 16 x i32>} @vector_deinterleave_nxv16i32_nxvv32i32(<vscale x 32 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i32_nxvv32i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; CHECK-NEXT: vmv8r.v v24, v16
-; CHECK-NEXT: li a0, 32
-; CHECK-NEXT: vnsrl.wx v20, v24, a0
-; CHECK-NEXT: vnsrl.wx v16, v8, a0
-; CHECK-NEXT: vnsrl.wi v0, v8, 0
-; CHECK-NEXT: vnsrl.wi v4, v24, 0
-; CHECK-NEXT: vmv8r.v v8, v0
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv16i32_nxvv32i32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v16
+; V-NEXT: li a0, 32
+; V-NEXT: vnsrl.wx v20, v24, a0
+; V-NEXT: vnsrl.wx v16, v8, a0
+; V-NEXT: vnsrl.wi v0, v8, 0
+; V-NEXT: vnsrl.wi v4, v24, 0
+; V-NEXT: vmv8r.v v8, v0
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv16i32_nxvv32i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v28, v16, v20
+; ZIP-NEXT: ri.vunzip2b.vv v4, v16, v20
+; ZIP-NEXT: ri.vunzip2a.vv v24, v8, v12
+; ZIP-NEXT: ri.vunzip2b.vv v0, v8, v12
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
%retval = call {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %vec)
ret {<vscale x 16 x i32>, <vscale x 16 x i32>} %retval
}
@@ -254,105 +351,175 @@ ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
; Floats
define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: vnsrl.wi v9, v8, 16
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v9, v8, 16
+; V-NEXT: vmv1r.v v8, v10
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
%retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
}
define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: vnsrl.wi v9, v8, 16
-; CHECK-NEXT: vmv1r.v v8, v10
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v9, v8, 16
+; V-NEXT: vmv1r.v v8, v10
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
}
define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv8bf16(<vscale x 8 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: vnsrl.wi v11, v8, 16
-; CHECK-NEXT: vmv.v.v v8, v10
-; CHECK-NEXT: vmv.v.v v9, v11
-; CHECK-NEXT: ret
+; V-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v11, v8, 16
+; V-NEXT: vmv.v.v v8, v10
+; V-NEXT: vmv.v.v v9, v11
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v11, v8, v9
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: vmv.v.v v9, v11
+; ZIP-NEXT: ret
%retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave2.nxv8bf16(<vscale x 8 x bfloat> %vec)
ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
}
define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
-; CHECK-NEXT: ...
[truncated]
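As a fixed-vector aside on the getSingleShuffleSrc analogy from the description: the shuffle below has the shape that can fold back to one wider source (function name and types here are illustrative, not taken from the patch):

; Both operands of the final shuffle extract the two halves of %src, so the
; deinterleave can be performed directly on the wider %src (via vnsrl, or
; ri.vunzip2a when XRivosVizip is available).
define <4 x i32> @even_elements(<8 x i32> %src) {
  %lo = shufflevector <8 x i32> %src, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %src, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %even = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %even
}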
LGTM