[RISCV] Recognize de-interleave shuffles with 2 sources. #127272
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Changes

We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.

Patch is 24.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127272.diff

5 Files Affected:
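Before the diff itself, a rough sketch of the kind of shuffle this patch recognizes — the function name and element counts here are illustrative (they mirror the new tests added below), not taken verbatim from the patch:

```
; Factor-4 de-interleave taking element 0 of every group from both sources;
; only the first four result lanes are actually used.
define <8 x i8> @deinterleave4_two_source_example(<8 x i8> %a, <8 x i8> %b) {
  %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i8> %res
}
```

With this change, each source is narrowed independently (vnsrl plus truncation) and the two partial results are joined with a vslideup, instead of going through the vrgather-based shuffle lowering seen in the old CHECK lines of fixed-vectors-deinterleave-load.ll below.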
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index da04880348af6..36e434f245f5c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5579,6 +5579,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
if (SDValue Src = getSingleShuffleSrc(VT, ContainerVT, V1, V2))
return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG);
+ if (1 < count_if(Mask,
+ [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+ 1 < count_if(Mask, [&Mask](int Idx) {
+ return Idx >= (int)Mask.size();
+ })) {
+ // Narrow each source and concatenate them.
+ // FIXME: For small LMUL it is better to concatenate first.
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ SDValue Lo =
+ getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG);
+ SDValue Hi =
+ getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
}
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index b4634dbf5a5e8..e53dfc23a84bb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vid.v v9
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlm.v v8, (a0)
-; CHECK-NEXT: li a0, -256
+; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vadd.vv v11, v9, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vadd.vi v12, v11, -16
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v9, v8, 1, v0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v0, v8, 2
+; CHECK-NEXT: vslidedown.vi v0, v0, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v10, v9, 0
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vadd.vi v11, v11, -15
-; CHECK-NEXT: vmerge.vim v13, v10, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 8
+; CHECK-NEXT: vnsrl.wi v11, v8, 0
; CHECK-NEXT: vnsrl.wi v8, v8, 8
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v10, v13, v12, v0.t
-; CHECK-NEXT: vrgather.vv v8, v13, v11, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 8
+; CHECK-NEXT: vslideup.vi v9, v8, 8
; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v9, 0
; CHECK-NEXT: ret
%vec = load <32 x i1>, ptr %p
%deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index 10dadbc022e02..ad18c801069f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -369,3 +369,89 @@ entry:
store <2 x i8> %shuffle.i5, ptr %out, align 1
ret void
}
+
+define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave4_0_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a1)
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 4
+; CHECK-NEXT: vse8.v v9, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave4_8_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a1)
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 8
+; CHECK-NEXT: vnsrl.wi v9, v9, 8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 4
+; CHECK-NEXT: vse8.v v9, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @deinterleave8_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave8_0_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v9, (a1)
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vse8.v v8, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @deinterleave8_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave8_8_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a1)
+; CHECK-NEXT: vmv.v.i v0, -3
+; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t
+; CHECK-NEXT: vse8.v v9, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 3c28e978842b9..8190b5c45fe9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -551,3 +551,420 @@ entry:
store <64 x i32> %shuffle.i5, ptr %out, align 4
ret void
}
+
+define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i8_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vle8.v v8, (a1)
+; V-NEXT: vle8.v v9, (a0)
+; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 0
+; V-NEXT: vnsrl.wi v9, v9, 0
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 4
+; V-NEXT: vse8.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i8_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vle8.v v8, (a1)
+; ZVE32F-NEXT: vle8.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; ZVE32F-NEXT: vse8.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_8_8_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vle8.v v8, (a1)
+; V-NEXT: vle8.v v9, (a0)
+; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 8
+; V-NEXT: vnsrl.wi v9, v9, 8
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 4
+; V-NEXT: vse8.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_8_8_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vle8.v v8, (a1)
+; ZVE32F-NEXT: vle8.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 8
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 8
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; ZVE32F-NEXT: vse8.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i16_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 0
+; V-NEXT: vnsrl.wi v9, v9, 0
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i16_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x i16>, ptr %in0, align 2
+ %1 = load <4 x i16>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ store <4 x i16> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_16_i16_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 16
+; V-NEXT: vnsrl.wi v9, v9, 16
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_16_i16_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 16
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x i16>, ptr %in0, align 2
+ %1 = load <4 x i16>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ store <4 x i16> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_half_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 0
+; V-NEXT: vnsrl.wi v9, v9, 0
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_half_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x half>, ptr %in0, align 2
+ %1 = load <4 x half>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ store <4 x half> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_16_half_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 16
+; V-NEXT: vnsrl.wi v9, v9, 16
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_16_half_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 16
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x half>, ptr %in0, align 2
+ %1 = load <4 x half>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ store <4 x half> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_0_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i32_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vslideup.vi v8, v9, 1
+; V-NEXT: vse32.v v8, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i32_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vslideup.vi v8, v9, 1
+; ZVE32F-NEXT: vse32.v v8, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x i32>, ptr %in0, align 4
+ %1 = load <2 x i32>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x i32> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_32_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_32_i32_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vmv.v.i v0, 1
+; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vse32.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_32_i32_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vmv.v.i v0, 1
+; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vse32.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x i32>, ptr %in0, align 4
+ %1 = load <2 x i32>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 1, i32 3>
+ store <2 x i32> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_0_float_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_float_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vslideup.vi v8, v9, 1
+; V-NEXT: vse32.v v8, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_float_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vslideup.vi v8, v9, 1
+; ZVE32F-NEXT: vse32.v v8, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x float>, ptr %in0, align 4
+ %1 = load <2 x float>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x float> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_32_float_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_32_float_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vmv.v.i v0, 1
+; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vse32.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_32_float_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vmv.v.i v0, 1
+; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vse32.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x float>, ptr %in0, align 4
+ %1 = load <2 x float>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> <i32 1, i32 3>
+ store <2 x float> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_0_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i64_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; V-NEXT: vle64.v v8, (a0)
+; V-NEXT: vle64.v v9, (a1)
+; V-NEXT: vslideup.vi v8, v9, 1
+; V-NEXT: vse64.v v8, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i64_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vslideup.vi v8, v9, 2
+; ZVE32F-NEXT: vse32.v v8, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x i64>, ptr %in0, align 8
+ %1 = load <2 x i64>, ptr %in1, align 8
+ %shuffle.i5 = shufflevector <2 x i64> %0, <2 x i64> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x i64> %shuffle.i5, ptr %out, align 8
+ ret void
+}
+
+define void @vnsrl_64_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_64_i64_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT: vle64.v v8, (a0)
+; V-NEXT: vle64.v v9, (a1)
+; V-NEXT: vmv.v.i v0, 1
+; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vse64.v v9, (a2)
+; V-NEXT: ret
+;
+;...
[truncated]
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' f75126eeabba13ce2aab53c2e4296fca12b9da0d 0cfa4ec30987d93b7af1d281ebcfa387fcdbdc08 llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
LGTM
Nice win, that was a much simpler patch than I'd had in my head.
You might want to add deinterleave to isShuffleMaskLegal before adding the custom low LMUL lowering. I've seen some interactions there before.
We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.
Force-pushed from 30fd9e3 to 0cfa4ec.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/13130

Here is the relevant piece of the build log for the reference:
…132123) Given this shuffle:

```
shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
```

#127272 lowers it with a bunch of vnsrl. If we describe the result in terms of the shuffle mask, we expect:

```
<0, 4, 8, 12, u, u, u, u>
```

but we actually got:

```
<0, 4, u, u, 8, 12, u, u>
```

for factor larger than 2. This is caused by CONCAT_VECTORS on incorrect (sub) vector types. This patch fixes the said issue by building an aggregate vector with the correct sub vector types. Fix #132071
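A worked sketch of the lane placement described above — the element labels and sub-vector widths are illustrative, inferred from the masks quoted in that description rather than taken from the patch:

```
; Sources of the factor-4, index-0 de-interleave:
;   %0 = a0 a1 a2 a3 a4 a5 a6 a7
;   %1 = b0 b1 b2 b3 b4 b5 b6 b7
;
; After the vnsrl narrowing, each source contributes only two useful lanes:
;   lo = a0 a4 u u        hi = b0 b4 u u
;
; Concatenating the two half-width (<4 x i8>) pieces puts b0/b4 at lanes 4-5:
;   a0 a4 u u b0 b4 u u   -> mask <0, 4, u, u, 8, 12, u, u>
;
; whereas the shuffle asked for them at lanes 2-3:
;   a0 a4 b0 b4 u u u u   -> mask <0, 4, 8, 12, u, u, u, u>
;
; The fix builds the result from sub vectors of the correct, narrower type
; (two defined elements per source here) so the defined lanes end up adjacent.
```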