[RISCV] Recognize de-interleave shuffles with 2 sources. #127272
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Changes

We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.

Patch is 24.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127272.diff

5 Files Affected:
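Before the diff itself, a rough sketch of the kind of shuffle this patch recognizes — the function name and element counts here are illustrative (they mirror the new tests added below), not taken verbatim from the patch:

```
; Factor-4 de-interleave taking element 0 of every group from both sources;
; only the first four result lanes are actually used.
define <8 x i8> @deinterleave4_two_source_example(<8 x i8> %a, <8 x i8> %b) {
  %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i8> %res
}
```

With this change, each source is narrowed independently (vnsrl plus truncation) and the two partial results are joined with a vslideup, instead of going through the vrgather-based shuffle lowering seen in the old CHECK lines of fixed-vectors-deinterleave-load.ll below.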
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index da04880348af6..36e434f245f5c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5579,6 +5579,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
if (SDValue Src = getSingleShuffleSrc(VT, ContainerVT, V1, V2))
return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG);
+ if (1 < count_if(Mask,
+ [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+ 1 < count_if(Mask, [&Mask](int Idx) {
+ return Idx >= (int)Mask.size();
+ })) {
+ // Narrow each source and concatenate them.
+ // FIXME: For small LMUL it is better to concatenate first.
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ SDValue Lo =
+ getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG);
+ SDValue Hi =
+ getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
}
}
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index b4634dbf5a5e8..e53dfc23a84bb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vid.v v9
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlm.v v8, (a0)
-; CHECK-NEXT: li a0, -256
+; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vadd.vv v11, v9, v9
-; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT: vmv.s.x v9, a0
-; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT: vadd.vi v12, v11, -16
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v9, v8, 1, v0
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v0, v8, 2
+; CHECK-NEXT: vslidedown.vi v0, v0, 2
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v10, v9, 0
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vadd.vi v11, v11, -15
-; CHECK-NEXT: vmerge.vim v13, v10, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v10, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 8
+; CHECK-NEXT: vnsrl.wi v11, v8, 0
; CHECK-NEXT: vnsrl.wi v8, v8, 8
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT: vrgather.vv v10, v13, v12, v0.t
-; CHECK-NEXT: vrgather.vv v8, v13, v11, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 8
+; CHECK-NEXT: vslideup.vi v9, v8, 8
; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v9, 0
; CHECK-NEXT: ret
%vec = load <32 x i1>, ptr %p
%deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index 10dadbc022e02..ad18c801069f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -369,3 +369,89 @@ entry:
store <2 x i8> %shuffle.i5, ptr %out, align 1
ret void
}
+
+define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave4_0_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a1)
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 4
+; CHECK-NEXT: vse8.v v9, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave4_8_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a1)
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 8
+; CHECK-NEXT: vnsrl.wi v9, v9, 8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0
+; CHECK-NEXT: vnsrl.wi v9, v9, 0
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v9, v8, 4
+; CHECK-NEXT: vse8.v v9, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @deinterleave8_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave8_0_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e8, mf2, ta, ma
+; CHECK-NEXT: vle8.v v9, (a1)
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vse8.v v8, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @deinterleave8_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; CHECK-LABEL: deinterleave8_8_i8_two_source:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a1)
+; CHECK-NEXT: vmv.v.i v0, -3
+; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t
+; CHECK-NEXT: vse8.v v9, (a2)
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 3c28e978842b9..8190b5c45fe9b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -551,3 +551,420 @@ entry:
store <64 x i32> %shuffle.i5, ptr %out, align 4
ret void
}
+
+define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i8_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vle8.v v8, (a1)
+; V-NEXT: vle8.v v9, (a0)
+; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 0
+; V-NEXT: vnsrl.wi v9, v9, 0
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 4
+; V-NEXT: vse8.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i8_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vle8.v v8, (a1)
+; ZVE32F-NEXT: vle8.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; ZVE32F-NEXT: vse8.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_8_8_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vle8.v v8, (a1)
+; V-NEXT: vle8.v v9, (a0)
+; V-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 8
+; V-NEXT: vnsrl.wi v9, v9, 8
+; V-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 4
+; V-NEXT: vse8.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_8_8_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vle8.v v8, (a1)
+; ZVE32F-NEXT: vle8.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 8
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 8
+; ZVE32F-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; ZVE32F-NEXT: vse8.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %in0, align 1
+ %1 = load <8 x i8>, ptr %in1, align 1
+ %shuffle.i5 = shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ store <8 x i8> %shuffle.i5, ptr %out, align 1
+ ret void
+}
+
+define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i16_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 0
+; V-NEXT: vnsrl.wi v9, v9, 0
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i16_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x i16>, ptr %in0, align 2
+ %1 = load <4 x i16>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ store <4 x i16> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_16_i16_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 16
+; V-NEXT: vnsrl.wi v9, v9, 16
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_16_i16_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 16
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x i16>, ptr %in0, align 2
+ %1 = load <4 x i16>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x i16> %0, <4 x i16> %1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ store <4 x i16> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_half_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 0
+; V-NEXT: vnsrl.wi v9, v9, 0
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_half_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 0
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x half>, ptr %in0, align 2
+ %1 = load <4 x half>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ store <4 x half> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_16_half_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vle16.v v8, (a1)
+; V-NEXT: vle16.v v9, (a0)
+; V-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; V-NEXT: vnsrl.wi v8, v8, 16
+; V-NEXT: vnsrl.wi v9, v9, 16
+; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT: vslideup.vi v9, v8, 2
+; V-NEXT: vse16.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_16_half_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vle16.v v8, (a1)
+; ZVE32F-NEXT: vle16.v v9, (a0)
+; ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; ZVE32F-NEXT: vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT: vnsrl.wi v9, v9, 16
+; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT: vslideup.vi v9, v8, 2
+; ZVE32F-NEXT: vse16.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <4 x half>, ptr %in0, align 2
+ %1 = load <4 x half>, ptr %in1, align 2
+ %shuffle.i5 = shufflevector <4 x half> %0, <4 x half> %1, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ store <4 x half> %shuffle.i5, ptr %out, align 2
+ ret void
+}
+
+define void @vnsrl_0_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i32_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vslideup.vi v8, v9, 1
+; V-NEXT: vse32.v v8, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i32_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vslideup.vi v8, v9, 1
+; ZVE32F-NEXT: vse32.v v8, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x i32>, ptr %in0, align 4
+ %1 = load <2 x i32>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x i32> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_32_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_32_i32_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vmv.v.i v0, 1
+; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vse32.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_32_i32_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vmv.v.i v0, 1
+; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vse32.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x i32>, ptr %in0, align 4
+ %1 = load <2 x i32>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x i32> %0, <2 x i32> %1, <2 x i32> <i32 1, i32 3>
+ store <2 x i32> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_0_float_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_float_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vslideup.vi v8, v9, 1
+; V-NEXT: vse32.v v8, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_float_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vslideup.vi v8, v9, 1
+; ZVE32F-NEXT: vse32.v v8, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x float>, ptr %in0, align 4
+ %1 = load <2 x float>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x float> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_32_float_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_32_float_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; V-NEXT: vle32.v v8, (a0)
+; V-NEXT: vle32.v v9, (a1)
+; V-NEXT: vmv.v.i v0, 1
+; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vse32.v v9, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_32_float_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vmv.v.i v0, 1
+; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vse32.v v9, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x float>, ptr %in0, align 4
+ %1 = load <2 x float>, ptr %in1, align 4
+ %shuffle.i5 = shufflevector <2 x float> %0, <2 x float> %1, <2 x i32> <i32 1, i32 3>
+ store <2 x float> %shuffle.i5, ptr %out, align 4
+ ret void
+}
+
+define void @vnsrl_0_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_0_i64_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; V-NEXT: vle64.v v8, (a0)
+; V-NEXT: vle64.v v9, (a1)
+; V-NEXT: vslideup.vi v8, v9, 1
+; V-NEXT: vse64.v v8, (a2)
+; V-NEXT: ret
+;
+; ZVE32F-LABEL: vnsrl_0_i64_two_source:
+; ZVE32F: # %bb.0: # %entry
+; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVE32F-NEXT: vle32.v v8, (a0)
+; ZVE32F-NEXT: vle32.v v9, (a1)
+; ZVE32F-NEXT: vslideup.vi v8, v9, 2
+; ZVE32F-NEXT: vse32.v v8, (a2)
+; ZVE32F-NEXT: ret
+entry:
+ %0 = load <2 x i64>, ptr %in0, align 8
+ %1 = load <2 x i64>, ptr %in1, align 8
+ %shuffle.i5 = shufflevector <2 x i64> %0, <2 x i64> %1, <2 x i32> <i32 0, i32 2>
+ store <2 x i64> %shuffle.i5, ptr %out, align 8
+ ret void
+}
+
+define void @vnsrl_64_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
+; V-LABEL: vnsrl_64_i64_two_source:
+; V: # %bb.0: # %entry
+; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
+; V-NEXT: vle64.v v8, (a0)
+; V-NEXT: vle64.v v9, (a1)
+; V-NEXT: vmv.v.i v0, 1
+; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vse64.v v9, (a2)
+; V-NEXT: ret
+;
+;...
[truncated]
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' f75126eeabba13ce2aab53c2e4296fca12b9da0d 0cfa4ec30987d93b7af1d281ebcfa387fcdbdc08 llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
LGTM
Nice win, that was a much simpler patch than I'd had in my head.
You might want to add deinterleave to isShuffleMaskLegal before adding the custom low LMUL lowering. I've seen some interactions there before.
We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.
Force-pushed from 30fd9e3 to 0cfa4ec.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/73/builds/13130

Here is the relevant piece of the build log for the reference:
…132123) Given this shuffle:

```
shufflevector <8 x i8> %0, <8 x i8> %1, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef>
```

#127272 lowers it with a bunch of vnsrl. If we describe the result in terms of the shuffle mask, we expect:

```
<0, 4, 8, 12, u, u, u, u>
```

but we actually got:

```
<0, 4, u, u, 8, 12, u, u>
```

for factor larger than 2. This is caused by CONCAT_VECTORS on incorrect (sub) vector types. This patch fixes the said issue by building an aggregate vector with the correct sub vector types. Fix #132071
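A worked sketch of the lane placement described above — the element labels and sub-vector widths are illustrative, inferred from the masks quoted in that description rather than taken from the patch:

```
; Sources of the factor-4, index-0 de-interleave:
;   %0 = a0 a1 a2 a3 a4 a5 a6 a7
;   %1 = b0 b1 b2 b3 b4 b5 b6 b7
;
; After the vnsrl narrowing, each source contributes only two useful lanes:
;   lo = a0 a4 u u        hi = b0 b4 u u
;
; Concatenating the two half-width (<4 x i8>) pieces puts b0/b4 at lanes 4-5:
;   a0 a4 u u b0 b4 u u   -> mask <0, 4, u, u, 8, 12, u, u>
;
; whereas the shuffle asked for them at lanes 2-3:
;   a0 a4 b0 b4 u u u u   -> mask <0, 4, 8, 12, u, u, u, u>
;
; The fix builds the result from sub vectors of the correct, narrower type
; (two defined elements per source here) so the defined lanes end up adjacent.
```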