Commit 0cfa4ec
[RISCV] Recognize de-interleave shuffles with 2 sources.
We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.
1 parent f50d787 commit 0cfa4ec
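For illustration, this is the shape of shuffle the patch now recognizes, modeled on the vnsrl_0_i8_two_source test below (the function name is hypothetical): a factor-2 de-interleave whose mask draws even elements from both sources, so neither operand can be dropped.

```llvm
; Factor-2 de-interleave with two sources: indices 0..6 select the even
; elements of %a, indices 8..14 select the even elements of %b. More than
; one element comes from each source, which is the case the new code handles.
define <8 x i8> @deinterleave2_two_source(<8 x i8> %a, <8 x i8> %b) {
  %res = shufflevector <8 x i8> %a, <8 x i8> %b,
      <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i8> %res
}
```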

5 files changed: +116 additions, -164 deletions

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 14 additions & 0 deletions
```diff
@@ -5593,6 +5593,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
       if (SDValue Src = getSingleShuffleSrc(VT, V1, V2))
         return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG);
+      if (1 < count_if(Mask,
+                       [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+          1 < count_if(Mask, [&Mask](int Idx) {
+                return Idx >= (int)Mask.size();
+              })) {
+        // Narrow each source and concatenate them.
+        // FIXME: For small LMUL it is better to concatenate first.
+        MVT HalfVT = VT.getHalfNumVectorElementsVT();
+        SDValue Lo =
+            getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG);
+        SDValue Hi =
+            getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG);
+        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+      }
     }
   }
 }
```
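Conceptually, the new branch performs two single-source de-interleaves (each of which getDeinterleaveShiftAndTrunc lowers to a vnsrl) and glues the half-width results back together with CONCAT_VECTORS (lowered to a vslideup). A sketch of the equivalent IR for the factor-2, index-0 case; the function and value names are illustrative, not compiler output:

```llvm
define <8 x i8> @concat_of_narrowed_halves(<8 x i8> %a, <8 x i8> %b) {
  ; Lo = getDeinterleaveShiftAndTrunc(HalfVT, V1, Factor=2, Index=0)
  %lo = shufflevector <8 x i8> %a, <8 x i8> poison,
      <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Hi = getDeinterleaveShiftAndTrunc(HalfVT, V2, Factor=2, Index=0)
  %hi = shufflevector <8 x i8> %b, <8 x i8> poison,
      <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; CONCAT_VECTORS(Lo, Hi): Hi is slid up above Lo in the full-width result.
  %res = shufflevector <4 x i8> %lo, <4 x i8> %hi,
      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %res
}
```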

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll

Lines changed: 13 additions & 21 deletions
```diff
@@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    li a0, -256
+; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v11, v9, v9
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v11, -16
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v9, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v9, 0
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v11, v11, -15
-; CHECK-NEXT:    vmerge.vim v13, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 8
+; CHECK-NEXT:    vnsrl.wi v11, v8, 0
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v13, v12, v0.t
-; CHECK-NEXT:    vrgather.vv v8, v13, v11, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 8
+; CHECK-NEXT:    vslideup.vi v9, v8, 8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v9, 0
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
   %deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
```

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll

Lines changed: 12 additions & 19 deletions
```diff
@@ -374,19 +374,17 @@ define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; CHECK-LABEL: deinterleave4_0_i8_two_source:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v9, (a1)
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vsll.vi v10, v10, 2
-; CHECK-NEXT:    vadd.vi v10, v10, -8
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse8.v v9, (a2)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
@@ -402,20 +400,15 @@ define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v11, v8, v10
-; CHECK-NEXT:    vwmaccu.vx v11, a0, v10
-; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v9, 8
+; CHECK-NEXT:    vnsrl.wi v8, v8, 8
+; CHECK-NEXT:    vnsrl.wi v9, v9, 8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v11, v0
-; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse8.v v9, (a2)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
```

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll

Lines changed: 64 additions & 104 deletions
```diff
@@ -589,33 +589,27 @@ define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_0_i8_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; V-NEXT:    vle8.v v8, (a0)
-; V-NEXT:    vle8.v v9, (a1)
-; V-NEXT:    vmv.v.i v0, -16
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -8
+; V-NEXT:    vle8.v v8, (a1)
+; V-NEXT:    vle8.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 0
-; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse8.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 4
+; V-NEXT:    vse8.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i8_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vle8.v v8, (a0)
-; ZVE32F-NEXT:    vle8.v v9, (a1)
-; ZVE32F-NEXT:    vmv.v.i v0, -16
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -8
+; ZVE32F-NEXT:    vle8.v v8, (a1)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
-; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse8.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 4
+; ZVE32F-NEXT:    vse8.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
@@ -629,33 +623,27 @@ define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_8_8_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; V-NEXT:    vle8.v v8, (a0)
-; V-NEXT:    vle8.v v9, (a1)
-; V-NEXT:    vmv.v.i v0, -16
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -7
+; V-NEXT:    vle8.v v8, (a1)
+; V-NEXT:    vle8.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 8
-; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse8.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 8
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 4
+; V-NEXT:    vse8.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_8_8_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vle8.v v8, (a0)
-; ZVE32F-NEXT:    vle8.v v9, (a1)
-; ZVE32F-NEXT:    vmv.v.i v0, -16
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -7
+; ZVE32F-NEXT:    vle8.v v8, (a1)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 8
-; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse8.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 8
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 4
+; ZVE32F-NEXT:    vse8.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
@@ -669,33 +657,27 @@ define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_0_i16_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vle16.v v8, (a0)
-; V-NEXT:    vle16.v v9, (a1)
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -4
-; V-NEXT:    vmv.v.i v0, 12
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 0
-; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i16_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vle16.v v8, (a0)
-; ZVE32F-NEXT:    vle16.v v9, (a1)
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -4
-; ZVE32F-NEXT:    vmv.v.i v0, 12
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
-; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i16>, ptr %in0, align 2
@@ -711,33 +693,25 @@ define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; V-NEXT:    vle16.v v8, (a1)
 ; V-NEXT:    vle16.v v9, (a0)
-; V-NEXT:    li a0, -1
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; V-NEXT:    vslidedown.vi v10, v8, 2
-; V-NEXT:    vwaddu.vv v11, v8, v10
-; V-NEXT:    vwmaccu.vx v11, a0, v10
-; V-NEXT:    vmv.v.i v0, 12
-; V-NEXT:    vnsrl.wi v8, v9, 16
+; V-NEXT:    vnsrl.wi v8, v8, 16
+; V-NEXT:    vnsrl.wi v9, v9, 16
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vmerge.vvm v8, v8, v11, v0
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_16_i16_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vle16.v v8, (a1)
 ; ZVE32F-NEXT:    vle16.v v9, (a0)
-; ZVE32F-NEXT:    li a0, -1
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
-; ZVE32F-NEXT:    vwaddu.vv v11, v8, v10
-; ZVE32F-NEXT:    vwmaccu.vx v11, a0, v10
-; ZVE32F-NEXT:    vmv.v.i v0, 12
-; ZVE32F-NEXT:    vnsrl.wi v8, v9, 16
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 16
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vmerge.vvm v8, v8, v11, v0
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i16>, ptr %in0, align 2
@@ -751,33 +725,27 @@ define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_0_half_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vle16.v v8, (a0)
-; V-NEXT:    vle16.v v9, (a1)
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -4
-; V-NEXT:    vmv.v.i v0, 12
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 0
-; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_half_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vle16.v v8, (a0)
-; ZVE32F-NEXT:    vle16.v v9, (a1)
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -4
-; ZVE32F-NEXT:    vmv.v.i v0, 12
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
-; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x half>, ptr %in0, align 2
@@ -793,33 +761,25 @@ define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; V-NEXT:    vle16.v v8, (a1)
 ; V-NEXT:    vle16.v v9, (a0)
-; V-NEXT:    li a0, -1
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; V-NEXT:    vslidedown.vi v10, v8, 2
-; V-NEXT:    vwaddu.vv v11, v8, v10
-; V-NEXT:    vwmaccu.vx v11, a0, v10
-; V-NEXT:    vmv.v.i v0, 12
-; V-NEXT:    vnsrl.wi v8, v9, 16
+; V-NEXT:    vnsrl.wi v8, v8, 16
+; V-NEXT:    vnsrl.wi v9, v9, 16
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vmerge.vvm v8, v8, v11, v0
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_16_half_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vle16.v v8, (a1)
 ; ZVE32F-NEXT:    vle16.v v9, (a0)
-; ZVE32F-NEXT:    li a0, -1
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
-; ZVE32F-NEXT:    vwaddu.vv v11, v8, v10
-; ZVE32F-NEXT:    vwmaccu.vx v11, a0, v10
-; ZVE32F-NEXT:    vmv.v.i v0, 12
-; ZVE32F-NEXT:    vnsrl.wi v8, v9, 16
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 16
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vmerge.vvm v8, v8, v11, v0
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x half>, ptr %in0, align 2
```
