
Commit 30fd9e3

[RISCV] Recognize de-interleave shuffles with 2 sources.
We can use vnsrl+trunc on each source and concatenate the results with vslideup. For low LMUL it would be better to concat first, but I'm leaving this for later.
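
As an illustration (a minimal sketch, not one of the tests verbatim; the function name and vector types are placeholders), the kind of two-source de-interleave this now catches is a shuffle that takes the even elements of two independent vectors:

define void @deinterleave2_two_source_sketch(ptr %in0, ptr %in1, ptr %out) {
entry:
  ; Two independent sources rather than the two halves of one wide vector.
  %a = load <8 x i8>, ptr %in0, align 1
  %b = load <8 x i8>, ptr %in1, align 1
  ; Mask values 0,2,4,6 select the even elements of %a; 8,10,12,14 select
  ; the even elements of %b in the concatenated index space.
  %res = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %res, ptr %out, align 1
  ret void
}

With this change each source is narrowed independently with vnsrl+trunc and the two half-width results are concatenated with vslideup, as the updated CHECK lines below show.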

5 files changed: +116 −164 lines changed


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 14 additions & 0 deletions
@@ -5579,6 +5579,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
           1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
         if (SDValue Src = getSingleShuffleSrc(VT, ContainerVT, V1, V2))
           return getDeinterleaveShiftAndTrunc(DL, VT, Src, Factor, Index, DAG);
+        if (1 < count_if(Mask,
+                         [&Mask](int Idx) { return Idx < (int)Mask.size(); }) &&
+            1 < count_if(Mask, [&Mask](int Idx) {
+              return Idx >= (int)Mask.size();
+            })) {
+          // Narrow each source and concatenate them.
+          // FIXME: For small LMUL it is better to concatenate first.
+          MVT HalfVT = VT.getHalfNumVectorElementsVT();
+          SDValue Lo =
+              getDeinterleaveShiftAndTrunc(DL, HalfVT, V1, Factor, Index, DAG);
+          SDValue Hi =
+              getDeinterleaveShiftAndTrunc(DL, HalfVT, V2, Factor, Index, DAG);
+          return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+        }
       }
     }
   }

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll

Lines changed: 13 additions & 21 deletions
@@ -10,34 +10,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    li a0, -256
+; CHECK-NEXT:    vlm.v v0, (a0)
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v11, v9, v9
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v12, v11, -16
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v9, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v9, 0
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v11, v11, -15
-; CHECK-NEXT:    vmerge.vim v13, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 8
+; CHECK-NEXT:    vnsrl.wi v11, v8, 0
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 8
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v13, v12, v0.t
-; CHECK-NEXT:    vrgather.vv v8, v13, v11, v0.t
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 8
+; CHECK-NEXT:    vslideup.vi v9, v8, 8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v9, 0
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
   %deinterleaved.results = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll

Lines changed: 12 additions & 19 deletions
@@ -374,19 +374,17 @@ define void @deinterleave4_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; CHECK-LABEL: deinterleave4_0_i8_two_source:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v9, (a1)
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vsll.vi v10, v10, 2
-; CHECK-NEXT:    vadd.vi v10, v10, -8
+; CHECK-NEXT:    vle8.v v8, (a1)
+; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse8.v v9, (a2)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
@@ -402,20 +400,15 @@ define void @deinterleave4_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v11, v8, v10
-; CHECK-NEXT:    vwmaccu.vx v11, a0, v10
-; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v9, 8
+; CHECK-NEXT:    vnsrl.wi v8, v8, 8
+; CHECK-NEXT:    vnsrl.wi v9, v9, 8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vnsrl.wi v9, v9, 0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v11, v0
-; CHECK-NEXT:    vse8.v v8, (a2)
+; CHECK-NEXT:    vslideup.vi v9, v8, 4
+; CHECK-NEXT:    vse8.v v9, (a2)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll

Lines changed: 64 additions & 104 deletions
@@ -556,33 +556,27 @@ define void @vnsrl_0_i8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_0_i8_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; V-NEXT:    vle8.v v8, (a0)
-; V-NEXT:    vle8.v v9, (a1)
-; V-NEXT:    vmv.v.i v0, -16
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -8
+; V-NEXT:    vle8.v v8, (a1)
+; V-NEXT:    vle8.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 0
-; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse8.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 4
+; V-NEXT:    vse8.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i8_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vle8.v v8, (a0)
-; ZVE32F-NEXT:    vle8.v v9, (a1)
-; ZVE32F-NEXT:    vmv.v.i v0, -16
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -8
+; ZVE32F-NEXT:    vle8.v v8, (a1)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
-; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse8.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 4
+; ZVE32F-NEXT:    vse8.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
@@ -596,33 +590,27 @@ define void @vnsrl_8_8_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_8_8_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; V-NEXT:    vle8.v v8, (a0)
-; V-NEXT:    vle8.v v9, (a1)
-; V-NEXT:    vmv.v.i v0, -16
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -7
+; V-NEXT:    vle8.v v8, (a1)
+; V-NEXT:    vle8.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 8
-; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse8.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 8
+; V-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 4
+; V-NEXT:    vse8.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_8_8_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; ZVE32F-NEXT:    vle8.v v8, (a0)
-; ZVE32F-NEXT:    vle8.v v9, (a1)
-; ZVE32F-NEXT:    vmv.v.i v0, -16
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -7
+; ZVE32F-NEXT:    vle8.v v8, (a1)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 8
-; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse8.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 8
+; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 4
+; ZVE32F-NEXT:    vse8.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <8 x i8>, ptr %in0, align 1
@@ -636,33 +624,27 @@ define void @vnsrl_0_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_0_i16_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vle16.v v8, (a0)
-; V-NEXT:    vle16.v v9, (a1)
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -4
-; V-NEXT:    vmv.v.i v0, 12
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 0
-; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_i16_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vle16.v v8, (a0)
-; ZVE32F-NEXT:    vle16.v v9, (a1)
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -4
-; ZVE32F-NEXT:    vmv.v.i v0, 12
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
-; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i16>, ptr %in0, align 2
@@ -678,33 +660,25 @@ define void @vnsrl_16_i16_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; V-NEXT:    vle16.v v8, (a1)
 ; V-NEXT:    vle16.v v9, (a0)
-; V-NEXT:    li a0, -1
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; V-NEXT:    vslidedown.vi v10, v8, 2
-; V-NEXT:    vwaddu.vv v11, v8, v10
-; V-NEXT:    vwmaccu.vx v11, a0, v10
-; V-NEXT:    vmv.v.i v0, 12
-; V-NEXT:    vnsrl.wi v8, v9, 16
+; V-NEXT:    vnsrl.wi v8, v8, 16
+; V-NEXT:    vnsrl.wi v9, v9, 16
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vmerge.vvm v8, v8, v11, v0
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_16_i16_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vle16.v v8, (a1)
 ; ZVE32F-NEXT:    vle16.v v9, (a0)
-; ZVE32F-NEXT:    li a0, -1
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
-; ZVE32F-NEXT:    vwaddu.vv v11, v8, v10
-; ZVE32F-NEXT:    vwmaccu.vx v11, a0, v10
-; ZVE32F-NEXT:    vmv.v.i v0, 12
-; ZVE32F-NEXT:    vnsrl.wi v8, v9, 16
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 16
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vmerge.vvm v8, v8, v11, v0
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x i16>, ptr %in0, align 2
@@ -718,33 +692,27 @@ define void @vnsrl_0_half_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-LABEL: vnsrl_0_half_two_source:
 ; V:       # %bb.0: # %entry
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vle16.v v8, (a0)
-; V-NEXT:    vle16.v v9, (a1)
-; V-NEXT:    vid.v v10
-; V-NEXT:    vadd.vv v10, v10, v10
-; V-NEXT:    vadd.vi v10, v10, -4
-; V-NEXT:    vmv.v.i v0, 12
+; V-NEXT:    vle16.v v8, (a1)
+; V-NEXT:    vle16.v v9, (a0)
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; V-NEXT:    vnsrl.wi v8, v8, 0
-; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, mu
-; V-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vnsrl.wi v9, v9, 0
+; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_0_half_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vle16.v v8, (a0)
-; ZVE32F-NEXT:    vle16.v v9, (a1)
-; ZVE32F-NEXT:    vid.v v10
-; ZVE32F-NEXT:    vadd.vv v10, v10, v10
-; ZVE32F-NEXT:    vadd.vi v10, v10, -4
-; ZVE32F-NEXT:    vmv.v.i v0, 12
+; ZVE32F-NEXT:    vle16.v v8, (a1)
+; ZVE32F-NEXT:    vle16.v v9, (a0)
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vnsrl.wi v8, v8, 0
-; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
-; ZVE32F-NEXT:    vrgather.vv v8, v9, v10, v0.t
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 0
+; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x half>, ptr %in0, align 2
@@ -760,33 +728,25 @@ define void @vnsrl_16_half_two_source(ptr %in0, ptr %in1, ptr %out) {
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; V-NEXT:    vle16.v v8, (a1)
 ; V-NEXT:    vle16.v v9, (a0)
-; V-NEXT:    li a0, -1
 ; V-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; V-NEXT:    vslidedown.vi v10, v8, 2
-; V-NEXT:    vwaddu.vv v11, v8, v10
-; V-NEXT:    vwmaccu.vx v11, a0, v10
-; V-NEXT:    vmv.v.i v0, 12
-; V-NEXT:    vnsrl.wi v8, v9, 16
+; V-NEXT:    vnsrl.wi v8, v8, 16
+; V-NEXT:    vnsrl.wi v9, v9, 16
 ; V-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
-; V-NEXT:    vmerge.vvm v8, v8, v11, v0
-; V-NEXT:    vse16.v v8, (a2)
+; V-NEXT:    vslideup.vi v9, v8, 2
+; V-NEXT:    vse16.v v9, (a2)
 ; V-NEXT:    ret
 ;
 ; ZVE32F-LABEL: vnsrl_16_half_two_source:
 ; ZVE32F:       # %bb.0: # %entry
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE32F-NEXT:    vle16.v v8, (a1)
 ; ZVE32F-NEXT:    vle16.v v9, (a0)
-; ZVE32F-NEXT:    li a0, -1
 ; ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vslidedown.vi v10, v8, 2
-; ZVE32F-NEXT:    vwaddu.vv v11, v8, v10
-; ZVE32F-NEXT:    vwmaccu.vx v11, a0, v10
-; ZVE32F-NEXT:    vmv.v.i v0, 12
-; ZVE32F-NEXT:    vnsrl.wi v8, v9, 16
+; ZVE32F-NEXT:    vnsrl.wi v8, v8, 16
+; ZVE32F-NEXT:    vnsrl.wi v9, v9, 16
 ; ZVE32F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVE32F-NEXT:    vmerge.vvm v8, v8, v11, v0
-; ZVE32F-NEXT:    vse16.v v8, (a2)
+; ZVE32F-NEXT:    vslideup.vi v9, v8, 2
+; ZVE32F-NEXT:    vse16.v v9, (a2)
 ; ZVE32F-NEXT:    ret
 entry:
   %0 = load <4 x half>, ptr %in0, align 2
