Commit dc00cbb

[RISCV] Match trunc_vector_vl+sra_vl/srl_vl with splat shift amount to vnsra/vnsrl.
Limited to splats because we would need to truncate the shift amount vector otherwise.

I tried to do this with new ISD nodes and a DAG combine to avoid such a large pattern, but we don't form the splat until LegalizeDAG, and we need a DAG combine to remove a scalable->fixed->scalable cast before the splat becomes visible to the shift node. By the time that happens, we've already visited the truncate node and won't revisit it.

I think I have an idea for how to improve i64 on RV32 that I'll save for a follow-up.

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D102019
1 parent 668dccc commit dc00cbb
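To make the splat restriction concrete, here is an illustrative pair of LLVM IR functions (hypothetical names, not part of the commit). The first is the shape the new patterns match; the second uses per-element shift amounts, which the patterns deliberately skip because vnsra.wv takes its shift amounts in narrow (SEW-wide) elements, so the <8 x i16> amount vector would first have to be truncated to <8 x i8>.

; Splat shift amount: matched by the new patterns, so this can select
; vnsra.wx with the scalar shift amount in a GPR.
define <8 x i8> @splat_shift(<8 x i16> %x, i16 %y) {
  %head = insertelement <8 x i16> undef, i16 %y, i16 0
  %amt = shufflevector <8 x i16> %head, <8 x i16> undef, <8 x i32> zeroinitializer
  %a = ashr <8 x i16> %x, %amt
  %b = trunc <8 x i16> %a to <8 x i8>
  ret <8 x i8> %b
}

; Per-element shift amounts: not matched. This is expected to lower as
; a full-width vsra.vv followed by a truncating vnsrl.wi with shift 0,
; the same fallback visible in the RV32 i64 tests below.
define <8 x i8> @per_element_shift(<8 x i16> %x, <8 x i16> %amt) {
  %a = ashr <8 x i16> %x, %amt
  %b = trunc <8 x i16> %a to <8 x i8>
  ret <8 x i8> %b
}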

File tree

2 files changed (+227, -6 lines)
llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td

Lines changed: 38 additions & 6 deletions

@@ -636,15 +636,47 @@ defm : VPatBinaryVL_VV_VX_VI<riscv_shl_vl, "PseudoVSLL", uimm5>;
 defm : VPatBinaryVL_VV_VX_VI<riscv_srl_vl, "PseudoVSRL", uimm5>;
 defm : VPatBinaryVL_VV_VX_VI<riscv_sra_vl, "PseudoVSRA", uimm5>;
 
+
+
 // 12.7. Vector Narrowing Integer Right Shift Instructions
-foreach vtiTofti = AllFractionableVF2IntVectors in {
-  defvar vti = vtiTofti.Vti;
-  defvar fti = vtiTofti.Fti;
-  def : Pat<(fti.Vector (riscv_trunc_vector_vl (vti.Vector vti.RegClass:$rs1),
+foreach vtiTowti = AllWidenableIntVectors in {
+  defvar vti = vtiTowti.Vti;
+  defvar wti = vtiTowti.Wti;
+  def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1),
                                                (vti.Mask true_mask),
                                                VLOpFrag)),
-            (!cast<Instruction>("PseudoVNSRL_WI_"#fti.LMul.MX)
-                vti.RegClass:$rs1, 0, GPR:$vl, fti.Log2SEW)>;
+            (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX)
+                wti.RegClass:$rs1, 0, GPR:$vl, vti.Log2SEW)>;
+
+  def : Pat<(vti.Vector
+            (riscv_trunc_vector_vl
+              (wti.Vector
+                (riscv_sra_vl wti.RegClass:$rs1, (SplatPat XLenVT:$rs2),
+                              true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+            (!cast<Instruction>("PseudoVNSRA_WX_"#vti.LMul.MX)
+                wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+  def : Pat<(vti.Vector
+            (riscv_trunc_vector_vl
+              (wti.Vector
+                (riscv_sra_vl wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2),
+                              true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+            (!cast<Instruction>("PseudoVNSRA_WI_"#vti.LMul.MX)
+                wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>;
+
+  def : Pat<(vti.Vector
+            (riscv_trunc_vector_vl
+              (wti.Vector
+                (riscv_srl_vl wti.RegClass:$rs1, (SplatPat XLenVT:$rs2),
+                              true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+            (!cast<Instruction>("PseudoVNSRL_WX_"#vti.LMul.MX)
+                wti.RegClass:$rs1, GPR:$rs2, GPR:$vl, vti.Log2SEW)>;
+  def : Pat<(vti.Vector
+            (riscv_trunc_vector_vl
+              (wti.Vector
+                (riscv_srl_vl wti.RegClass:$rs1, (SplatPat_uimm5 uimm5:$rs2),
+                              true_mask, VLOpFrag)), true_mask, VLOpFrag)),
+            (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX)
+                wti.RegClass:$rs1, uimm5:$rs2, GPR:$vl, vti.Log2SEW)>;
 }
 
 // 12.8. Vector Integer Comparison Instructions
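Note that the first pattern in the loop, carried over from the old fractional-LMUL formulation, still covers a bare truncate with no shift at all: it emits PseudoVNSRL_WI with an immediate of 0. A minimal sketch in LLVM IR (hypothetical function name):

; A plain narrowing truncate, no shift involved. Under the pattern
; above it selects as a narrowing logical right shift by 0:
;   vnsrl.wi v25, v8, 0
define <8 x i8> @plain_trunc(<8 x i16> %x) {
  %b = trunc <8 x i16> %x to <8 x i8>
  ret <8 x i8> %b
}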
(new test file)

Lines changed: 189 additions & 0 deletions

@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define <8 x i8> @vnsra_v8i16_v8i8_scalar(<8 x i16> %x, i16 %y) {
+; CHECK-LABEL: vnsra_v8i16_v8i8_scalar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 8, e8,mf2,ta,mu
+; CHECK-NEXT:    vnsra.wx v25, v8, a0
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %insert = insertelement <8 x i16> undef, i16 %y, i16 0
+  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %a = ashr <8 x i16> %x, %splat
+  %b = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
+define <4 x i16> @vnsra_v4i32_v4i16_scalar(<4 x i32> %x, i32 %y) {
+; CHECK-LABEL: vnsra_v4i32_v4i16_scalar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 4, e16,mf2,ta,mu
+; CHECK-NEXT:    vnsra.wx v25, v8, a0
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %insert = insertelement <4 x i32> undef, i32 %y, i32 0
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %a = ashr <4 x i32> %x, %splat
+  %b = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %b
+}
+
+define <2 x i32> @vnsra_v2i64_v2i32_scalar(<2 x i64> %x, i64 %y) {
+; RV32-LABEL: vnsra_v2i64_v2i32_scalar:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v25, (a0), zero
+; RV32-NEXT:    vsra.vv v25, v8, v25
+; RV32-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v8, v25, 0
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vnsra_v2i64_v2i32_scalar:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 2, e32,mf2,ta,mu
+; RV64-NEXT:    vnsra.wx v25, v8, a0
+; RV64-NEXT:    vmv1r.v v8, v25
+; RV64-NEXT:    ret
+  %insert = insertelement <2 x i64> undef, i64 %y, i32 0
+  %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+  %a = ashr <2 x i64> %x, %splat
+  %b = trunc <2 x i64> %a to <2 x i32>
+  ret <2 x i32> %b
+}
+
+define <8 x i8> @vnsra_v8i16_v8i8_imm(<8 x i16> %x) {
+; CHECK-LABEL: vnsra_v8i16_v8i8_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a0, 8, e8,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wi v25, v8, 8
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %a = ashr <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %b = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
+define <4 x i16> @vnsra_v4i32_v4i16_imm(<4 x i32> %x) {
+; CHECK-LABEL: vnsra_v4i32_v4i16_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wi v25, v8, 16
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %a = ashr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+  %b = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %b
+}
+
+define <2 x i32> @vnsra_v2i64_v2i32_imm(<2 x i64> %x) {
+; CHECK-LABEL: vnsra_v2i64_v2i32_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wi v25, v8, 31
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %a = ashr <2 x i64> %x, <i64 31, i64 31>
+  %b = trunc <2 x i64> %a to <2 x i32>
+  ret <2 x i32> %b
+}
+
+define <8 x i8> @vnsrl_v8i16_v8i8_scalar(<8 x i16> %x, i16 %y) {
+; CHECK-LABEL: vnsrl_v8i16_v8i8_scalar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 8, e8,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wx v25, v8, a0
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %insert = insertelement <8 x i16> undef, i16 %y, i16 0
+  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %a = lshr <8 x i16> %x, %splat
+  %b = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
+define <4 x i16> @vnsrl_v4i32_v4i16_scalar(<4 x i32> %x, i32 %y) {
+; CHECK-LABEL: vnsrl_v4i32_v4i16_scalar:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 4, e16,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wx v25, v8, a0
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %insert = insertelement <4 x i32> undef, i32 %y, i32 0
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %a = lshr <4 x i32> %x, %splat
+  %b = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %b
+}
+
+define <2 x i32> @vnsrl_v2i64_v2i32_scalar(<2 x i64> %x, i64 %y) {
+; RV32-LABEL: vnsrl_v2i64_v2i32_scalar:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw a1, 12(sp)
+; RV32-NEXT:    sw a0, 8(sp)
+; RV32-NEXT:    vsetivli a0, 2, e64,m1,ta,mu
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v25, (a0), zero
+; RV32-NEXT:    vsrl.vv v25, v8, v25
+; RV32-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; RV32-NEXT:    vnsrl.wi v8, v25, 0
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vnsrl_v2i64_v2i32_scalar:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli a1, 2, e32,mf2,ta,mu
+; RV64-NEXT:    vnsrl.wx v25, v8, a0
+; RV64-NEXT:    vmv1r.v v8, v25
+; RV64-NEXT:    ret
+  %insert = insertelement <2 x i64> undef, i64 %y, i32 0
+  %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
+  %a = lshr <2 x i64> %x, %splat
+  %b = trunc <2 x i64> %a to <2 x i32>
+  ret <2 x i32> %b
+}
+
+define <8 x i8> @vnsrl_v8i16_v8i8_imm(<8 x i16> %x) {
+; CHECK-LABEL: vnsrl_v8i16_v8i8_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a0, 8, e8,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wi v25, v8, 8
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %a = lshr <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %b = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
+define <4 x i16> @vnsrl_v4i32_v4i16_imm(<4 x i32> %x) {
+; CHECK-LABEL: vnsrl_v4i32_v4i16_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a0, 4, e16,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wi v25, v8, 16
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %a = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+  %b = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %b
+}
+
+define <2 x i32> @vnsrl_v2i64_v2i32_imm(<2 x i64> %x) {
+; CHECK-LABEL: vnsrl_v2i64_v2i32_imm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a0, 2, e32,mf2,ta,mu
+; CHECK-NEXT:    vnsrl.wi v25, v8, 31
+; CHECK-NEXT:    vmv1r.v v8, v25
+; CHECK-NEXT:    ret
+  %a = lshr <2 x i64> %x, <i64 31, i64 31>
+  %b = trunc <2 x i64> %a to <2 x i32>
+  ret <2 x i32> %b
+}
