Commit 0c7f8f2

added test cases

1 parent 0bdbc64 commit 0c7f8f2

1 file changed
Lines changed: 306 additions & 0 deletions

@@ -0,0 +1,306 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,-avx,-avx2 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+sse4a | FileCheck %s --check-prefix=ZNVER1
;
; Check that a variable (non-uniform) shift of an i8 vector is permuted into a widened shift.
;
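; A rough sketch of the expected SSSE3 pattern (illustrative only; the operand
; names below are placeholders, not checked output):
;   pshufb  pairing, x    ; group bytes that share a shift amount into one word
;   pmullw  pow2, x       ; one widened multiply shifts both bytes of each word
;   pshufb  unpairing, x  ; restore the original byte order
;   pand    mask, x       ; clear bits that crossed a byte boundary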

; The transform only occurs on SSSE3 because the operand is not a shuffle and the
; shift amounts cannot be rearranged into quads. The correctness of the untransformed
; variants is not checked here, as it is covered by other vector shift tests.
define <16 x i8> @shl_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: shl_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [8,1,2,12,4,5,6,7,0,9,10,11,3,13,14,15]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4,1,1,8,1,16,32]
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: shl_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: shl_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <16 x i8> %a, <i8 3, i8 0, i8 2, i8 4, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 3, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 5>
  ret <16 x i8> %shift
}
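
; In the SSSE3 sequence above each 16-bit pmullw multiplier is a power of two,
; 1 << k, so a single widened multiply left-shifts both bytes of its word by k:
; the multiplier 4 covers the shl-by-2 lane and 32 covers the shl-by-5 lane, and
; the trailing pand clears the bits that cross a byte boundary inside each word.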

define <16 x i8> @lshr_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: lshr_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[2,1,4,3,6,5,8,7,10,9,12,11,14,13,0,15]
; SSSE3-NEXT: pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,2048,8192,16384,32768,8192,2048,4096]
; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15]
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: lshr_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: lshr_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = lshr <16 x i8> %a, <i8 4, i8 2, i8 2, i8 5, i8 5, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 3, i8 3, i8 5, i8 5, i8 4>
  ret <16 x i8> %shift
}
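
; The logical right shift above uses pmulhuw, which keeps the high 16 bits of the
; product: multiplying by 1 << (16 - k) leaves x >> k, so the 16384 (1 << 14)
; multiplier is the lshr-by-2 lane and 2048 (1 << 11) is the lshr-by-5 lane.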

define <16 x i8> @ashr_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: ashr_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,9,11,7,8,13,10,6,1,14,5,15]
; SSSE3-NEXT: pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,8192,512,8192,4096,1024,32768,2048]
; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,12,2,3,4,14,11,7,8,5,10,6,1,9,13,15]
; SSSE3-NEXT: pand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [32,64,16,16,1,4,2,16,8,1,u,16,32,8,64,4]
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: psubb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: ashr_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: ashr_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = ashr <16 x i8> %a, <i8 2, i8 1, i8 3, i8 3, i8 7, i8 5, i8 6, i8 3, i8 4, i8 7, i8 undef, i8 3, i8 2, i8 4, i8 1, i8 5>
  ret <16 x i8> %shift
}
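
; The arithmetic shift is built from the logical one: with m = 0x80 >> k for a
; byte shifted by k (the xmm1 constant above), (x ^ m) - m sign-extends the
; shifted value, which is what the trailing pxor/psubb pair does.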

; Shift amounts cannot be paired.
define <16 x i8> @not_shl_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: not_shl_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NOT: pshufb
; SSSE3-NOT: vpshufb
; SSSE3: retq
;
; AVX-LABEL: not_shl_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: not_shl_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <16 x i8> %a, <i8 2, i8 1, i8 3, i8 0, i8 7, i8 5, i8 6, i8 4, i8 2, i8 1, i8 3, i8 0, i8 7, i8 5, i8 6, i8 5>
  ret <16 x i8> %shift
}

; The right-shift amounts contain a zero and cannot form quads.
define <16 x i8> @not_lshr_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: not_lshr_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NOT: pshufb
; SSSE3-NOT: vpshufb
; SSSE3: retq
;
; AVX-LABEL: not_lshr_v16i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: not_lshr_v16i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = lshr <16 x i8> %a, <i8 4, i8 2, i8 2, i8 5, i8 5, i8 3, i8 3, i8 2, i8 2, i8 1, i8 1, i8 0, i8 0, i8 5, i8 5, i8 4>
  ret <16 x i8> %shift
}

; The shift amounts cannot form quads and the operand is not a shuffle, so the
; transform only occurs on SSSE3.
define <32 x i8> @shl_v32i8(<32 x i8> %a) {
; SSSE3-LABEL: shl_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # xmm2 = [0,2,1,3,6,5,4,7,8,9,12,11,10,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # xmm3 = [1,4,8,2,16,32,64,16]
; SSSE3-NEXT: pmullw %xmm3, %xmm0
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # xmm4 = [255,252,255,252,254,248,248,254,240,240,192,224,224,192,240,240]
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pmullw %xmm3, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; AVX-LABEL: shl_v32i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: shl_v32i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 0, i8 2, i8 1, i8 3, i8 3, i8 1, i8 4, i8 4, i8 6, i8 5, i8 5, i8 6, i8 4, i8 4,
                              i8 0, i8 2, i8 0, i8 2, i8 1, i8 3, i8 3, i8 1, i8 4, i8 4, i8 6, i8 5, i8 5, i8 6, i8 4, i8 4>
  ret <32 x i8> %shift
}

; Quads are only tested on AVX2, which has the variable dword shifts (vps**vd).
define <32 x i8> @shl_v32i8_quad(<32 x i8> %a) {
; AVX2-LABEL: shl_v32i8_quad:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; AVX2-NEXT: vpsllvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: shl_v32i8_quad:
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; ZNVER1-NEXT: vpsllvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; ZNVER1-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: retq
  %shift = shl <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                              i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
  ret <32 x i8> %shift
}
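
; In the quad form above, the first vpshufb gathers four bytes that share a shift
; amount into one dword (for example bytes 0, 5, 13 and 9, which all shift by 0,
; form the first dword), vpsllvd shifts each dword by that common amount, the
; second vpshufb restores byte order, and vpand clears bits that crossed a byte
; boundary.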

define <32 x i8> @lshr_v32i8_quad(<32 x i8> %a) {
; AVX2-LABEL: lshr_v32i8_quad:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; AVX2-NEXT: vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: lshr_v32i8_quad:
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; ZNVER1-NEXT: vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; ZNVER1-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ZNVER1-NEXT: retq
  %shift = lshr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
  ret <32 x i8> %shift
}

; The transform is disabled for AMD Zen because it can schedule two vpmullw
; instructions two cycles faster than Intel can.
define <32 x i8> @ashr_v32i8_quad(<32 x i8> %a) {
; AVX2-LABEL: ashr_v32i8_quad:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,5,13,9,3,6,12,11,2,4,10,14,1,7,8,15,25,29,18,22,24,28,19,23,17,21,26,30,16,20,27,31]
; AVX2-NEXT: vpsrlvd {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # ymm0 = ymm0[0,12,8,4,9,1,5,13,14,3,10,7,6,2,11,15,28,24,18,22,29,25,19,23,20,16,26,30,21,17,27,31]
; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 # ymm1 = [128,32,8,2,8,128,2,32,32,128,8,2,2,128,8,32,64,16,4,1,64,16,4,1,1,4,16,64,1,4,16,64]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: ashr_v32i8_quad:
; ZNVER1: # %bb.0:
; ZNVER1-NOT: pshufb
; ZNVER1-NOT: vpshufb
; ZNVER1: retq
  %shift = ashr <32 x i8> %a, <i8 0, i8 2, i8 4, i8 6, i8 4, i8 0, i8 6, i8 2, i8 2, i8 0, i8 4, i8 6, i8 6, i8 0, i8 4, i8 2,
                               i8 1, i8 3, i8 5, i8 7, i8 1, i8 3, i8 5, i8 7, i8 7, i8 5, i8 3, i8 1, i8 7, i8 5, i8 3, i8 1>
  ret <32 x i8> %shift
}

; The shift amounts cannot be paired within a lane.
define <32 x i8> @not_shl_v32i8(<32 x i8> %a) {
; SSSE3-LABEL: not_shl_v32i8:
; SSSE3: # %bb.0:
; SSSE3-NOT: pshufb
; SSSE3-NOT: vpshufb
; SSSE3: retq
;
; AVX-LABEL: not_shl_v32i8:
; AVX: # %bb.0:
; AVX-NOT: pshufb
; AVX-NOT: vpshufb
; AVX: retq
;
; AVX2-LABEL: not_shl_v32i8:
; AVX2: # %bb.0:
; AVX2-NOT: pshufb
; AVX2-NOT: vpshufb
; AVX2: retq
  %shift = shl <32 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3,
                              i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}

; Always transform when the operand is a shuffle and the shift amounts can be paired.
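; When the source is already a shuffle, the pairing permutation can presumably be
; folded into it, so the checks below expect a single leading byte shuffle on every
; subtarget, including ZNVER1.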
define <16 x i8> @lshr_shuffle_v16i8(<16 x i8> %a) {
; SSSE3-LABEL: lshr_shuffle_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; SSSE3-NEXT: pmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; SSSE3-NEXT: pshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; SSSE3-NEXT: pand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: movdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: psubb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: lshr_shuffle_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; AVX-NEXT: vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; AVX-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; AVX-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX2-LABEL: lshr_shuffle_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; AVX2-NEXT: vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; AVX2-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; AVX2-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; ZNVER1-LABEL: lshr_shuffle_v16i8:
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,8,4,12,1,9,5,13,2,10,6,14,3,11,7,15]
; ZNVER1-NEXT: vpmulhuw {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,16384,8192,8192,4096,4096,2048]
; ZNVER1-NEXT: vpshufb {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # xmm0 = xmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
; ZNVER1-NEXT: vpand {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; ZNVER1-NEXT: vmovdqa {{\.LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # xmm1 = [64,32,64,32,32,16,32,16,16,8,16,8,8,4,8,4]
; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; ZNVER1-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  %shift = ashr <16 x i8> %shuffle, <i8 1, i8 2, i8 1, i8 2, i8 2, i8 3, i8 2, i8 3, i8 3, i8 4, i8 3, i8 4, i8 4, i8 5, i8 4, i8 5>
  ret <16 x i8> %shift
}
