Commit 03d8529

[X86] Add test coverage for i512 shift-by-constants
Based off llvm#132601 - pass the i512 types inside 512-bit vectors. Shows several missed general codegen issues whose fixes will help in a lot more cases than just this one.
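The pattern under test wraps an i512 shift by a constant inside <8 x i64> bitcasts, so the wide integer never appears as a scalar argument. For reference, the shl case from the new file looks like this:

define <8 x i64> @shl_i512_1(<8 x i64> %a) {
  %d = bitcast <8 x i64> %a to i512
  %s = shl i512 %d, 1
  %r = bitcast i512 %s to <8 x i64>
  ret <8 x i64> %r
}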
1 parent f8416fc commit 03d8529

File tree

1 file changed: +359 -0 lines changed

llvm/test/CodeGen/X86/shift-i512.ll

Lines changed: 359 additions & 0 deletions
@@ -0,0 +1,359 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s -check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+avx512vbmi2 | FileCheck %s -check-prefixes=AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s -check-prefixes=ZNVER4

; i512 shifts hidden inside 512-bit vectors.

define <8 x i64> @shl_i512_1(<8 x i64> %a) {
; AVX512VL-LABEL: shl_i512_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm4
; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm5
; AVX512VL-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
; AVX512VL-NEXT: vpsrlq $63, %ymm5, %ymm5
; AVX512VL-NEXT: vpor %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm5
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpsrlq $63, %xmm6, %xmm7
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm8
; AVX512VL-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512VL-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX512VL-NEXT: vpaddq %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
; AVX512VL-NEXT: vpaddq %xmm7, %xmm7, %xmm7
; AVX512VL-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddq %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX512VL-NEXT: vpaddq %xmm7, %xmm7, %xmm7
; AVX512VL-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpsrlq $63, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm0, %zmm5, %zmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm4[0],zmm0[0],zmm4[2],zmm0[2],zmm4[4],zmm0[4],zmm4[6],zmm0[6]
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512_1:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm4, %xmm4
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $1, %xmm2, %xmm5, %xmm6
; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $1, %xmm1, %xmm6, %xmm7
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $1, %xmm0, %xmm8, %xmm9
; AVX512VBMI-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4
; AVX512VBMI-NEXT: vpshldq $1, %xmm8, %xmm1, %xmm1
; AVX512VBMI-NEXT: vpsllq $1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VBMI-NEXT: vpshldq $1, %xmm5, %xmm3, %xmm1
; AVX512VBMI-NEXT: vpshldq $1, %xmm6, %xmm2, %xmm2
; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm4[0],zmm0[2],zmm4[2],zmm0[4],zmm4[4],zmm0[6],zmm4[6]
; AVX512VBMI-NEXT: retq
;
; ZNVER4-LABEL: shl_i512_1:
; ZNVER4: # %bb.0:
; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; ZNVER4-NEXT: vmovq %xmm0, %rdx
; ZNVER4-NEXT: vpextrq $1, %xmm0, %r9
; ZNVER4-NEXT: vpextrq $1, %xmm1, %rax
; ZNVER4-NEXT: vmovq %xmm1, %rcx
; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; ZNVER4-NEXT: shrq $63, %rdx
; ZNVER4-NEXT: vpextrq $1, %xmm1, %rsi
; ZNVER4-NEXT: vmovq %xmm1, %rdi
; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
; ZNVER4-NEXT: leaq (%rdx,%r9,2), %rdx
; ZNVER4-NEXT: shrq $63, %r9
; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm0
; ZNVER4-NEXT: vmovq %xmm1, %r10
; ZNVER4-NEXT: vpextrq $1, %xmm1, %r8
; ZNVER4-NEXT: leaq (%r9,%r10,2), %r9
; ZNVER4-NEXT: shrq $63, %r10
; ZNVER4-NEXT: vmovq %rdx, %xmm4
; ZNVER4-NEXT: leaq (%r10,%r8,2), %r10
; ZNVER4-NEXT: shrq $63, %r8
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; ZNVER4-NEXT: leaq (%r8,%rdi,2), %r8
; ZNVER4-NEXT: shrq $63, %rdi
; ZNVER4-NEXT: leaq (%rdi,%rsi,2), %rdi
; ZNVER4-NEXT: shrq $63, %rsi
; ZNVER4-NEXT: leaq (%rsi,%rcx,2), %rsi
; ZNVER4-NEXT: shrq $63, %rcx
; ZNVER4-NEXT: vmovq %r8, %xmm3
; ZNVER4-NEXT: leaq (%rcx,%rax,2), %rax
; ZNVER4-NEXT: vmovq %rsi, %xmm2
; ZNVER4-NEXT: vmovq %rax, %xmm1
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ZNVER4-NEXT: vmovq %rdi, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; ZNVER4-NEXT: vmovq %r10, %xmm3
; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; ZNVER4-NEXT: vmovq %r9, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ZNVER4-NEXT: retq
%d = bitcast <8 x i64> %a to i512
%s = shl i512 %d, 1
%r = bitcast i512 %s to <8 x i64>
ret <8 x i64> %r
}

define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
; AVX512VL-LABEL: lshr_i512_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5
; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX512VL-NEXT: vpsrlq $1, %xmm3, %xmm3
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512_1:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm3, %xmm4, %xmm5
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm6, %xmm7
; AVX512VBMI-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm1, %xmm7, %xmm8
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm0, %xmm9, %xmm0
; AVX512VBMI-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
; AVX512VBMI-NEXT: vpshldq $63, %xmm7, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpshldq $63, %xmm9, %xmm1, %xmm1
; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshldq $63, %xmm6, %xmm3, %xmm2
; AVX512VBMI-NEXT: vpsrlq $1, %xmm4, %xmm3
; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512VBMI-NEXT: retq
;
; ZNVER4-LABEL: lshr_i512_1:
; ZNVER4: # %bb.0:
; ZNVER4-NEXT: pushq %rbx
; ZNVER4-NEXT: .cfi_def_cfa_offset 16
; ZNVER4-NEXT: .cfi_offset %rbx, -16
; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
; ZNVER4-NEXT: vmovq %xmm0, %r10
; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
; ZNVER4-NEXT: vmovq %xmm1, %r9
; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; ZNVER4-NEXT: shrq %r10
; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
; ZNVER4-NEXT: vmovq %xmm0, %rdx
; ZNVER4-NEXT: vmovq %xmm1, %rdi
; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; ZNVER4-NEXT: movq %rdx, %r8
; ZNVER4-NEXT: shrq %r8
; ZNVER4-NEXT: shlq $63, %rax
; ZNVER4-NEXT: movq %rdi, %rbx
; ZNVER4-NEXT: shrq %rbx
; ZNVER4-NEXT: shlq $63, %rdx
; ZNVER4-NEXT: shlq $63, %rdi
; ZNVER4-NEXT: vpsrlq $1, %xmm0, %xmm0
; ZNVER4-NEXT: orq %r8, %rax
; ZNVER4-NEXT: movq %r11, %r8
; ZNVER4-NEXT: shlq $63, %r8
; ZNVER4-NEXT: shrq %r11
; ZNVER4-NEXT: orq %rbx, %r8
; ZNVER4-NEXT: movq %r9, %rbx
; ZNVER4-NEXT: orq %r11, %rdx
; ZNVER4-NEXT: movq %rsi, %r11
; ZNVER4-NEXT: shrq %r11
; ZNVER4-NEXT: shlq $63, %rbx
; ZNVER4-NEXT: shrq %r9
; ZNVER4-NEXT: shlq $63, %rsi
; ZNVER4-NEXT: vmovq %rax, %xmm4
; ZNVER4-NEXT: orq %r11, %rbx
; ZNVER4-NEXT: movq %rcx, %r11
; ZNVER4-NEXT: shlq $63, %r11
; ZNVER4-NEXT: shrq %rcx
; ZNVER4-NEXT: orq %r10, %rsi
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
; ZNVER4-NEXT: orq %r9, %r11
; ZNVER4-NEXT: orq %rdi, %rcx
; ZNVER4-NEXT: vmovq %rbx, %xmm3
; ZNVER4-NEXT: vmovq %rcx, %xmm1
; ZNVER4-NEXT: vmovq %r11, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ZNVER4-NEXT: vmovq %rsi, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ZNVER4-NEXT: vmovq %r8, %xmm3
; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; ZNVER4-NEXT: vmovq %rdx, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ZNVER4-NEXT: popq %rbx
; ZNVER4-NEXT: .cfi_def_cfa_offset 8
; ZNVER4-NEXT: retq
%d = bitcast <8 x i64> %a to i512
%s = lshr i512 %d, 1
%r = bitcast i512 %s to <8 x i64>
ret <8 x i64> %r
}

define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
; AVX512VL-LABEL: ashr_i512_1:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5
; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX512VL-NEXT: vpsraq $1, %xmm3, %xmm3
; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512_1:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm3, %xmm4, %xmm5
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm6, %xmm7
; AVX512VBMI-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm1, %xmm7, %xmm8
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm0, %xmm9, %xmm0
; AVX512VBMI-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
; AVX512VBMI-NEXT: vpshldq $63, %xmm7, %xmm2, %xmm2
; AVX512VBMI-NEXT: vpshldq $63, %xmm9, %xmm1, %xmm1
; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512VBMI-NEXT: vpshldq $63, %xmm6, %xmm3, %xmm2
; AVX512VBMI-NEXT: vpsraq $1, %xmm4, %xmm3
; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512VBMI-NEXT: retq
;
; ZNVER4-LABEL: ashr_i512_1:
; ZNVER4: # %bb.0:
; ZNVER4-NEXT: pushq %rbx
; ZNVER4-NEXT: .cfi_def_cfa_offset 16
; ZNVER4-NEXT: .cfi_offset %rbx, -16
; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
; ZNVER4-NEXT: vmovq %xmm0, %r10
; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
; ZNVER4-NEXT: vmovq %xmm1, %r9
; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; ZNVER4-NEXT: shrq %r10
; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
; ZNVER4-NEXT: vmovq %xmm0, %rdx
; ZNVER4-NEXT: vmovq %xmm1, %rdi
; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; ZNVER4-NEXT: movq %rdx, %r8
; ZNVER4-NEXT: shrq %r8
; ZNVER4-NEXT: shlq $63, %rax
; ZNVER4-NEXT: movq %rdi, %rbx
; ZNVER4-NEXT: shrq %rbx
; ZNVER4-NEXT: shlq $63, %rdx
; ZNVER4-NEXT: shlq $63, %rdi
; ZNVER4-NEXT: vpsraq $1, %xmm0, %xmm0
; ZNVER4-NEXT: orq %r8, %rax
; ZNVER4-NEXT: movq %r11, %r8
; ZNVER4-NEXT: shlq $63, %r8
; ZNVER4-NEXT: shrq %r11
; ZNVER4-NEXT: orq %rbx, %r8
; ZNVER4-NEXT: movq %r9, %rbx
; ZNVER4-NEXT: orq %r11, %rdx
; ZNVER4-NEXT: movq %rsi, %r11
; ZNVER4-NEXT: shrq %r11
; ZNVER4-NEXT: shlq $63, %rbx
; ZNVER4-NEXT: shrq %r9
; ZNVER4-NEXT: shlq $63, %rsi
; ZNVER4-NEXT: vmovq %rax, %xmm4
; ZNVER4-NEXT: orq %r11, %rbx
; ZNVER4-NEXT: movq %rcx, %r11
; ZNVER4-NEXT: shlq $63, %r11
; ZNVER4-NEXT: shrq %rcx
; ZNVER4-NEXT: orq %r10, %rsi
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
; ZNVER4-NEXT: orq %r9, %r11
; ZNVER4-NEXT: orq %rdi, %rcx
; ZNVER4-NEXT: vmovq %rbx, %xmm3
; ZNVER4-NEXT: vmovq %rcx, %xmm1
; ZNVER4-NEXT: vmovq %r11, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; ZNVER4-NEXT: vmovq %rsi, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; ZNVER4-NEXT: vmovq %r8, %xmm3
; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; ZNVER4-NEXT: vmovq %rdx, %xmm2
; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ZNVER4-NEXT: popq %rbx
; ZNVER4-NEXT: .cfi_def_cfa_offset 8
; ZNVER4-NEXT: retq
%d = bitcast <8 x i64> %a to i512
%s = ashr i512 %d, 1
%r = bitcast i512 %s to <8 x i64>
ret <8 x i64> %r
}
