@@ -48,46 +48,20 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
;
; ZNVER4-LABEL: shl_i512_1:
; ZNVER4: # %bb.0:
- ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm1
- ; ZNVER4-NEXT: vmovq %xmm0, %rdx
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %r9
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rax
- ; ZNVER4-NEXT: vmovq %xmm1, %rcx
; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
- ; ZNVER4-NEXT: shrq $63, %rdx
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rsi
- ; ZNVER4-NEXT: vmovq %xmm1, %rdi
- ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
- ; ZNVER4-NEXT: leaq (%rdx,%r9,2), %rdx
- ; ZNVER4-NEXT: shrq $63, %r9
- ; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm0
- ; ZNVER4-NEXT: vmovq %xmm1, %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %r8
- ; ZNVER4-NEXT: leaq (%r9,%r10,2), %r9
- ; ZNVER4-NEXT: shrq $63, %r10
- ; ZNVER4-NEXT: vmovq %rdx, %xmm4
- ; ZNVER4-NEXT: leaq (%r10,%r8,2), %r10
- ; ZNVER4-NEXT: shrq $63, %r8
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
- ; ZNVER4-NEXT: leaq (%r8,%rdi,2), %r8
- ; ZNVER4-NEXT: shrq $63, %rdi
- ; ZNVER4-NEXT: leaq (%rdi,%rsi,2), %rdi
- ; ZNVER4-NEXT: shrq $63, %rsi
- ; ZNVER4-NEXT: leaq (%rsi,%rcx,2), %rsi
- ; ZNVER4-NEXT: shrq $63, %rcx
- ; ZNVER4-NEXT: vmovq %r8, %xmm3
- ; ZNVER4-NEXT: leaq (%rcx,%rax,2), %rax
- ; ZNVER4-NEXT: vmovq %rsi, %xmm2
- ; ZNVER4-NEXT: vmovq %rax, %xmm1
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
- ; ZNVER4-NEXT: vmovq %rdi, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
- ; ZNVER4-NEXT: vmovq %r10, %xmm3
+ ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+ ; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4
; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
- ; ZNVER4-NEXT: vmovq %r9, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
- ; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
- ; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+ ; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3
+ ; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
+ ; ZNVER4-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+ ; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+ ; ZNVER4-NEXT: vpshldq $1, %zmm0, %zmm3, %zmm0
+ ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
; ZNVER4-NEXT: retq
%d = bitcast <8 x i64> %a to i512
%s = shl i512 %d, 1
@@ -142,65 +116,21 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
;
; ZNVER4-LABEL: lshr_i512_1:
; ZNVER4: # %bb.0:
- ; ZNVER4-NEXT: pushq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 16
- ; ZNVER4-NEXT: .cfi_offset %rbx, -16
+ ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
- ; ZNVER4-NEXT: vmovq %xmm0, %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
- ; ZNVER4-NEXT: vmovq %xmm1, %r9
- ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
- ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
- ; ZNVER4-NEXT: shrq %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
- ; ZNVER4-NEXT: vmovq %xmm0, %rdx
- ; ZNVER4-NEXT: vmovq %xmm1, %rdi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
- ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
- ; ZNVER4-NEXT: movq %rdx, %r8
- ; ZNVER4-NEXT: shrq %r8
- ; ZNVER4-NEXT: shlq $63, %rax
- ; ZNVER4-NEXT: movq %rdi, %rbx
- ; ZNVER4-NEXT: shrq %rbx
- ; ZNVER4-NEXT: shlq $63, %rdx
- ; ZNVER4-NEXT: shlq $63, %rdi
- ; ZNVER4-NEXT: vpsrlq $1, %xmm0, %xmm0
- ; ZNVER4-NEXT: orq %r8, %rax
- ; ZNVER4-NEXT: movq %r11, %r8
- ; ZNVER4-NEXT: shlq $63, %r8
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: orq %rbx, %r8
- ; ZNVER4-NEXT: movq %r9, %rbx
- ; ZNVER4-NEXT: orq %r11, %rdx
- ; ZNVER4-NEXT: movq %rsi, %r11
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: shlq $63, %rbx
- ; ZNVER4-NEXT: shrq %r9
- ; ZNVER4-NEXT: shlq $63, %rsi
- ; ZNVER4-NEXT: vmovq %rax, %xmm4
- ; ZNVER4-NEXT: orq %r11, %rbx
- ; ZNVER4-NEXT: movq %rcx, %r11
- ; ZNVER4-NEXT: shlq $63, %r11
- ; ZNVER4-NEXT: shrq %rcx
- ; ZNVER4-NEXT: orq %r10, %rsi
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
- ; ZNVER4-NEXT: orq %r9, %r11
- ; ZNVER4-NEXT: orq %rdi, %rcx
- ; ZNVER4-NEXT: vmovq %rbx, %xmm3
- ; ZNVER4-NEXT: vmovq %rcx, %xmm1
- ; ZNVER4-NEXT: vmovq %r11, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
- ; ZNVER4-NEXT: vmovq %rsi, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
- ; ZNVER4-NEXT: vmovq %r8, %xmm3
- ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
- ; ZNVER4-NEXT: vmovq %rdx, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
- ; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
- ; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
- ; ZNVER4-NEXT: popq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 8
+ ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+ ; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+ ; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+ ; ZNVER4-NEXT: vpsrlq $1, %xmm2, %xmm2
+ ; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+ ; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+ ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; ZNVER4-NEXT: retq
%d = bitcast <8 x i64> %a to i512
%s = lshr i512 %d, 1
@@ -255,65 +185,21 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
;
; ZNVER4-LABEL: ashr_i512_1:
; ZNVER4: # %bb.0:
- ; ZNVER4-NEXT: pushq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 16
- ; ZNVER4-NEXT: .cfi_offset %rbx, -16
+ ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
- ; ZNVER4-NEXT: vmovq %xmm0, %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rsi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %rcx
- ; ZNVER4-NEXT: vmovq %xmm1, %r9
- ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
- ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm0
- ; ZNVER4-NEXT: shrq %r10
- ; ZNVER4-NEXT: vpextrq $1, %xmm0, %rax
- ; ZNVER4-NEXT: vmovq %xmm0, %rdx
- ; ZNVER4-NEXT: vmovq %xmm1, %rdi
- ; ZNVER4-NEXT: vpextrq $1, %xmm1, %r11
- ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
- ; ZNVER4-NEXT: movq %rdx, %r8
- ; ZNVER4-NEXT: shrq %r8
- ; ZNVER4-NEXT: shlq $63, %rax
- ; ZNVER4-NEXT: movq %rdi, %rbx
- ; ZNVER4-NEXT: shrq %rbx
- ; ZNVER4-NEXT: shlq $63, %rdx
- ; ZNVER4-NEXT: shlq $63, %rdi
- ; ZNVER4-NEXT: vpsraq $1, %xmm0, %xmm0
- ; ZNVER4-NEXT: orq %r8, %rax
- ; ZNVER4-NEXT: movq %r11, %r8
- ; ZNVER4-NEXT: shlq $63, %r8
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: orq %rbx, %r8
- ; ZNVER4-NEXT: movq %r9, %rbx
- ; ZNVER4-NEXT: orq %r11, %rdx
- ; ZNVER4-NEXT: movq %rsi, %r11
- ; ZNVER4-NEXT: shrq %r11
- ; ZNVER4-NEXT: shlq $63, %rbx
- ; ZNVER4-NEXT: shrq %r9
- ; ZNVER4-NEXT: shlq $63, %rsi
- ; ZNVER4-NEXT: vmovq %rax, %xmm4
- ; ZNVER4-NEXT: orq %r11, %rbx
- ; ZNVER4-NEXT: movq %rcx, %r11
- ; ZNVER4-NEXT: shlq $63, %r11
- ; ZNVER4-NEXT: shrq %rcx
- ; ZNVER4-NEXT: orq %r10, %rsi
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
- ; ZNVER4-NEXT: orq %r9, %r11
- ; ZNVER4-NEXT: orq %rdi, %rcx
- ; ZNVER4-NEXT: vmovq %rbx, %xmm3
- ; ZNVER4-NEXT: vmovq %rcx, %xmm1
- ; ZNVER4-NEXT: vmovq %r11, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
- ; ZNVER4-NEXT: vmovq %rsi, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
- ; ZNVER4-NEXT: vmovq %r8, %xmm3
- ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
- ; ZNVER4-NEXT: vmovq %rdx, %xmm2
- ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
- ; ZNVER4-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
- ; ZNVER4-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
- ; ZNVER4-NEXT: popq %rbx
- ; ZNVER4-NEXT: .cfi_def_cfa_offset 8
+ ; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+ ; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+ ; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+ ; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+ ; ZNVER4-NEXT: vpsraq $1, %xmm2, %xmm2
+ ; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
+ ; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+ ; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+ ; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; ZNVER4-NEXT: retq
%d = bitcast <8 x i64> %a to i512
%s = ashr i512 %d, 1