@@ -138,8 +138,282 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
ret <64 x i16> %result
}

- define <64 x i8> @interleave2x32(<32 x i8> %a, <32 x i8> %b) {
- ; SSE-LABEL: interleave2x32:
+ define <8 x double> @interleave2x4f64(<4 x double> %a, <4 x double> %b) {
+ ; SSE-LABEL: interleave2x4f64:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: movaps %xmm0, %xmm4
+ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+ ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+ ; SSE-NEXT: movaps %xmm1, %xmm2
+ ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+ ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
+ ; SSE-NEXT: movaps %xmm4, %xmm1
+ ; SSE-NEXT: retq
+ ;
+ ; AVX1-LABEL: interleave2x4f64:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
+ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+ ; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
+ ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
+ ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+ ; AVX1-NEXT: vmovapd %ymm2, %ymm1
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: interleave2x4f64:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
+ ; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
+ ; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+ ; AVX2-NEXT: vmovapd %ymm2, %ymm0
+ ; AVX2-NEXT: retq
+ %result = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x double> %result
+ }
+
+ define <8 x i64> @interleave2x4i64(<4 x i64> %a, <4 x i64> %b) {
+ ; SSE-LABEL: interleave2x4i64:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: movaps %xmm1, %xmm4
+ ; SSE-NEXT: movaps %xmm0, %xmm1
+ ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+ ; SSE-NEXT: movaps %xmm4, %xmm2
+ ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+ ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+ ; SSE-NEXT: movaps %xmm4, %xmm3
+ ; SSE-NEXT: retq
+ ;
+ ; AVX1-LABEL: interleave2x4i64:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
+ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+ ; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
+ ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
+ ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+ ; AVX1-NEXT: vmovapd %ymm2, %ymm1
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: interleave2x4i64:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
+ ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+ ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+ ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+ ; AVX2-NEXT: vmovaps %ymm2, %ymm0
+ ; AVX2-NEXT: retq
+ %result = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x i64> %result
+ }
+
+ define <16 x float> @interleave2x8f32(<8 x float> %a, <8 x float> %b) {
+ ; SSE-LABEL: interleave2x8f32:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: movaps %xmm1, %xmm4
+ ; SSE-NEXT: movaps %xmm0, %xmm1
+ ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+ ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+ ; SSE-NEXT: movaps %xmm4, %xmm2
+ ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+ ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+ ; SSE-NEXT: movaps %xmm4, %xmm3
+ ; SSE-NEXT: retq
+ ;
+ ; AVX1-LABEL: interleave2x8f32:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
+ ; AVX1-NEXT: vmovaps %ymm2, %ymm0
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: interleave2x8f32:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+ ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
+ ; AVX2-NEXT: vmovaps %ymm2, %ymm0
+ ; AVX2-NEXT: retq
+ %result = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x float> %result
+ }
+
+ define <16 x i32> @interleave2x8i32(<8 x i32> %a, <8 x i32> %b) {
+ ; SSE-LABEL: interleave2x8i32:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: movaps %xmm1, %xmm4
+ ; SSE-NEXT: movaps %xmm0, %xmm1
+ ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+ ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+ ; SSE-NEXT: movaps %xmm4, %xmm2
+ ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+ ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+ ; SSE-NEXT: movaps %xmm4, %xmm3
+ ; SSE-NEXT: retq
+ ;
+ ; AVX1-LABEL: interleave2x8i32:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
+ ; AVX1-NEXT: vmovaps %ymm2, %ymm0
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: interleave2x8i32:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+ ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+ ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
+ ; AVX2-NEXT: vmovaps %ymm2, %ymm0
+ ; AVX2-NEXT: retq
+ %result = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x i32> %result
+ }
+
+ define <32 x i16> @interleave2x16i16(<16 x i16> %a, <16 x i16> %b) {
+ ; SSE-LABEL: interleave2x16i16:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: movdqa %xmm1, %xmm4
+ ; SSE-NEXT: movdqa %xmm0, %xmm1
+ ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+ ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+ ; SSE-NEXT: movdqa %xmm4, %xmm2
+ ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+ ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+ ; SSE-NEXT: movdqa %xmm4, %xmm3
+ ; SSE-NEXT: retq
+ ;
+ ; AVX1-LABEL: interleave2x16i16:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
+ ; AVX1-NEXT: vmovaps %ymm2, %ymm0
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: interleave2x16i16:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+ ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+ ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1
+ ; AVX2-NEXT: vmovdqa %ymm2, %ymm0
+ ; AVX2-NEXT: retq
+ %result = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <32 x i16> %result
+ }
+
+ define <64 x i16> @interleave2x32i16(<32 x i16> %a, <32 x i16> %b) {
+ ; SSE-LABEL: interleave2x32i16:
+ ; SSE: # %bb.0:
+ ; SSE-NEXT: movq %rdi, %rax
+ ; SSE-NEXT: movdqa %xmm0, %xmm8
+ ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+ ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+ ; SSE-NEXT: movdqa %xmm1, %xmm4
+ ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+ ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+ ; SSE-NEXT: movdqa %xmm2, %xmm5
+ ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+ ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+ ; SSE-NEXT: movdqa %xmm3, %xmm6
+ ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+ ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+ ; SSE-NEXT: movdqa %xmm3, 112(%rdi)
+ ; SSE-NEXT: movdqa %xmm6, 96(%rdi)
+ ; SSE-NEXT: movdqa %xmm2, 80(%rdi)
+ ; SSE-NEXT: movdqa %xmm5, 64(%rdi)
+ ; SSE-NEXT: movdqa %xmm1, 48(%rdi)
+ ; SSE-NEXT: movdqa %xmm4, 32(%rdi)
+ ; SSE-NEXT: movdqa %xmm0, 16(%rdi)
+ ; SSE-NEXT: movdqa %xmm8, (%rdi)
+ ; SSE-NEXT: retq
+ ;
+ ; AVX1-LABEL: interleave2x32i16:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
+ ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
+ ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
+ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+ ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+ ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+ ; AVX1-NEXT: vmovaps %ymm4, %ymm0
+ ; AVX1-NEXT: vmovaps %ymm5, %ymm1
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: interleave2x32i16:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+ ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+ ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+ ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+ ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2
+ ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0
+ ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+ ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+ ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+ ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+ ; AVX2-NEXT: vmovdqa %ymm4, %ymm0
+ ; AVX2-NEXT: vmovdqa %ymm5, %ymm1
+ ; AVX2-NEXT: retq
+ %result = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ ret <64 x i16> %result
+ }
+
+ define <64 x i8> @interleave2x32i8(<32 x i8> %a, <32 x i8> %b) {
+ ; SSE-LABEL: interleave2x32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm1
@@ -151,7 +425,7 @@ define <64 x i8> @interleave2x32(<32 x i8> %a, <32 x i8> %b) {
; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: retq
;
- ; AVX1-LABEL: interleave2x32:
+ ; AVX1-LABEL: interleave2x32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -164,7 +438,7 @@ define <64 x i8> @interleave2x32(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
- ; AVX2-LABEL: interleave2x32:
+ ; AVX2-LABEL: interleave2x32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]