@@ -240,21 +240,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
240
240
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
241
241
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
242
242
; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
243
- ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
244
- ; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
243
+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
245
244
; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
246
- ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
247
- ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
248
- ; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1]
245
+ ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
249
246
; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
250
- ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
251
247
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
252
248
; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
253
249
; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
254
250
; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
255
251
; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
256
252
; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
257
- ; AVX512-FCP-NEXT: vmovq %xmm1, (%rax)
253
+ ; AVX512-FCP-NEXT: vmovq %xmm8, (%rax)
258
254
; AVX512-FCP-NEXT: vzeroupper
259
255
; AVX512-FCP-NEXT: retq
260
256
;
@@ -309,21 +305,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
309
305
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
310
306
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
311
307
; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
312
- ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
313
- ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
308
+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
314
309
; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
315
- ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
316
- ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
317
- ; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1]
310
+ ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
318
311
; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
319
- ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
320
312
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
321
313
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
322
314
; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
323
315
; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
324
316
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
325
317
; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
326
- ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax)
318
+ ; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rax)
327
319
; AVX512DQ-FCP-NEXT: vzeroupper
328
320
; AVX512DQ-FCP-NEXT: retq
329
321
;
@@ -378,21 +370,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
378
370
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
379
371
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
380
372
; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
381
- ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
382
- ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
373
+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
383
374
; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
384
- ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
385
- ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
386
- ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
375
+ ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
387
376
; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
388
- ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
389
377
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
390
378
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
391
379
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
392
380
; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
393
381
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
394
382
; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
395
- ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
383
+ ; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax)
396
384
; AVX512BW-FCP-NEXT: vzeroupper
397
385
; AVX512BW-FCP-NEXT: retq
398
386
;
@@ -447,21 +435,17 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
447
435
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
448
436
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6
449
437
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
450
- ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
451
- ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
438
+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [13,4,6,7]
452
439
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
453
- ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
454
- ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
455
- ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
440
+ ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,13,6,7]
456
441
; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
457
- ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
458
442
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
459
443
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
460
444
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
461
445
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
462
446
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
463
447
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
464
- ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
448
+ ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax)
465
449
; AVX512DQ-BW-FCP-NEXT: vzeroupper
466
450
; AVX512DQ-BW-FCP-NEXT: retq
467
451
%wide.vec = load <14 x i32>, ptr %in.vec, align 64
0 commit comments