@@ -436,6 +436,157 @@ entry:
 unreachable
 }

+define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double>* noalias %out0, <4 x double>* noalias %out1, <4 x double>* noalias %out2) {
+; X86-AVX1-LABEL: PR48908:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
+; X86-AVX1-NEXT: vmovapd %ymm4, (%edx)
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X86-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX1-NEXT: vmovapd %ymm3, (%ecx)
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X86-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X86-AVX1-NEXT: vmovapd %ymm0, (%eax)
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: PR48908:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)
+; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
+; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
+; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX2-NEXT: vmovapd %ymm3, (%ecx)
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X86-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X86-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X86-AVX2-NEXT: vmovapd %ymm0, (%eax)
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X86-AVX512-LABEL: PR48908:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
+; X86-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
+; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
+; X86-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3
+; X86-AVX512-NEXT: vmovapd %ymm3, (%edx)
+; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0]
+; X86-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4
+; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
+; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u>
+; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
+; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0]
+; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
+; X86-AVX512-NEXT: vmovapd %ymm0, (%eax)
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX1-LABEL: PR48908:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1]
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[1],ymm3[0],ymm4[2],ymm3[3]
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[0,1],ymm0[0,1]
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[2]
+; X64-AVX1-NEXT: vmovapd %ymm4, (%rdi)
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3]
+; X64-AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX1-NEXT: vmovapd %ymm3, (%rsi)
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X64-AVX1-NEXT: vmovapd %ymm0, (%rdx)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: PR48908:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX2-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)
+; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
+; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
+; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX2-NEXT: vmovapd %ymm3, (%rsi)
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X64-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; X64-AVX2-NEXT: vmovapd %ymm0, (%rdx)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: PR48908:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
+; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
+; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
+; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,3,8,1]
+; X64-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3
+; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi)
+; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1]
+; X64-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
+; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,11,u,u>
+; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
+; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3]
+; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
+; X64-AVX512-NEXT: vmovapd %ymm0, (%rdx)
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+  %t0 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  %t1 = shufflevector <4 x double> %v1, <4 x double> %v2, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
+  %r0 = shufflevector <4 x double> %t0, <4 x double> %t1, <4 x i32> <i32 0, i32 3, i32 6, i32 1>
+  store <4 x double> %r0, <4 x double>* %out0, align 32
+  %r1 = shufflevector <4 x double> %t0, <4 x double> %t1, <4 x i32> <i32 4, i32 7, i32 2, i32 5>
+  store <4 x double> %r1, <4 x double>* %out1, align 32
+  %t2 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 3, i32 7, i32 undef, i32 undef>
+  %r2 = shufflevector <4 x double> %t2, <4 x double> %v2, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
+  store <4 x double> %r2, <4 x double>* %out2, align 32
+  ret void
+}
+
 define <4 x i64> @concat_self_v4i64(<2 x i64> %x) {
 ; AVX1-LABEL: concat_self_v4i64:
 ; AVX1: # %bb.0: