@@ -449,60 +449,48 @@ define <4 x double> @blend_broadcasts_v1f64(ptr %p0, ptr %p1) {
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: blend_broadcasts_v1f64:
; X64-AVX: # %bb.0:
- ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX-NEXT: retq
;
; X86-AVX2-LABEL: blend_broadcasts_v1f64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: blend_broadcasts_v1f64:
; X64-AVX2: # %bb.0:
- ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX2-NEXT: retq
;
; X86-AVX512-LABEL: blend_broadcasts_v1f64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: blend_broadcasts_v1f64:
; X64-AVX512: # %bb.0:
- ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
@@ -535,60 +523,48 @@ define <4 x double> @blend_broadcasts_v1f64_4x(ptr %p0, ptr %p1) {
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: blend_broadcasts_v1f64_4x:
; X64-AVX: # %bb.0:
- ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX-NEXT: retq
;
; X86-AVX2-LABEL: blend_broadcasts_v1f64_4x:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: blend_broadcasts_v1f64_4x:
; X64-AVX2: # %bb.0:
- ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX2-NEXT: retq
;
; X86-AVX512-LABEL: blend_broadcasts_v1f64_4x:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: blend_broadcasts_v1f64_4x:
; X64-AVX512: # %bb.0:
- ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
@@ -623,60 +599,48 @@ define <4 x double> @blend_broadcasts_v1f64_2x(ptr %p0, ptr %p1) {
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX-NEXT: retl
;
; X64-AVX-LABEL: blend_broadcasts_v1f64_2x:
; X64-AVX: # %bb.0:
- ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX-NEXT: retq
;
; X86-AVX2-LABEL: blend_broadcasts_v1f64_2x:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX2-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX2-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: blend_broadcasts_v1f64_2x:
; X64-AVX2: # %bb.0:
- ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX2-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX2-NEXT: retq
;
; X86-AVX512-LABEL: blend_broadcasts_v1f64_2x:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X86-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X86-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X86-AVX512-NEXT: vbroadcastsd (%ecx), %ymm0
+ ; X86-AVX512-NEXT: vbroadcastsd (%eax), %ymm1
+ ; X86-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: blend_broadcasts_v1f64_2x:
; X64-AVX512: # %bb.0:
- ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
- ; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
- ; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0]
- ; X64-AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; X64-AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+ ; X64-AVX512-NEXT: vbroadcastsd (%rsi), %ymm0
+ ; X64-AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+ ; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; X64-AVX512-NEXT: retq
%ld0 = load <1 x double>, ptr %p0, align 32
%ld1 = load <1 x double>, ptr %p1, align 32
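
The updated check lines show the same change in all three test functions: the old lowering assembled each <4 x double> from two scalar vmovsd loads, two vmovlhps shuffles, and a vinsertf128, while the new lowering recognizes the blend-of-broadcasts pattern and emits two vbroadcastsd loads plus a single vblendps. A minimal IR sketch of that pattern follows; the function name, poison operands, and exact blend mask are illustrative assumptions, not copied from the patch:

; Sketch only: splat two <1 x double> loads to <4 x double> and blend them.
define <4 x double> @blend_broadcasts_sketch(ptr %p0, ptr %p1) {
  %ld0 = load <1 x double>, ptr %p0, align 32
  %ld1 = load <1 x double>, ptr %p1, align 32
  ; Broadcast each scalar load into all four lanes.
  %bcst0 = shufflevector <1 x double> %ld0, <1 x double> poison, <4 x i32> zeroinitializer
  %bcst1 = shufflevector <1 x double> %ld1, <1 x double> poison, <4 x i32> zeroinitializer
  ; Interleave lanes from the two broadcasts; with the updated lowering this
  ; selects to vbroadcastsd + vbroadcastsd + vblendps instead of the older
  ; vmovsd/vmovlhps/vinsertf128 sequence.
  %blend = shufflevector <4 x double> %bcst0, <4 x double> %bcst1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %blend
}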