@@ -525,101 +525,13 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86: # %bb.0:
- ; X86-NEXT: pushl %ebp
- ; X86-NEXT: pushl %ebx
- ; X86-NEXT: pushl %edi
- ; X86-NEXT: pushl %esi
- ; X86-NEXT: vmovw %xmm1, %eax
- ; X86-NEXT: shll $16, %eax
- ; X86-NEXT: vmovd %eax, %xmm2
- ; X86-NEXT: vmovw %xmm0, %eax
- ; X86-NEXT: shll $16, %eax
- ; X86-NEXT: vmovd %eax, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; X86-NEXT: vmovw %xmm2, %ecx
- ; X86-NEXT: vpextrw $1, %xmm1, %eax
- ; X86-NEXT: shll $16, %eax
- ; X86-NEXT: vmovd %eax, %xmm2
- ; X86-NEXT: vpextrw $1, %xmm0, %eax
- ; X86-NEXT: shll $16, %eax
- ; X86-NEXT: vmovd %eax, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; X86-NEXT: vmovw %xmm2, %eax
- ; X86-NEXT: vpextrw $2, %xmm1, %edx
- ; X86-NEXT: shll $16, %edx
- ; X86-NEXT: vmovd %edx, %xmm2
- ; X86-NEXT: vpextrw $2, %xmm0, %edx
- ; X86-NEXT: shll $16, %edx
- ; X86-NEXT: vmovd %edx, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; X86-NEXT: vmovw %xmm2, %edx
- ; X86-NEXT: vpextrw $3, %xmm1, %esi
- ; X86-NEXT: shll $16, %esi
- ; X86-NEXT: vmovd %esi, %xmm2
- ; X86-NEXT: vpextrw $3, %xmm0, %esi
- ; X86-NEXT: shll $16, %esi
- ; X86-NEXT: vmovd %esi, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; X86-NEXT: vmovw %xmm2, %esi
- ; X86-NEXT: vpextrw $4, %xmm1, %edi
- ; X86-NEXT: shll $16, %edi
- ; X86-NEXT: vmovd %edi, %xmm2
- ; X86-NEXT: vpextrw $4, %xmm0, %edi
- ; X86-NEXT: shll $16, %edi
- ; X86-NEXT: vmovd %edi, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; X86-NEXT: vmovw %xmm2, %ebx
- ; X86-NEXT: vpextrw $5, %xmm1, %edi
- ; X86-NEXT: shll $16, %edi
- ; X86-NEXT: vmovd %edi, %xmm2
- ; X86-NEXT: vpextrw $5, %xmm0, %edi
- ; X86-NEXT: shll $16, %edi
- ; X86-NEXT: vmovd %edi, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; X86-NEXT: vmovw %xmm2, %edi
- ; X86-NEXT: vpextrw $6, %xmm1, %ebp
- ; X86-NEXT: shll $16, %ebp
- ; X86-NEXT: vmovd %ebp, %xmm2
- ; X86-NEXT: vpextrw $6, %xmm0, %ebp
- ; X86-NEXT: shll $16, %ebp
- ; X86-NEXT: vmovd %ebp, %xmm3
- ; X86-NEXT: vaddss %xmm2, %xmm3, %xmm3
- ; X86-NEXT: vmovw %ecx, %xmm2
- ; X86-NEXT: vcvtneps2bf16 %xmm3, %xmm3
- ; X86-NEXT: vmovw %xmm3, %ecx
- ; X86-NEXT: vmovw %ebx, %xmm3
- ; X86-NEXT: vpextrw $7, %xmm1, %ebx
- ; X86-NEXT: shll $16, %ebx
- ; X86-NEXT: vmovd %ebx, %xmm1
- ; X86-NEXT: vpextrw $7, %xmm0, %ebx
- ; X86-NEXT: shll $16, %ebx
- ; X86-NEXT: vmovd %ebx, %xmm0
- ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
- ; X86-NEXT: vmovw %ecx, %xmm1
- ; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
- ; X86-NEXT: vmovw %xmm0, %ecx
- ; X86-NEXT: vmovw %ecx, %xmm0
- ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
- ; X86-NEXT: vmovw %edi, %xmm1
- ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
- ; X86-NEXT: vmovw %edx, %xmm3
- ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
- ; X86-NEXT: vmovw %esi, %xmm1
- ; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
- ; X86-NEXT: vmovw %eax, %xmm3
- ; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
- ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
- ; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
- ; X86-NEXT: popl %esi
- ; X86-NEXT: popl %edi
- ; X86-NEXT: popl %ebx
- ; X86-NEXT: popl %ebp
+ ; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+ ; X86-NEXT: vpslld $16, %ymm1, %ymm1
+ ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+ ; X86-NEXT: vpslld $16, %ymm0, %ymm0
+ ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
+ ; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+ ; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; SSE2-LABEL: addv:
@@ -756,176 +668,26 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
- ; FP16-LABEL: addv:
- ; FP16: # %bb.0:
- ; FP16-NEXT: vmovw %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm2
- ; FP16-NEXT: vmovw %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm3
- ; FP16-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; FP16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
- ; FP16-NEXT: vmovw %xmm2, %eax
- ; FP16-NEXT: vmovw %eax, %xmm2
- ; FP16-NEXT: vpextrw $1, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm3
- ; FP16-NEXT: vpextrw $1, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm4
- ; FP16-NEXT: vaddss %xmm3, %xmm4, %xmm3
- ; FP16-NEXT: vcvtneps2bf16 %xmm3, %xmm3
- ; FP16-NEXT: vmovw %xmm3, %eax
- ; FP16-NEXT: vmovw %eax, %xmm3
- ; FP16-NEXT: vpextrw $2, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm4
- ; FP16-NEXT: vpextrw $2, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm5
- ; FP16-NEXT: vaddss %xmm4, %xmm5, %xmm4
- ; FP16-NEXT: vcvtneps2bf16 %xmm4, %xmm4
- ; FP16-NEXT: vmovw %xmm4, %eax
- ; FP16-NEXT: vmovw %eax, %xmm4
- ; FP16-NEXT: vpextrw $3, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm5
- ; FP16-NEXT: vpextrw $3, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm6
- ; FP16-NEXT: vaddss %xmm5, %xmm6, %xmm5
- ; FP16-NEXT: vcvtneps2bf16 %xmm5, %xmm5
- ; FP16-NEXT: vmovw %xmm5, %eax
- ; FP16-NEXT: vmovw %eax, %xmm5
- ; FP16-NEXT: vpextrw $4, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm6
- ; FP16-NEXT: vpextrw $4, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm7
- ; FP16-NEXT: vaddss %xmm6, %xmm7, %xmm6
- ; FP16-NEXT: vcvtneps2bf16 %xmm6, %xmm6
- ; FP16-NEXT: vmovw %xmm6, %eax
- ; FP16-NEXT: vmovw %eax, %xmm6
- ; FP16-NEXT: vpextrw $5, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm7
- ; FP16-NEXT: vpextrw $5, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm8
- ; FP16-NEXT: vaddss %xmm7, %xmm8, %xmm7
- ; FP16-NEXT: vcvtneps2bf16 %xmm7, %xmm7
- ; FP16-NEXT: vmovw %xmm7, %eax
- ; FP16-NEXT: vmovw %eax, %xmm7
- ; FP16-NEXT: vpextrw $6, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm8
- ; FP16-NEXT: vpextrw $6, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm9
- ; FP16-NEXT: vaddss %xmm8, %xmm9, %xmm8
- ; FP16-NEXT: vcvtneps2bf16 %xmm8, %xmm8
- ; FP16-NEXT: vmovw %xmm8, %eax
- ; FP16-NEXT: vmovw %eax, %xmm8
- ; FP16-NEXT: vpextrw $7, %xmm1, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm1
- ; FP16-NEXT: vpextrw $7, %xmm0, %eax
- ; FP16-NEXT: shll $16, %eax
- ; FP16-NEXT: vmovd %eax, %xmm0
- ; FP16-NEXT: vaddss %xmm1, %xmm0, %xmm0
- ; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
- ; FP16-NEXT: vmovw %xmm0, %eax
- ; FP16-NEXT: vmovw %eax, %xmm0
- ; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
- ; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
- ; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
- ; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
- ; FP16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
- ; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
- ; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
- ; FP16-NEXT: retq
+ ; F16-LABEL: addv:
+ ; F16: # %bb.0:
+ ; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+ ; F16-NEXT: vpslld $16, %ymm1, %ymm1
+ ; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+ ; F16-NEXT: vpslld $16, %ymm0, %ymm0
+ ; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
+ ; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+ ; F16-NEXT: vzeroupper
+ ; F16-NEXT: retq
;
; AVXNC-LABEL: addv:
; AVXNC: # %bb.0:
- ; AVXNC-NEXT: vpextrw $7, %xmm1, %eax
- ; AVXNC-NEXT: shll $16, %eax
- ; AVXNC-NEXT: vmovd %eax, %xmm2
- ; AVXNC-NEXT: vpextrw $7, %xmm0, %eax
- ; AVXNC-NEXT: shll $16, %eax
- ; AVXNC-NEXT: vmovd %eax, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm2, %eax
- ; AVXNC-NEXT: vpextrw $6, %xmm1, %ecx
- ; AVXNC-NEXT: shll $16, %ecx
- ; AVXNC-NEXT: vmovd %ecx, %xmm2
- ; AVXNC-NEXT: vpextrw $6, %xmm0, %ecx
- ; AVXNC-NEXT: shll $16, %ecx
- ; AVXNC-NEXT: vmovd %ecx, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm2, %ecx
- ; AVXNC-NEXT: vpextrw $5, %xmm1, %edx
- ; AVXNC-NEXT: shll $16, %edx
- ; AVXNC-NEXT: vmovd %edx, %xmm2
- ; AVXNC-NEXT: vpextrw $5, %xmm0, %edx
- ; AVXNC-NEXT: shll $16, %edx
- ; AVXNC-NEXT: vmovd %edx, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm2, %edx
- ; AVXNC-NEXT: vpextrw $4, %xmm1, %esi
- ; AVXNC-NEXT: shll $16, %esi
- ; AVXNC-NEXT: vmovd %esi, %xmm2
- ; AVXNC-NEXT: vpextrw $4, %xmm0, %esi
- ; AVXNC-NEXT: shll $16, %esi
- ; AVXNC-NEXT: vmovd %esi, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm2, %esi
- ; AVXNC-NEXT: vpextrw $3, %xmm1, %edi
- ; AVXNC-NEXT: shll $16, %edi
- ; AVXNC-NEXT: vmovd %edi, %xmm2
- ; AVXNC-NEXT: vpextrw $3, %xmm0, %edi
- ; AVXNC-NEXT: shll $16, %edi
- ; AVXNC-NEXT: vmovd %edi, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm2, %edi
- ; AVXNC-NEXT: vpextrw $2, %xmm1, %r8d
- ; AVXNC-NEXT: shll $16, %r8d
- ; AVXNC-NEXT: vmovd %r8d, %xmm2
- ; AVXNC-NEXT: vpextrw $2, %xmm0, %r8d
- ; AVXNC-NEXT: shll $16, %r8d
- ; AVXNC-NEXT: vmovd %r8d, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm2, %r8d
- ; AVXNC-NEXT: vpextrw $1, %xmm1, %r9d
- ; AVXNC-NEXT: shll $16, %r9d
- ; AVXNC-NEXT: vmovd %r9d, %xmm2
- ; AVXNC-NEXT: vpextrw $1, %xmm0, %r9d
- ; AVXNC-NEXT: shll $16, %r9d
- ; AVXNC-NEXT: vmovd %r9d, %xmm3
- ; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
- ; AVXNC-NEXT: vmovd %xmm1, %r9d
- ; AVXNC-NEXT: shll $16, %r9d
- ; AVXNC-NEXT: vmovd %r9d, %xmm1
- ; AVXNC-NEXT: vmovd %xmm0, %r9d
- ; AVXNC-NEXT: shll $16, %r9d
- ; AVXNC-NEXT: vmovd %r9d, %xmm0
- ; AVXNC-NEXT: vaddss %xmm1, %xmm0, %xmm0
- ; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
- ; AVXNC-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
- ; AVXNC-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
- ; AVXNC-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
- ; AVXNC-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
- ; AVXNC-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
- ; AVXNC-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
- ; AVXNC-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+ ; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+ ; AVXNC-NEXT: vpslld $16, %ymm1, %ymm1
+ ; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+ ; AVXNC-NEXT: vpslld $16, %ymm0, %ymm0
+ ; AVXNC-NEXT: vaddps %ymm1, %ymm0, %ymm0
+ ; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
+ ; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%add = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %add