
Commit 86df55e

[X86][BF16] Promote vector ADD/SUB/MUL/DIV to f32 (#87858)
1 parent 09d51a8 commit 86df55e

File tree: 2 files changed (+31 lines, -273 lines)
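
What the change does: instead of expanding (scalarizing) FADD/FSUB/FMUL/FDIV on bf16 vectors, the X86 backend now promotes them to the matching f32 vector type, does the arithmetic in single precision, and rounds the result back to bf16. Since a bf16 value is exactly the upper 16 bits of an IEEE-754 binary32, the widening step is just a 16-bit shift. A minimal scalar sketch of that semantics (plain C++, not code from this patch; the narrowing below uses round-to-nearest-even as VCVTNEPS2BF16 does, with NaN and exception corner cases omitted):

#include <cstdint>
#include <cstring>

// Scalar model of "promote bf16 arithmetic to f32" (illustration only, not
// LLVM code).  A bf16 value is the upper 16 bits of a binary32, so widening
// is a left shift and the add itself happens in single precision.
static inline float bf16_to_f32(uint16_t x) {
  uint32_t bits = static_cast<uint32_t>(x) << 16; // bf16 payload -> f32 high half
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

static inline uint16_t f32_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  bits += 0x7FFF + ((bits >> 16) & 1); // round to nearest even
  return static_cast<uint16_t>(bits >> 16);
}

static inline uint16_t bf16_add(uint16_t a, uint16_t b) {
  return f32_to_bf16(bf16_to_f32(a) + bf16_to_f32(b)); // promote, add, narrow
}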

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 11 deletions
@@ -1978,10 +1978,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
     setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
-    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
-      setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
-    }

     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
       setOperationAction(ISD::MLOAD, VT, Legal);
@@ -2296,26 +2294,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom);
     for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
       setF16Action(VT, Expand);
-      setOperationAction(ISD::FADD, VT, Expand);
-      setOperationAction(ISD::FSUB, VT, Expand);
-      setOperationAction(ISD::FMUL, VT, Expand);
-      setOperationAction(ISD::FDIV, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
+    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+      setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
+      setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
+    }
     setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
   }

   if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
     addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
     setF16Action(MVT::v32bf16, Expand);
-    setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
-    setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
-    setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
-    setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
+    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
+      setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
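
setOperationPromotedToType marks the operation as Promote for the source type and records the wider destination type in one call, so the legalizer rewrites e.g. fadd <8 x bfloat> as extend-to-<8 x float>, fadd, round-back, instead of emitting eight scalar extend/add/convert sequences as the old Expand entries did. A toy table, with strings standing in for LLVM's ISD/MVT enums, just to show the shape of the change (illustration only, not the TargetLoweringBase API):

#include <cstdio>
#include <map>
#include <string>
#include <utility>

// Toy model of the lowering table this hunk edits: the same (opcode, type)
// pairs that used to be marked Expand are now marked Promote with an f32
// vector of the same lane count as the destination.
enum class Action { Expand, Promote };

struct Entry {
  Action Act;
  std::string PromoteTo; // used only when Act == Action::Promote
};

int main() {
  std::map<std::pair<std::string, std::string>, Entry> Table;
  for (const char *Opc : {"FADD", "FSUB", "FMUL", "FDIV"}) {
    Table[{Opc, "v8bf16"}]  = {Action::Promote, "v8f32"};  // was Expand
    Table[{Opc, "v16bf16"}] = {Action::Promote, "v16f32"}; // was Expand
    Table[{Opc, "v32bf16"}] = {Action::Promote, "v32f32"}; // was Expand
  }
  const Entry &E = Table.at({"FADD", "v8bf16"});
  std::printf("FADD v8bf16 -> promote to %s\n", E.PromoteTo.c_str());
  return 0;
}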

llvm/test/CodeGen/X86/bfloat.ll

Lines changed: 24 additions & 262 deletions
@@ -525,101 +525,13 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
 define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
 ; X86-LABEL: addv:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: vmovw %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm2
-; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %ecx
-; X86-NEXT: vpextrw $1, %xmm1, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm2
-; X86-NEXT: vpextrw $1, %xmm0, %eax
-; X86-NEXT: shll $16, %eax
-; X86-NEXT: vmovd %eax, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %eax
-; X86-NEXT: vpextrw $2, %xmm1, %edx
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: vmovd %edx, %xmm2
-; X86-NEXT: vpextrw $2, %xmm0, %edx
-; X86-NEXT: shll $16, %edx
-; X86-NEXT: vmovd %edx, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %edx
-; X86-NEXT: vpextrw $3, %xmm1, %esi
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: vmovd %esi, %xmm2
-; X86-NEXT: vpextrw $3, %xmm0, %esi
-; X86-NEXT: shll $16, %esi
-; X86-NEXT: vmovd %esi, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %esi
-; X86-NEXT: vpextrw $4, %xmm1, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm2
-; X86-NEXT: vpextrw $4, %xmm0, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %ebx
-; X86-NEXT: vpextrw $5, %xmm1, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm2
-; X86-NEXT: vpextrw $5, %xmm0, %edi
-; X86-NEXT: shll $16, %edi
-; X86-NEXT: vmovd %edi, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; X86-NEXT: vmovw %xmm2, %edi
-; X86-NEXT: vpextrw $6, %xmm1, %ebp
-; X86-NEXT: shll $16, %ebp
-; X86-NEXT: vmovd %ebp, %xmm2
-; X86-NEXT: vpextrw $6, %xmm0, %ebp
-; X86-NEXT: shll $16, %ebp
-; X86-NEXT: vmovd %ebp, %xmm3
-; X86-NEXT: vaddss %xmm2, %xmm3, %xmm3
-; X86-NEXT: vmovw %ecx, %xmm2
-; X86-NEXT: vcvtneps2bf16 %xmm3, %xmm3
-; X86-NEXT: vmovw %xmm3, %ecx
-; X86-NEXT: vmovw %ebx, %xmm3
-; X86-NEXT: vpextrw $7, %xmm1, %ebx
-; X86-NEXT: shll $16, %ebx
-; X86-NEXT: vmovd %ebx, %xmm1
-; X86-NEXT: vpextrw $7, %xmm0, %ebx
-; X86-NEXT: shll $16, %ebx
-; X86-NEXT: vmovd %ebx, %xmm0
-; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovw %ecx, %xmm1
-; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; X86-NEXT: vmovw %xmm0, %ecx
-; X86-NEXT: vmovw %ecx, %xmm0
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-NEXT: vmovw %edi, %xmm1
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X86-NEXT: vmovw %edx, %xmm3
-; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: vmovw %esi, %xmm1
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X86-NEXT: vmovw %eax, %xmm3
-; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; X86-NEXT: vpslld $16, %ymm1, %ymm1
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpslld $16, %ymm0, %ymm0
+; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; X86-NEXT: vzeroupper
 ; X86-NEXT: retl
 ;
 ; SSE2-LABEL: addv:
@@ -756,176 +668,26 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
 ; SSE2-NEXT: popq %rbp
 ; SSE2-NEXT: retq
 ;
-; FP16-LABEL: addv:
-; FP16: # %bb.0:
-; FP16-NEXT: vmovw %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm2
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm3
-; FP16-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; FP16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
-; FP16-NEXT: vmovw %xmm2, %eax
-; FP16-NEXT: vmovw %eax, %xmm2
-; FP16-NEXT: vpextrw $1, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm3
-; FP16-NEXT: vpextrw $1, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm4
-; FP16-NEXT: vaddss %xmm3, %xmm4, %xmm3
-; FP16-NEXT: vcvtneps2bf16 %xmm3, %xmm3
-; FP16-NEXT: vmovw %xmm3, %eax
-; FP16-NEXT: vmovw %eax, %xmm3
-; FP16-NEXT: vpextrw $2, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm4
-; FP16-NEXT: vpextrw $2, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm5
-; FP16-NEXT: vaddss %xmm4, %xmm5, %xmm4
-; FP16-NEXT: vcvtneps2bf16 %xmm4, %xmm4
-; FP16-NEXT: vmovw %xmm4, %eax
-; FP16-NEXT: vmovw %eax, %xmm4
-; FP16-NEXT: vpextrw $3, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm5
-; FP16-NEXT: vpextrw $3, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm6
-; FP16-NEXT: vaddss %xmm5, %xmm6, %xmm5
-; FP16-NEXT: vcvtneps2bf16 %xmm5, %xmm5
-; FP16-NEXT: vmovw %xmm5, %eax
-; FP16-NEXT: vmovw %eax, %xmm5
-; FP16-NEXT: vpextrw $4, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm6
-; FP16-NEXT: vpextrw $4, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm7
-; FP16-NEXT: vaddss %xmm6, %xmm7, %xmm6
-; FP16-NEXT: vcvtneps2bf16 %xmm6, %xmm6
-; FP16-NEXT: vmovw %xmm6, %eax
-; FP16-NEXT: vmovw %eax, %xmm6
-; FP16-NEXT: vpextrw $5, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm7
-; FP16-NEXT: vpextrw $5, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm8
-; FP16-NEXT: vaddss %xmm7, %xmm8, %xmm7
-; FP16-NEXT: vcvtneps2bf16 %xmm7, %xmm7
-; FP16-NEXT: vmovw %xmm7, %eax
-; FP16-NEXT: vmovw %eax, %xmm7
-; FP16-NEXT: vpextrw $6, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm8
-; FP16-NEXT: vpextrw $6, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm9
-; FP16-NEXT: vaddss %xmm8, %xmm9, %xmm8
-; FP16-NEXT: vcvtneps2bf16 %xmm8, %xmm8
-; FP16-NEXT: vmovw %xmm8, %eax
-; FP16-NEXT: vmovw %eax, %xmm8
-; FP16-NEXT: vpextrw $7, %xmm1, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vpextrw $7, %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %eax, %xmm0
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; FP16-NEXT: retq
+; F16-LABEL: addv:
+; F16: # %bb.0:
+; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; F16-NEXT: vpslld $16, %ymm1, %ymm1
+; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; F16-NEXT: vpslld $16, %ymm0, %ymm0
+; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; F16-NEXT: vzeroupper
+; F16-NEXT: retq
 ;
 ; AVXNC-LABEL: addv:
 ; AVXNC: # %bb.0:
-; AVXNC-NEXT: vpextrw $7, %xmm1, %eax
-; AVXNC-NEXT: shll $16, %eax
-; AVXNC-NEXT: vmovd %eax, %xmm2
-; AVXNC-NEXT: vpextrw $7, %xmm0, %eax
-; AVXNC-NEXT: shll $16, %eax
-; AVXNC-NEXT: vmovd %eax, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %eax
-; AVXNC-NEXT: vpextrw $6, %xmm1, %ecx
-; AVXNC-NEXT: shll $16, %ecx
-; AVXNC-NEXT: vmovd %ecx, %xmm2
-; AVXNC-NEXT: vpextrw $6, %xmm0, %ecx
-; AVXNC-NEXT: shll $16, %ecx
-; AVXNC-NEXT: vmovd %ecx, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %ecx
-; AVXNC-NEXT: vpextrw $5, %xmm1, %edx
-; AVXNC-NEXT: shll $16, %edx
-; AVXNC-NEXT: vmovd %edx, %xmm2
-; AVXNC-NEXT: vpextrw $5, %xmm0, %edx
-; AVXNC-NEXT: shll $16, %edx
-; AVXNC-NEXT: vmovd %edx, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %edx
-; AVXNC-NEXT: vpextrw $4, %xmm1, %esi
-; AVXNC-NEXT: shll $16, %esi
-; AVXNC-NEXT: vmovd %esi, %xmm2
-; AVXNC-NEXT: vpextrw $4, %xmm0, %esi
-; AVXNC-NEXT: shll $16, %esi
-; AVXNC-NEXT: vmovd %esi, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %esi
-; AVXNC-NEXT: vpextrw $3, %xmm1, %edi
-; AVXNC-NEXT: shll $16, %edi
-; AVXNC-NEXT: vmovd %edi, %xmm2
-; AVXNC-NEXT: vpextrw $3, %xmm0, %edi
-; AVXNC-NEXT: shll $16, %edi
-; AVXNC-NEXT: vmovd %edi, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %edi
-; AVXNC-NEXT: vpextrw $2, %xmm1, %r8d
-; AVXNC-NEXT: shll $16, %r8d
-; AVXNC-NEXT: vmovd %r8d, %xmm2
-; AVXNC-NEXT: vpextrw $2, %xmm0, %r8d
-; AVXNC-NEXT: shll $16, %r8d
-; AVXNC-NEXT: vmovd %r8d, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm2, %r8d
-; AVXNC-NEXT: vpextrw $1, %xmm1, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm2
-; AVXNC-NEXT: vpextrw $1, %xmm0, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm3
-; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
-; AVXNC-NEXT: vmovd %xmm1, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm1
-; AVXNC-NEXT: vmovd %xmm0, %r9d
-; AVXNC-NEXT: shll $16, %r9d
-; AVXNC-NEXT: vmovd %r9d, %xmm0
-; AVXNC-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
-; AVXNC-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVXNC-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVXNC-NEXT: vpslld $16, %ymm1, %ymm1
+; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVXNC-NEXT: vpslld $16, %ymm0, %ymm0
+; AVXNC-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
+; AVXNC-NEXT: vzeroupper
 ; AVXNC-NEXT: retq
 %add = fadd <8 x bfloat> %a, %b
 ret <8 x bfloat> %add
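
All three updated check prefixes (X86, F16, AVXNC) now share the same five-instruction shape: zero-extend the sixteen-bit lanes, shift them into the high half of each f32, one vaddps, one vcvtneps2bf16. A rough intrinsics-level equivalent of that sequence (a sketch only, assuming an AVX512BF16+VL target built with something like -mavx512bf16 -mavx512vl; the function name and the choice to pass the bf16 lanes in an __m128i are illustrative, and the patch itself emits this pattern from ISel rather than from intrinsics):

#include <immintrin.h>

// Promoted lowering for fadd <8 x bfloat>: widen the eight bf16 lanes to
// 32 bits (vpmovzxwd), place them in the high half of each f32 (vpslld $16),
// add in single precision (vaddps), then round back to bf16 (vcvtneps2bf16).
static inline __m128bh addv_bf16(__m128i a, __m128i b) {
  __m256 af = _mm256_castsi256_ps(
      _mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16));
  __m256 bf = _mm256_castsi256_ps(
      _mm256_slli_epi32(_mm256_cvtepu16_epi32(b), 16));
  return _mm256_cvtneps_pbh(_mm256_add_ps(af, bf));
}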
