
[X86][BF16] Promote vector ADD/SUB/MUL/DIV to f32 #87858

Merged 1 commit on Apr 12, 2024
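In short, bf16 vector FADD/FSUB/FMUL/FDIV nodes are no longer marked Expand (which scalarized them); they are promoted to the matching f32 vector type, so the legalizer widens the operands to f32, performs the arithmetic there, and narrows the result back to bf16. As a reference for the per-element semantics only, here is a minimal stand-alone C++ model of one promoted operation (illustrative code, not part of the patch; the round-to-nearest-even narrowing mirrors VCVTNEPS2BF16, with NaN handling simplified):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// bf16 is the upper half of an IEEE-754 binary32, so widening is a shift.
static float bf16_to_f32(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

// Narrow back with round-to-nearest-even on the discarded low 16 bits.
// (Real hardware also quiets NaNs; that detail is omitted here.)
static uint16_t f32_to_bf16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint32_t rounding = 0x7FFFu + ((bits >> 16) & 1u);
  return static_cast<uint16_t>((bits + rounding) >> 16);
}

// One promoted element: widen both operands, add in f32, narrow the result.
static uint16_t bf16_add(uint16_t a, uint16_t b) {
  return f32_to_bf16(bf16_to_f32(a) + bf16_to_f32(b));
}

int main() {
  uint16_t one = 0x3F80;  // 1.0 as bf16
  uint16_t half = 0x3F00; // 0.5 as bf16
  std::printf("0x%04X\n", bf16_add(one, half)); // prints 0x3FC0, i.e. 1.5
  return 0;
}
```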
18 changes: 7 additions & 11 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1973,10 +1973,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
     setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
-    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
-      setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
-    }

     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
       setOperationAction(ISD::MLOAD, VT, Legal);
@@ -2291,26 +2289,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom);
     for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
       setF16Action(VT, Expand);
-      setOperationAction(ISD::FADD, VT, Expand);
-      setOperationAction(ISD::FSUB, VT, Expand);
-      setOperationAction(ISD::FMUL, VT, Expand);
-      setOperationAction(ISD::FDIV, VT, Expand);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
     }
+    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+      setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
+      setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
+    }
     setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom);
     addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
   }

   if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
     addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
     setF16Action(MVT::v32bf16, Expand);
-    setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
-    setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
-    setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
-    setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
+    for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
+      setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom);
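setOperationPromotedToType makes the SelectionDAG legalizer rewrite each listed bf16 node as extend-to-f32, operate, round back, instead of splitting it into per-element operations. At the source level, this is the lowering a plain bf16 vector add now receives; a minimal sketch using Clang's __bf16 vector extension (an assumption: it presumes a recent Clang with __bf16 arithmetic support, targeting AVX512BF16 or AVX-NE-CONVERT; the typedef and function are illustrative, named to match the addv test below):

```cpp
// Illustrative only. With native __bf16 vector support this compiles to the
// same "fadd <8 x bfloat>" IR exercised by the addv test in
// llvm/test/CodeGen/X86/bfloat.ll, and with this patch the backend should
// emit the widen / vaddps / vcvtneps2bf16 sequence instead of eight scalar adds.
typedef __bf16 bf16x8 __attribute__((vector_size(16)));

bf16x8 addv(bf16x8 a, bf16x8 b) {
  return a + b; // element-wise bf16 add, promoted to f32 by the X86 backend
}
```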
286 changes: 24 additions & 262 deletions llvm/test/CodeGen/X86/bfloat.ll
@@ -525,101 +525,13 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: vmovw %xmm1, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm2
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; X86-NEXT: vmovw %xmm2, %ecx
; X86-NEXT: vpextrw $1, %xmm1, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm2
; X86-NEXT: vpextrw $1, %xmm0, %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: vmovd %eax, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; X86-NEXT: vmovw %xmm2, %eax
; X86-NEXT: vpextrw $2, %xmm1, %edx
; X86-NEXT: shll $16, %edx
; X86-NEXT: vmovd %edx, %xmm2
; X86-NEXT: vpextrw $2, %xmm0, %edx
; X86-NEXT: shll $16, %edx
; X86-NEXT: vmovd %edx, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; X86-NEXT: vmovw %xmm2, %edx
; X86-NEXT: vpextrw $3, %xmm1, %esi
; X86-NEXT: shll $16, %esi
; X86-NEXT: vmovd %esi, %xmm2
; X86-NEXT: vpextrw $3, %xmm0, %esi
; X86-NEXT: shll $16, %esi
; X86-NEXT: vmovd %esi, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; X86-NEXT: vmovw %xmm2, %esi
; X86-NEXT: vpextrw $4, %xmm1, %edi
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm2
; X86-NEXT: vpextrw $4, %xmm0, %edi
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; X86-NEXT: vmovw %xmm2, %ebx
; X86-NEXT: vpextrw $5, %xmm1, %edi
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm2
; X86-NEXT: vpextrw $5, %xmm0, %edi
; X86-NEXT: shll $16, %edi
; X86-NEXT: vmovd %edi, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; X86-NEXT: vmovw %xmm2, %edi
; X86-NEXT: vpextrw $6, %xmm1, %ebp
; X86-NEXT: shll $16, %ebp
; X86-NEXT: vmovd %ebp, %xmm2
; X86-NEXT: vpextrw $6, %xmm0, %ebp
; X86-NEXT: shll $16, %ebp
; X86-NEXT: vmovd %ebp, %xmm3
; X86-NEXT: vaddss %xmm2, %xmm3, %xmm3
; X86-NEXT: vmovw %ecx, %xmm2
; X86-NEXT: vcvtneps2bf16 %xmm3, %xmm3
; X86-NEXT: vmovw %xmm3, %ecx
; X86-NEXT: vmovw %ebx, %xmm3
; X86-NEXT: vpextrw $7, %xmm1, %ebx
; X86-NEXT: shll $16, %ebx
; X86-NEXT: vmovd %ebx, %xmm1
; X86-NEXT: vpextrw $7, %xmm0, %ebx
; X86-NEXT: shll $16, %ebx
; X86-NEXT: vmovd %ebx, %xmm0
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovw %ecx, %xmm1
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT: vmovw %xmm0, %ecx
; X86-NEXT: vmovw %ecx, %xmm0
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: vmovw %edi, %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; X86-NEXT: vmovw %edx, %xmm3
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: vmovw %esi, %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; X86-NEXT: vmovw %eax, %xmm3
; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
; X86-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT: vpslld $16, %ymm1, %ymm1
; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT: vpslld $16, %ymm0, %ymm0
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT: vzeroupper
; X86-NEXT: retl
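The new X86 body above is the promoted form: zero-extend the eight 16-bit lanes to dwords, shift left by 16 so each lane becomes the equal-valued f32, one vaddps, then vcvtneps2bf16 to narrow back. A rough intrinsics-level transcription of that sequence, for readers who prefer C++ to assembly (illustrative only, not code the compiler uses; it assumes the AVX2 and AVX512BF16+VL intrinsics, and the function name is made up):

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Adds eight bf16 lanes via f32, mirroring vpmovzxwd + vpslld, vaddps,
// vcvtneps2bf16. Build with e.g. -mavx512bf16 -mavx512vl.
void addv_bf16x8(const uint16_t *a, const uint16_t *b, uint16_t *out) {
  __m128i ha = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a));
  __m128i hb = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b));
  // Widen each 16-bit lane to 32 bits, then shift it into the high half so it
  // reads as the f32 with the same value.
  __m256 fa = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(ha), 16));
  __m256 fb = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(hb), 16));
  __m256 sum = _mm256_add_ps(fa, fb);      // vaddps
  __m128bh res = _mm256_cvtneps_pbh(sum);  // vcvtneps2bf16, round-to-nearest-even
  std::memcpy(out, &res, sizeof(res));
}
```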
;
; SSE2-LABEL: addv:
@@ -756,176 +668,26 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
; FP16-LABEL: addv:
; FP16: # %bb.0:
; FP16-NEXT: vmovw %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm2
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm3
; FP16-NEXT: vaddss %xmm2, %xmm3, %xmm2
; FP16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; FP16-NEXT: vmovw %xmm2, %eax
; FP16-NEXT: vmovw %eax, %xmm2
; FP16-NEXT: vpextrw $1, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm3
; FP16-NEXT: vpextrw $1, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm4
; FP16-NEXT: vaddss %xmm3, %xmm4, %xmm3
; FP16-NEXT: vcvtneps2bf16 %xmm3, %xmm3
; FP16-NEXT: vmovw %xmm3, %eax
; FP16-NEXT: vmovw %eax, %xmm3
; FP16-NEXT: vpextrw $2, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm4
; FP16-NEXT: vpextrw $2, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm5
; FP16-NEXT: vaddss %xmm4, %xmm5, %xmm4
; FP16-NEXT: vcvtneps2bf16 %xmm4, %xmm4
; FP16-NEXT: vmovw %xmm4, %eax
; FP16-NEXT: vmovw %eax, %xmm4
; FP16-NEXT: vpextrw $3, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm5
; FP16-NEXT: vpextrw $3, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm6
; FP16-NEXT: vaddss %xmm5, %xmm6, %xmm5
; FP16-NEXT: vcvtneps2bf16 %xmm5, %xmm5
; FP16-NEXT: vmovw %xmm5, %eax
; FP16-NEXT: vmovw %eax, %xmm5
; FP16-NEXT: vpextrw $4, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm6
; FP16-NEXT: vpextrw $4, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm7
; FP16-NEXT: vaddss %xmm6, %xmm7, %xmm6
; FP16-NEXT: vcvtneps2bf16 %xmm6, %xmm6
; FP16-NEXT: vmovw %xmm6, %eax
; FP16-NEXT: vmovw %eax, %xmm6
; FP16-NEXT: vpextrw $5, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm7
; FP16-NEXT: vpextrw $5, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm8
; FP16-NEXT: vaddss %xmm7, %xmm8, %xmm7
; FP16-NEXT: vcvtneps2bf16 %xmm7, %xmm7
; FP16-NEXT: vmovw %xmm7, %eax
; FP16-NEXT: vmovw %eax, %xmm7
; FP16-NEXT: vpextrw $6, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm8
; FP16-NEXT: vpextrw $6, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm9
; FP16-NEXT: vaddss %xmm8, %xmm9, %xmm8
; FP16-NEXT: vcvtneps2bf16 %xmm8, %xmm8
; FP16-NEXT: vmovw %xmm8, %eax
; FP16-NEXT: vmovw %eax, %xmm8
; FP16-NEXT: vpextrw $7, %xmm1, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vpextrw $7, %xmm0, %eax
; FP16-NEXT: shll $16, %eax
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vaddss %xmm1, %xmm0, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT: vmovw %xmm0, %eax
; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FP16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; FP16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; FP16-NEXT: retq
; F16-LABEL: addv:
; F16: # %bb.0:
; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; F16-NEXT: vpslld $16, %ymm1, %ymm1
; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT: vpslld $16, %ymm0, %ymm0
; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT: vzeroupper
; F16-NEXT: retq
;
; AVXNC-LABEL: addv:
; AVXNC: # %bb.0:
; AVXNC-NEXT: vpextrw $7, %xmm1, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm2
; AVXNC-NEXT: vpextrw $7, %xmm0, %eax
; AVXNC-NEXT: shll $16, %eax
; AVXNC-NEXT: vmovd %eax, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm2, %eax
; AVXNC-NEXT: vpextrw $6, %xmm1, %ecx
; AVXNC-NEXT: shll $16, %ecx
; AVXNC-NEXT: vmovd %ecx, %xmm2
; AVXNC-NEXT: vpextrw $6, %xmm0, %ecx
; AVXNC-NEXT: shll $16, %ecx
; AVXNC-NEXT: vmovd %ecx, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm2, %ecx
; AVXNC-NEXT: vpextrw $5, %xmm1, %edx
; AVXNC-NEXT: shll $16, %edx
; AVXNC-NEXT: vmovd %edx, %xmm2
; AVXNC-NEXT: vpextrw $5, %xmm0, %edx
; AVXNC-NEXT: shll $16, %edx
; AVXNC-NEXT: vmovd %edx, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm2, %edx
; AVXNC-NEXT: vpextrw $4, %xmm1, %esi
; AVXNC-NEXT: shll $16, %esi
; AVXNC-NEXT: vmovd %esi, %xmm2
; AVXNC-NEXT: vpextrw $4, %xmm0, %esi
; AVXNC-NEXT: shll $16, %esi
; AVXNC-NEXT: vmovd %esi, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm2, %esi
; AVXNC-NEXT: vpextrw $3, %xmm1, %edi
; AVXNC-NEXT: shll $16, %edi
; AVXNC-NEXT: vmovd %edi, %xmm2
; AVXNC-NEXT: vpextrw $3, %xmm0, %edi
; AVXNC-NEXT: shll $16, %edi
; AVXNC-NEXT: vmovd %edi, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm2, %edi
; AVXNC-NEXT: vpextrw $2, %xmm1, %r8d
; AVXNC-NEXT: shll $16, %r8d
; AVXNC-NEXT: vmovd %r8d, %xmm2
; AVXNC-NEXT: vpextrw $2, %xmm0, %r8d
; AVXNC-NEXT: shll $16, %r8d
; AVXNC-NEXT: vmovd %r8d, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm2, %r8d
; AVXNC-NEXT: vpextrw $1, %xmm1, %r9d
; AVXNC-NEXT: shll $16, %r9d
; AVXNC-NEXT: vmovd %r9d, %xmm2
; AVXNC-NEXT: vpextrw $1, %xmm0, %r9d
; AVXNC-NEXT: shll $16, %r9d
; AVXNC-NEXT: vmovd %r9d, %xmm3
; AVXNC-NEXT: vaddss %xmm2, %xmm3, %xmm2
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; AVXNC-NEXT: vmovd %xmm1, %r9d
; AVXNC-NEXT: shll $16, %r9d
; AVXNC-NEXT: vmovd %r9d, %xmm1
; AVXNC-NEXT: vmovd %xmm0, %r9d
; AVXNC-NEXT: shll $16, %r9d
; AVXNC-NEXT: vmovd %r9d, %xmm0
; AVXNC-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVXNC-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXNC-NEXT: vpslld $16, %ymm1, %ymm1
; AVXNC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT: vpslld $16, %ymm0, %ymm0
; AVXNC-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: retq
%add = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %add