X86: Fix asserting on bfloat argument/return without sse2 #93146

Conversation
@llvm/pr-subscribers-backend-x86

Author: Matt Arsenault (arsenm)

Changes

These now get the default promote-to-float behavior, like half does.

Fixes #92899

Patch is 58.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/93146.diff

2 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b107d56f8cf9b..4fddb60db5f10 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -123,12 +123,14 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
!Subtarget.hasX87())
return MVT::i32;
- if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
- return getRegisterTypeForCallingConv(Context, CC,
- VT.changeVectorElementType(MVT::f16));
+ if (isTypeLegal(MVT::f16)) {
+ if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+ return getRegisterTypeForCallingConv(
+ Context, CC, VT.changeVectorElementType(MVT::f16));
- if (VT == MVT::bf16)
- return MVT::f16;
+ if (VT == MVT::bf16)
+ return MVT::f16;
+ }
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -161,7 +163,8 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
return 3;
}
- if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+ if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
+ isTypeLegal(MVT::f16))
return getNumRegistersForCallingConv(Context, CC,
VT.changeVectorElementType(MVT::f16));
@@ -193,7 +196,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
}
// Split vNbf16 vectors according to vNf16.
- if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
+ if (VT.isVector() && VT.getVectorElementType() == MVT::bf16 &&
+ isTypeLegal(MVT::f16))
VT = VT.changeVectorElementType(MVT::f16);
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
new file mode 100644
index 0000000000000..05f036201f4b1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bfloat-calling-conv-no-sse2.ll
@@ -0,0 +1,1441 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-linux-gnu < %s | FileCheck -check-prefixes=CHECK,NOSSE %s
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse < %s | FileCheck -check-prefixes=CHECK,SSE %s
+
+; Make sure no assert without SSE2 and bfloat. Issue 92899
+
+define bfloat @return_arg_bf16(bfloat %x) {
+; CHECK-LABEL: return_arg_bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: retl
+ ret bfloat %x
+}
+
+define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) {
+; CHECK-LABEL: return_arg_v2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: retl
+ ret <2 x bfloat> %x
+}
+
+define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v3bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: pushl %eax
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: .cfi_offset %esi, -12
+; NOSSE-NEXT: .cfi_offset %edi, -8
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: # kill: def $ax killed $ax def $eax
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: shll $16, %eax
+; NOSSE-NEXT: movzwl %si, %edi
+; NOSSE-NEXT: orl %eax, %edi
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 4(%esi)
+; NOSSE-NEXT: movl %edi, (%esi)
+; NOSSE-NEXT: movl %esi, %eax
+; NOSSE-NEXT: addl $4, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 4
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v3bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: pushl %eax
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: .cfi_offset %esi, -12
+; SSE-NEXT: .cfi_offset %edi, -8
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: # kill: def $ax killed $ax def $eax
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: movzwl %si, %edi
+; SSE-NEXT: orl %eax, %edi
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 4(%esi)
+; SSE-NEXT: movl %edi, (%esi)
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: addl $4, %esp
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: popl %esi
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: popl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 4
+; SSE-NEXT: retl $4
+ ret <3 x bfloat> %x
+}
+
+define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v4bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 20
+; NOSSE-NEXT: subl $12, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 32
+; NOSSE-NEXT: .cfi_offset %esi, -20
+; NOSSE-NEXT: .cfi_offset %edi, -16
+; NOSSE-NEXT: .cfi_offset %ebx, -12
+; NOSSE-NEXT: .cfi_offset %ebp, -8
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %edi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebx
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 6(%ebp)
+; NOSSE-NEXT: movw %bx, 4(%ebp)
+; NOSSE-NEXT: movw %di, 2(%ebp)
+; NOSSE-NEXT: movw %si, (%ebp)
+; NOSSE-NEXT: movl %ebp, %eax
+; NOSSE-NEXT: addl $12, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 20
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: .cfi_def_cfa_offset 4
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v4bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: .cfi_def_cfa_offset 20
+; SSE-NEXT: subl $12, %esp
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: .cfi_offset %esi, -20
+; SSE-NEXT: .cfi_offset %edi, -16
+; SSE-NEXT: .cfi_offset %ebx, -12
+; SSE-NEXT: .cfi_offset %ebp, -8
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %edi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %ebx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 6(%ebp)
+; SSE-NEXT: movw %bx, 4(%ebp)
+; SSE-NEXT: movw %di, 2(%ebp)
+; SSE-NEXT: movw %si, (%ebp)
+; SSE-NEXT: movl %ebp, %eax
+; SSE-NEXT: addl $12, %esp
+; SSE-NEXT: .cfi_def_cfa_offset 20
+; SSE-NEXT: popl %esi
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: popl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: .cfi_def_cfa_offset 4
+; SSE-NEXT: retl $4
+ ret <4 x bfloat> %x
+}
+
+define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v8bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 20
+; NOSSE-NEXT: subl $12, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 32
+; NOSSE-NEXT: .cfi_offset %esi, -20
+; NOSSE-NEXT: .cfi_offset %edi, -16
+; NOSSE-NEXT: .cfi_offset %ebx, -12
+; NOSSE-NEXT: .cfi_offset %ebp, -8
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %edi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebx
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 14(%ebp)
+; NOSSE-NEXT: movw %bx, 12(%ebp)
+; NOSSE-NEXT: movw %di, 10(%ebp)
+; NOSSE-NEXT: movw %si, 8(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 6(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 4(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 2(%ebp)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, (%ebp)
+; NOSSE-NEXT: movl %ebp, %eax
+; NOSSE-NEXT: addl $12, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 20
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: .cfi_def_cfa_offset 4
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v8bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: pushl %esi
+; SSE-NEXT: .cfi_def_cfa_offset 20
+; SSE-NEXT: subl $12, %esp
+; SSE-NEXT: .cfi_def_cfa_offset 32
+; SSE-NEXT: .cfi_offset %esi, -20
+; SSE-NEXT: .cfi_offset %edi, -16
+; SSE-NEXT: .cfi_offset %ebx, -12
+; SSE-NEXT: .cfi_offset %ebp, -8
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %edi
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movl %eax, %ebx
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss %xmm0, (%esp)
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; SSE-NEXT: calll __truncsfbf2
+; SSE-NEXT: movw %ax, 14(%ebp)
+; SSE-NEXT: movw %bx, 12(%ebp)
+; SSE-NEXT: movw %di, 10(%ebp)
+; SSE-NEXT: movw %si, 8(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 6(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 4(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, 2(%ebp)
+; SSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; SSE-NEXT: movw %ax, (%ebp)
+; SSE-NEXT: movl %ebp, %eax
+; SSE-NEXT: addl $12, %esp
+; SSE-NEXT: .cfi_def_cfa_offset 20
+; SSE-NEXT: popl %esi
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: popl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: popl %ebx
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: .cfi_def_cfa_offset 4
+; SSE-NEXT: retl $4
+ ret <8 x bfloat> %x
+}
+
+define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) {
+; NOSSE-LABEL: return_arg_v16bf16:
+; NOSSE: # %bb.0:
+; NOSSE-NEXT: pushl %ebp
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: pushl %ebx
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: pushl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: pushl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 20
+; NOSSE-NEXT: subl $28, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 48
+; NOSSE-NEXT: .cfi_offset %esi, -20
+; NOSSE-NEXT: .cfi_offset %edi, -16
+; NOSSE-NEXT: .cfi_offset %ebx, -12
+; NOSSE-NEXT: .cfi_offset %ebp, -8
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %esi
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebx
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movl %eax, %ebp
+; NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; NOSSE-NEXT: fstps (%esp)
+; NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi
+; NOSSE-NEXT: calll __truncsfbf2
+; NOSSE-NEXT: movw %ax, 30(%edi)
+; NOSSE-NEXT: movw %bp, 28(%edi)
+; NOSSE-NEXT: movw %bx, 26(%edi)
+; NOSSE-NEXT: movw %si, 24(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 22(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 20(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 18(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 16(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 14(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 12(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 10(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 8(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 6(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 4(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, 2(%edi)
+; NOSSE-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload
+; NOSSE-NEXT: movw %ax, (%edi)
+; NOSSE-NEXT: movl %edi, %eax
+; NOSSE-NEXT: addl $28, %esp
+; NOSSE-NEXT: .cfi_def_cfa_offset 20
+; NOSSE-NEXT: popl %esi
+; NOSSE-NEXT: .cfi_def_cfa_offset 16
+; NOSSE-NEXT: popl %edi
+; NOSSE-NEXT: .cfi_def_cfa_offset 12
+; NOSSE-NEXT: popl %ebx
+; NOSSE-NEXT: .cfi_def_cfa_offset 8
+; NOSSE-NEXT: popl %ebp
+; NOSSE-NEXT: .cfi_def_cfa_offset 4
+; NOSSE-NEXT: retl $4
+;
+; SSE-LABEL: return_arg_v16bf16:
+; SSE: # %bb.0:
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: .cfi_def_cfa_offset 8
+; SSE-NEXT: pushl %ebx
+; SSE-NEXT: .cfi_def_cfa_offset 12
+; SSE-NEXT: pushl %edi
+; SSE-NEXT: .cfi_def_cfa_offset 16
+; SSE-NEXT: p...
[truncated]
In general, I think what we do here is useless. But I'm neutral on whether to land it; I'll leave it to other reviewers.
define bfloat @return_arg_bf16(bfloat %x) {
; CHECK-LABEL: return_arg_bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: flds {{[0-9]+}}(%esp)
This doesn't match the handling of half; maybe that's fine if you just want to make the assert happy.
The default logic is to legalize half to i16, and bfloat to float:
if (!isTypeLegal(MVT::f16)) {
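(For reference, the generic logic being quoted behaves roughly like the following paraphrased sketch of TargetLoweringBase::computeRegisterProperties; this is not the verbatim upstream code, which has extra conditions such as UseFPRegsForHalfType.)

// Paraphrased sketch of llvm/lib/CodeGen/TargetLoweringBase.cpp behavior
// (not verbatim). Without native f16 support, half values are carried in
// i16 registers; without native bf16 support, bfloat is soft-promoted and
// carried as f32.
if (!isTypeLegal(MVT::f16)) {
  NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16];
  RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16];
  TransformToType[MVT::f16] = MVT::f32;
  ValueTypeActions.setTypeAction(MVT::f16, TypeSoftPromoteHalf);
}
if (!isTypeLegal(MVT::bf16)) {
  NumRegistersForVT[MVT::bf16] = NumRegistersForVT[MVT::f32];
  RegisterTypeForVT[MVT::bf16] = RegisterTypeForVT[MVT::f32];
  TransformToType[MVT::bf16] = MVT::f32;
  ValueTypeActions.setTypeAction(MVT::bf16, TypeSoftPromoteHalf);
}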
That's how it works on most targets, but it doesn't work for X86: on a 32-bit system, or on 64-bit without SSE, the x87 registers are used for the float type, and x87 instructions cannot handle the half type.
But you could just as well replace half in ABI contexts with float, as is done for bfloat. You can make either decision; the instruction support doesn't really change this.
You are right, I confused this with another issue. You need to add back the code I removed in #76901. Even so, you probably still get an error on 64-bit, like this: https://godbolt.org/z/b36dxdsW9
With this patch, I see that error. Without it, I see the assert this avoids. If you change your godbolt link to one of the llc builds with assertions, you also see the assert.
Which code in #76901 do you mean? I'm not sure the removed splitValueIntoRegisterParts code matters here.
I mean you need to add back splitValueIntoRegisterParts/joinRegisterPartsIntoValue (roughly sketched below); otherwise you just load/store at 32-bit width.
The assertion is not the key point. The key is that LLVM 17 contained #76901 (removed since 18), which is equivalent to this patch in this case. What I want to show is that even with this, we hit another barrier.
Thinking about it again, I think this can be taken as a precedent for how we handle ABI-unsupported cases: https://godbolt.org/z/z5YdGdaWs
Maybe we just need to emit an error like that for bf16 without SSE2.
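(For context, here is a hypothetical sketch of what such an X86 override could look like; the actual hooks removed in #76901 may have differed.)

// Hypothetical sketch of an X86 override of the generic hook (not the
// actual code removed in #76901). It widens a scalar bf16 value to f32
// so it travels through the float ABI path instead of being moved as a
// 32-bit integer load/store.
bool X86TargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  if (NumParts == 1 && Val.getValueType() == MVT::bf16 &&
      PartVT == MVT::f32) {
    // bf16 -> f32 is a lossless extension.
    Parts[0] = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Val);
    return true;
  }
  return false; // fall back to the generic splitting logic
}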
Yes, the 32-bit load/store here is of a float value. It is not trying to load a bfloat as a 32-bit operation. The bfloat simply does not exist; the ABI implicitly upcasts to float.
ping
Force-pushed ed4adaf to 54c7ce2: "These now get the default promote-to-float behavior, like half does. Fixes llvm#92899"
ping

ping

ping
@@ -811,8 +854,7 @@ define void @store_bfloat(ptr %fptr, bfloat %v) {
ret void
}
Why is this codegen so poor vs. store_half?
It's because the ABI promotes bfloat to float, vs. promoting half to i16.
ping
ping
LGTM (mainly just to stop the assert), but I'd prefer that @phoebewang reply as well.