[X86] Fold (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16)))))) -> (v4i32 bitcast (v8f16 scalar_to_vector)) #123338
Conversation
… -> (v4i32 bitcast (v8f16 scalar_to_vector))
This pattern tends to appear during f16 -> f32 promotion.
Partially addresses the unnecessary XMM->GPR->XMM moves when working with f16 types (llvm#107086).
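For context, a minimal sketch of the kind of function that produces this pattern once f16 operations are promoted to f32 (modeled on the fp16-libcalls.ll tests updated below; the function name is illustrative):

; On F16C targets, the fpext of an f16 argument is legalized through an
; i16 bitcast + any_extend + v4i32 scalar_to_vector before cvtph2ps,
; which is the DAG this combine rewrites into a bitcast of a
; v8f16 scalar_to_vector.
define float @extend_half(half %a0) {
  %f = fpext half %a0 to float
  ret float %f
}

With the fold, the half value already sitting in %xmm0 feeds vcvtph2ps directly instead of round-tripping through a GPR via vpextrw/vmovd, as the test diffs below show.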
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

This pattern tends to appear during f16 -> f32 promotion.
Partially addresses the unnecessary XMM->GPR->XMM moves when working with f16 types (#107086).

Patch is 29.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123338.diff

13 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dba38f3e1a0bc1..dd682e061e646a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58574,6 +58574,7 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
EVT VT = N->getValueType(0);
SDValue Src = N->getOperand(0);
SDLoc DL(N);
@@ -58641,6 +58642,16 @@ static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
}
+ if (VT == MVT::v4i32) {
+ SDValue HalfSrc;
+ // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
+ // to remove XMM->GPR->XMM moves.
+ if (sd_match(Src, m_AnyExt(m_BitCast(
+ m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
+ }
+
// See if we're broadcasting the scalar value, in which case just reuse that.
// Ensure the same SDValue from the SDNode use is being used.
if (VT.getScalarType() == Src.getValueType())
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index a6b3e3fd1fd169..d67cd6b62c2b92 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -708,10 +708,8 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
;
; BF16-LABEL: pr62997:
; BF16: # %bb.0:
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: vpextrw $0, %xmm1, %ecx
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
+; BF16-NEXT: vpextrw $0, %xmm1, %eax
+; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; BF16-NEXT: retq
;
; FP16-LABEL: pr62997:
@@ -1652,66 +1650,63 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; AVXNC-NEXT: pushq %r12
; AVXNC-NEXT: pushq %rbx
; AVXNC-NEXT: subq $168, %rsp
-; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVXNC-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT: # xmm0 = mem[1,0]
+; AVXNC-NEXT: callq __truncdfbf2@PLT
+; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
-; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[1,0]
-; AVXNC-NEXT: callq __truncdfbf2@PLT
-; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2@PLT
-; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
-; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
-; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
+; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d
-; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
-; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2@PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
-; AVXNC-NEXT: vmovd %ebx, %xmm0
-; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vpinsrw $1, %r13d, %xmm0, %xmm0
+; AVXNC-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
-; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; AVXNC-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0
; AVXNC-NEXT: addq $168, %rsp
; AVXNC-NEXT: popq %rbx
; AVXNC-NEXT: popq %r12
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index fdf0bf3f692d62..e911a24d830f75 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -133,11 +133,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
;
; AVX512-LABEL: complex_canonicalize_fmul_half:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpextrw $0, %xmm1, %eax
-; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index c7ef353f7f6038..efc457e35e7f37 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -59,8 +59,7 @@ define float @test2(ptr nocapture %src) nounwind {
;
; F16C-LABEL: test2:
; F16C: # %bb.0:
-; F16C-NEXT: movzwl (%rdi), %eax
-; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: retq
;
@@ -119,8 +118,7 @@ define double @test4(ptr nocapture %src) nounwind {
;
; F16C-LABEL: test4:
; F16C: # %bb.0:
-; F16C-NEXT: movzwl (%rdi), %eax
-; F16C-NEXT: vmovd %eax, %xmm0
+; F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
index 8037c783dd8e67..7d1c52cd654512 100644
--- a/llvm/test/CodeGen/X86/fp-roundeven.ll
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -50,8 +50,6 @@ define half @roundeven_f16(half %h) {
;
; AVX512F-LABEL: roundeven_f16:
; AVX512F: ## %bb.0: ## %entry
-; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 1515cd1366bc63..0d8290b120fa4f 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -9,8 +9,6 @@
define void @test_half_ceil(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_ceil:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -107,8 +105,6 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq cosf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -168,8 +164,6 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq expf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -229,8 +223,6 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq exp2f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -290,8 +282,6 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq exp10f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -349,8 +339,6 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind {
define void @test_half_fabs(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fabs:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -392,8 +380,6 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
define void @test_half_floor(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_floor:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -447,14 +433,8 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm2, %eax
-; F16C-NEXT: vpextrw $0, %xmm1, %ecx
-; F16C-NEXT: vpextrw $0, %xmm0, %edx
-; F16C-NEXT: vmovd %edx, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vmovd %ecx, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; F16C-NEXT: vmovd %eax, %xmm2
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-NEXT: callq fmaf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -542,8 +522,6 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
define void @test_half_fneg(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_fneg:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -587,8 +565,6 @@ define void @test_half_log(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq logf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -648,8 +624,6 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq log2f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -709,8 +683,6 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq log10f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -768,8 +740,6 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind {
define void @test_half_nearbyint(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_nearbyint:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -823,11 +793,7 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm1, %eax
-; F16C-NEXT: vpextrw $0, %xmm0, %ecx
-; F16C-NEXT: vmovd %ecx, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vmovd %eax, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: callq powf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -907,8 +873,6 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rsi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq __powisf2@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -976,8 +940,6 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind {
define void @test_half_rint(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_rint:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -1031,8 +993,6 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq sinf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -1090,8 +1050,6 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind {
define void @test_half_sqrt(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_sqrt:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -1146,8 +1104,6 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind {
; F16C: # %bb.0:
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq tanf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -1205,8 +1161,6 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind {
define void @test_half_trunc(half %a0, ptr %p0) nounwind {
; F16C-LABEL: test_half_trunc:
; F16C: # %bb.0:
-; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll
index 1d2f4eb39bbe62..ec099db4e7ca7f 100644
--- a/llvm/test/CodeGen/X86/half-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-darwin.ll
@@ -76,8 +76,7 @@ define float @extendhfsf(ptr %ptr) nounwind {
;
; CHECK-F16C-LABEL: extendhfsf:
; CHECK-F16C: ## %bb.0:
-; CHECK-F16C-NEXT: movzwl (%rdi), %eax
-; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-F16C-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/half-fp80-darwin.ll b/llvm/test/CodeGen/X86/half-fp80-darwin.ll
index 0ba734e66c7b2b..65a26187c5857c 100644
--- a/llvm/test/CodeGen/X86/half-fp80-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-fp80-darwin.ll
@@ -19,8 +19,7 @@ define void @extendhfxf(ptr %outptr, ptr %inptr) nounwind {
;
; CHECK-F16C-LABEL: extendhfxf:
; CHECK-F16C: ## %bb.0:
-; CHECK-F16C-NEXT: movzwl (%rsi), %eax
-; CHECK-F16C-NEXT: vmovd %eax, %xmm0
+; CHECK-F16C-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm0
; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-F16C-NEXT: flds -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 033cadae6a1e70..7bac075e486809 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -81,8 +81,7 @@ define float @test_extend32(ptr %addr) #0 {
;
; BWON-F16C-LABEL: test_extend32:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movzwl (%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: retq
;
@@ -113,8 +112,7 @@ define double @test_extend64(ptr %addr) #0 {
;
; BWON-F16C-LABEL: test_extend64:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movzwl (%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: retq
@@ -220,8 +218,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 {
;
; BWON-F16C-LABEL: test_fptosi_i64:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movzwl (%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax
; BWON-F16C-NEXT: retq
@@ -312,8 +309,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 {
;
; BWON-F16C-LABEL: test_fptoui_i64:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movzwl (%rdi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm0
+; BWON-F16C-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: vcvttss2si %xmm0, %rcx
; BWON-F16C-NEXT: movq %rcx, %rdx
@@ -851,13 +847,12 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 {
;
; BWON-F16C-LABEL: test_sitofp_fadd_i32:
; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: movzwl (%rsi), %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm1
+; BWON-F16C-NEXT: vpinsr...
[truncated]
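As a second illustration of the effect (this mirrors test2 in cvt16.ll above; the function name is illustrative), the memory case improves in the same way:

; After the fold, the loaded half feeds a v8f16 scalar_to_vector, which
; selects to a single vpinsrw from memory instead of movzwl + vmovd.
define float @load_extend_half(ptr %src) {
  %h = load half, ptr %src
  %f = fpext half %h to float
  ret float %f
}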
LGTM.
… -> (v4i32 bitcast (shuffle (v8f16 scalar_to_vector))) Extension to llvm#123338
This pattern tends to appear during f16 -> f32 promotion.
Partially addresses the unnecessary XMM->GPR->XMM moves when working with f16 types (#107086).