[X86] Fold (v4i32 (scalar_to_vector (i32 (zext (bitcast (f16)))))) -> (v4i32 bitcast (shuffle (v8f16 scalar_to_vector))) #126033
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Extension to #123338

Patch is 48.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126033.diff

10 Files Affected:
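For context, here is the core of the new fold as it appears in combineSCALAR_TO_VECTOR in the hunk below (a condensed excerpt, not extra code; Src, HalfSrc, DL, VT and Subtarget are the existing locals of that function). The anyext form already handled by #123338 needs no zeroing because the upper bits of the i32 lane are undefined, whereas the zext form must force v8f16 element 1 (the high half of i32 lane 0 on little-endian) to zero, which the {0, 8, -1, ...} shuffle does by sourcing that element from a zero vector:

// zext case: lane 0 comes from the f16 scalar, lane 1 from a zero vector and
// the remaining lanes are undef; bitcasting v8f16 -> v4i32 then yields
// lane 0 == zext(bitcast f16 -> i16) without an XMM->GPR->XMM round trip.
if (sd_match(Src, m_ZExt(m_BitCast(m_AllOf(m_SpecificVT(MVT::f16),
                                           m_Value(HalfSrc)))))) {
  SDValue R = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc);
  R = DAG.getVectorShuffle(MVT::v8f16, DL, R,
                           getZeroVector(MVT::v8f16, Subtarget, DAG, DL),
                           {0, 8, -1, -1, -1, -1, -1, -1});
  return DAG.getBitcast(VT, R);
}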
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb210..b3df32114e331f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58721,12 +58721,20 @@ static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::v4i32) {
SDValue HalfSrc;
- // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
+ // Combine (v4i32 (scalar_to_vector (i32 (a/zext (bitcast (f16))))))
// to remove XMM->GPR->XMM moves.
if (sd_match(Src, m_AnyExt(m_BitCast(
m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
return DAG.getBitcast(
VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
+ if (sd_match(Src, m_ZExt(m_BitCast(m_AllOf(m_SpecificVT(MVT::f16),
+ m_Value(HalfSrc)))))) {
+ SDValue R = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc);
+ R = DAG.getVectorShuffle(MVT::v8f16, DL, R,
+ getZeroVector(MVT::v8f16, Subtarget, DAG, DL),
+ {0, 8, -1, -1, -1, -1, -1, -1});
+ return DAG.getBitcast(VT, R);
+ }
}
// See if we're broadcasting the scalar value, in which case just reuse that.
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 556b0deaf4c830..70f04fb6df30ae 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -43,13 +43,11 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
;
; AVX512-LABEL: v_test_canonicalize__half:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movzwl (%rdi), %eax
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vcvtph2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -144,9 +142,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vcvtph2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
@@ -228,9 +224,7 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
; AVX512-LABEL: v_test_canonicalize_v2half:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index c7f5e13cb74647..bd31e7b484a50d 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1840,9 +1840,7 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm9
+; AVX512-NEXT: vcvtph2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index 58c4f71892e902..8c209c311602aa 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -50,9 +50,7 @@ define half @round_f16(half %h) {
;
; AVX512F-LABEL: round_f16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpextrw $0, %xmm0, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
; AVX512F-NEXT: vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
index 6a6b86e8efa7c3..0597f4327e43d3 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll
@@ -32,15 +32,11 @@ define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_oeq_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovnel %esi, %eax
; AVX-NEXT: cmovpl %esi, %eax
; AVX-NEXT: retq
@@ -96,15 +92,11 @@ define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ogt_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovbel %esi, %eax
; AVX-NEXT: retq
;
@@ -157,15 +149,11 @@ define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_oge_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovbl %esi, %eax
; AVX-NEXT: retq
;
@@ -220,13 +208,9 @@ define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_olt_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm1, %ecx
-; AVX-NEXT: vpextrw $0, %xmm0, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vucomiss %xmm0, %xmm1
; AVX-NEXT: cmovbel %esi, %eax
@@ -283,13 +267,9 @@ define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ole_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm1, %ecx
-; AVX-NEXT: vpextrw $0, %xmm0, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vucomiss %xmm0, %xmm1
; AVX-NEXT: cmovbl %esi, %eax
@@ -344,15 +324,11 @@ define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_one_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovel %esi, %eax
; AVX-NEXT: retq
;
@@ -405,15 +381,11 @@ define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ord_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovpl %esi, %eax
; AVX-NEXT: retq
;
@@ -466,15 +438,11 @@ define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ueq_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovnel %esi, %eax
; AVX-NEXT: retq
;
@@ -529,13 +497,9 @@ define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ugt_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm1, %ecx
-; AVX-NEXT: vpextrw $0, %xmm0, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vucomiss %xmm0, %xmm1
; AVX-NEXT: cmovael %esi, %eax
@@ -592,13 +556,9 @@ define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_uge_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm1, %ecx
-; AVX-NEXT: vpextrw $0, %xmm0, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vucomiss %xmm0, %xmm1
; AVX-NEXT: cmoval %esi, %eax
@@ -653,15 +613,11 @@ define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ult_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovael %esi, %eax
; AVX-NEXT: retq
;
@@ -714,15 +670,11 @@ define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ule_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmoval %esi, %eax
; AVX-NEXT: retq
;
@@ -776,15 +728,11 @@ define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_une_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %esi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovnel %edi, %eax
; AVX-NEXT: cmovpl %edi, %eax
; AVX-NEXT: retq
@@ -840,15 +788,11 @@ define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_uno_q:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vucomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vucomiss %xmm1, %xmm0
; AVX-NEXT: cmovnpl %esi, %eax
; AVX-NEXT: retq
;
@@ -902,15 +846,11 @@ define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_oeq_s:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcomiss %xmm1, %xmm0
; AVX-NEXT: cmovnel %esi, %eax
; AVX-NEXT: cmovpl %esi, %eax
; AVX-NEXT: retq
@@ -966,15 +906,11 @@ define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ogt_s:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcomiss %xmm1, %xmm0
; AVX-NEXT: cmovbel %esi, %eax
; AVX-NEXT: retq
;
@@ -1027,15 +963,11 @@ define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_oge_s:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm0, %ecx
-; AVX-NEXT: vpextrw $0, %xmm1, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX-NEXT: vcomiss %xmm0, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX-NEXT: vcomiss %xmm1, %xmm0
; AVX-NEXT: cmovbl %esi, %eax
; AVX-NEXT: retq
;
@@ -1090,13 +1022,9 @@ define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_olt_s:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm1, %ecx
-; AVX-NEXT: vpextrw $0, %xmm0, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX-NEXT: vcomiss %xmm0, %xmm1
; AVX-NEXT: cmovbel %esi, %eax
@@ -1153,13 +1081,9 @@ define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 {
; AVX-LABEL: test_f16_ole_s:
; AVX: # %bb.0:
; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vpextrw $0, %xmm1, %ecx
-; AVX-NEXT: vpextrw $0, %xmm0, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX-NEXT: movzwl %c...
[truncated]
How about avoiding generating the zext? #126194
How about avoiding generating the zext? #126194
Ok, it's still useful for strict FP cases.
; AVX512-NEXT: vmovd %eax, %xmm1
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vcvtph2ps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
The memory fold is great, but I think movzwl + vmovd is better than vpinsrw + vpmovzxwq.
Agreed, I'll try to rework it so the xmm register is zeroed out before the vpinsrw:
vpxor %xmm0, %xmm0, %xmm0
vpinsrw $0, (%rdi), %xmm0, %xmm0
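For illustration, a hypothetical sketch (not the actual follow-up) of how that rework could look at the DAG level: insert the f16 into an all-zero v8f16 with INSERT_VECTOR_ELT instead of scalar_to_vector plus shuffle, so instruction selection can emit the vpxor + vpinsrw pair above. getZeroVector is the existing X86 helper used in the hunk; the rest is standard SelectionDAG API, and HalfSrc/Subtarget/DL/VT are the same locals as before.

// Hypothetical alternative: build the half directly into a zeroed vector so
// the i32 lane is already zero-extended, selecting to vpxor + vpinsrw.
SDValue Zero = getZeroVector(MVT::v8f16, Subtarget, DAG, DL);
SDValue R = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8f16, Zero, HalfSrc,
                        DAG.getVectorIdxConstant(0, DL));
return DAG.getBitcast(VT, R);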
… a hidden broadcast pattern (#126517) lowerShuffleAsBroadcast only matches a known-splat shuffle mask, but we can use the isShuffleEquivalent/IsElementEquivalent helpers to attempt to find a hidden broadcast-able shuffle pattern. This requires an extension to IsElementEquivalent to peek through bitcasts to match against wider shuffles - these typically appear during shuffle lowering where we've widened a preceding shuffle, often to a vector concatenation etc. Amazingly I hit this while yak shaving #126033 .......
… -> (v4i32 bitcast (shuffle (v8f16 scalar_to_vector))) Extension to llvm#123338
Extension to #123338