[X86] Fold (f16 bitcast extract_vectorelt(v,0)) to (extract_vectorelt (v8f16 bitcast(v,0))) #125877
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Also handles possible truncations from i32 to i16.

Cleans up some of the poor codegen identified in #98630

Patch is 85.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125877.diff

14 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6cf6061deba702..b0cebea5f29880 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45160,6 +45160,19 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}
+ // Attempt to peek through f16 bitcasted extractions hidden by truncation.
+ if (VT == MVT::f16 && SrcVT == MVT::i16) {
+ SDValue Src = peekThroughTruncates(N0);
+ if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Src.getOperand(0).getValueSizeInBits() == 128 &&
+ isNullConstant(Src.getOperand(1))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ }
+
// Since MMX types are special and don't usually play with other vector types,
// it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions.
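For illustration, here is a minimal hypothetical IR sketch (these functions are not tests from this patch) of the two shapes the new combine targets: an f16 produced by bitcasting lane 0 of a 128-bit vector, either directly from an i16 extract or through an i32-to-i16 truncation.

; Hypothetical examples (not from this patch) of the patterns the new
; combine targets once lowered for x86: an f16 bitcast of the lane-0
; element of a 128-bit vector, with or without an i32 -> i16 truncation.

define half @extract0_bitcast(<8 x i16> %v) {
  %e = extractelement <8 x i16> %v, i64 0   ; i16 extract of lane 0
  %h = bitcast i16 %e to half               ; (f16 bitcast (extract_vector_elt v, 0))
  ret half %h
}

define half @extract0_trunc_bitcast(<4 x i32> %v) {
  %e = extractelement <4 x i32> %v, i64 0   ; i32 extract of lane 0
  %t = trunc i32 %e to i16                  ; truncation peeked through by the combine
  %h = bitcast i16 %t to half
  ret half %h
}

With the fold, the bitcast is rewritten as an extract of element 0 from a v8f16 bitcast of the source vector, so the value can stay in an XMM register instead of bouncing through a GPR; the vmovw/vmovd plus vpinsrw pairs removed in the test diffs below show the effect.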
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index d67cd6b62c2b92..4d269cfff2afe6 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -82,8 +82,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-NEXT: vmovd %eax, %xmm1
; X86-NEXT: vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add2:
@@ -110,8 +108,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; FP16-NEXT: vmovd %eax, %xmm1
; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: add2:
@@ -124,8 +120,6 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm1
; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
-; AVXNC-NEXT: vmovd %xmm0, %eax
-; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT: retq
%add = fadd bfloat %a, %b
ret bfloat %add
@@ -432,8 +426,6 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; X86-NEXT: vmovw %xmm0, %eax
-; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
;
; SSE2-LABEL: add_constant2:
@@ -454,8 +446,6 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; FP16-NEXT: vmovd %eax, %xmm0
; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %eax, %xmm0
; FP16-NEXT: retq
;
; AVXNC-LABEL: add_constant2:
@@ -465,8 +455,6 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; AVXNC-NEXT: vmovd %eax, %xmm0
; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
-; AVXNC-NEXT: vmovd %xmm0, %eax
-; AVXNC-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT: retq
%add = fadd bfloat %a, 1.0
ret bfloat %add
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 04087c4f0dd5ed..556b0deaf4c830 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -154,8 +154,6 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
@@ -239,15 +237,11 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vmovd %xmm0, (%rdi)
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index bfff6ef41dbe00..fbc3fbf1055f45 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1812,212 +1812,186 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind
;
; AVX512-LABEL: test_fmaximumnum_v4f16:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $88, %rsp
+; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovdqa %xmm1, %xmm4
-; AVX512-NEXT: vmovdqa %xmm0, %xmm6
+; AVX512-NEXT: vmovdqa %xmm0, %xmm8
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm2
+; AVX512-NEXT: vucomiss %xmm0, %xmm1
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm9
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm9
; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vxorps %xmm10, %xmm10, %xmm10
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,3,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
+; AVX512-NEXT: vucomiss %xmm3, %xmm2
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
+; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1}
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm3
+; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5
-; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm2
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm3, %xmm3 {%k1}
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm4[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm0
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm4[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm2
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512-NEXT: vucomiss %xmm5, %xmm5
+; AVX512-NEXT: vshufpd {{.*#+}} xmm7 = xmm8[1,0]
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vucomiss %xmm7, %xmm7
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm15
-; AVX512-NEXT: vcvtph2ps %xmm15, %xmm5
-; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm5
+; AVX512-NEXT: vmovss %xmm2, %xmm7, %xmm7 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm14
+; AVX512-NEXT: vcvtph2ps %xmm14, %xmm7
+; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: vucomiss %xmm2, %xmm7
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vmulss %xmm3, %xmm9, %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1
+; AVX512-NEXT: vmovss %xmm7, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vxorps %xmm15, %xmm15, %xmm15
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm5
+; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm3
+; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm1, %eax
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovd %xmm0, %ecx
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm15[1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2
+; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7]
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7]
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm2
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm11
-; AVX512-NEXT: vcvtph2ps %xmm11, %xmm3
-; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX512-NEXT: vucomiss %xmm2, %xmm3
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12
+; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1
+; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13
+; AVX512-NEXT: vcvtph2ps %xmm13, %xmm6
+; AVX512-NEXT: vucomiss %xmm6, %xmm1
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1}
+; AVX512-NEXT: vmovss %xmm1, %xmm6, %xmm6 {%k1}
; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vucomiss %xmm3, %xmm3
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm8[1,1,3,3]
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vucomiss %xmm1, %xmm1
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k2
-; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2}
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm7
-; AVX512-NEXT: vcvtph2ps %xmm7, %xmm3
+; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2}
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm10
+; AVX512-NEXT: vcvtph2ps %xmm10, %xmm3
; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm12
-; AVX512-NEXT: vcvtph2ps %xmm12, %xmm0
-; AVX512-NEXT: vucomiss %xmm0, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm11
+; AVX512-NEXT: vcvtph2ps %xmm11, %xmm5
+; AVX512-NEXT: vucomiss %xmm5, %xmm3
; AVX512-NEXT: seta %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2
-; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm14
-; AVX512-NEXT: vmovd %xmm14, %eax
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13
-; AVX512-NEXT: vmovd %xmm13, %ecx
-; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT: vmovss %xmm3, %xmm5, %xmm5 {%k1}
; AVX512-NEXT: vcvtph2ps %xmm4, %xmm0
; AVX512-NEXT: vucomiss %xmm0, %xmm0
; AVX512-NEXT: setp %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vcvtph2ps %xmm6, %xmm2
-; ...
[truncated]
… (v8f16 bitcast(v,0))) Also handles possible truncations from i32 to i16. Cleans up some of the poor codegen identified in llvm#98630
(force-pushed from d773d80 to ee12cc2)
LGTM
LGTM, thanks for the fix!
… (v8f16 bitcast(v,0))) (llvm#125877) Also handles possible truncations from i32 to i16. Cleans up some of the poor codegen identified in llvm#98630