[X86] mayFoldIntoStore - peek through oneuse bitcast users to find a store node #123366
Conversation
mayFoldIntoStore currently just checks the direct (oneuse) user of an SDValue to see whether it is stored, which prevents cases where the value is bitcast before being stored (the bitcast is usually removed later). This patch peeks through a chain of oneuse BITCAST nodes to see whether the value is eventually stored. The main use of mayFoldIntoStore is v8i16 EXTRACT_VECTOR_ELT lowering, which will only use PEXTRW/PEXTRB for index-0 extractions (vs the faster MOVD) if the extracted value will be folded into a store on SSE41+ targets. Fixes llvm#107086
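For illustration, here is a minimal standalone sketch of that single-use chain walk on a toy node type. The Node struct, the Opcode enum and the Users vector are hypothetical stand-ins for SDNode, ISD::BITCAST and ISD::isNormalStore, not the actual SelectionDAG API:

#include <vector>

// Toy stand-ins for SDNode and its opcodes (illustrative only).
enum Opcode { kBitcast, kStore, kOther };

struct Node {
  Opcode Opc;
  std::vector<Node *> Users; // nodes that consume this node's value
};

// Returns true if the value defined by N feeds a store, possibly through
// a chain of single-use bitcasts, mirroring the patched mayFoldIntoStore.
static bool mayFoldIntoStore(const Node *N) {
  if (N->Users.size() != 1) // the value itself must have exactly one use
    return false;
  const Node *User = N->Users.front();
  while (User->Opc == kBitcast) { // peek through oneuse bitcasts
    if (User->Users.size() != 1)
      return false;
    User = User->Users.front();
  }
  return User->Opc == kStore;
}

The early exit on a multi-use bitcast matters: if any intermediate bitcast has more than one user, the value cannot simply be folded away into the store, so the walk gives up rather than report a fold that lowering cannot perform.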
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon). Full diff: https://github.com/llvm/llvm-project/pull/123366.diff. 11 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 84736f18011a9d..8fe2781b148fcf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2812,7 +2812,16 @@ bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
}
bool X86::mayFoldIntoStore(SDValue Op) {
- return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->user_begin());
+ if (!Op.hasOneUse())
+ return false;
+ // Peek through (oneuse) bitcast users
+ SDNode *User = *Op->user_begin();
+ while (User->getOpcode() == ISD::BITCAST) {
+ if (!User->hasOneUse())
+ return false;
+ User = *User->user_begin();
+ }
+ return ISD::isNormalStore(User);
}
bool X86::mayFoldIntoZeroExtend(SDValue Op) {
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index e911a24d830f75..04087c4f0dd5ed 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -53,8 +53,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
%val = load half, half addrspace(1)* %out
diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll
index efc457e35e7f37..c6c088297c0ea6 100644
--- a/llvm/test/CodeGen/X86/cvt16.ll
+++ b/llvm/test/CodeGen/X86/cvt16.ll
@@ -34,8 +34,7 @@ define void @test1(float %src, ptr %dest) nounwind {
; F16C-LABEL: test1:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; SOFTFLOAT-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index bf93c8a1f5b511..e1b677e8550944 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -316,8 +316,7 @@ define void @fptrunc_float_to_f16(ptr %val, ptr%ret) nounwind strictfp {
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movw %ax, (%rsi)
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; X86-LABEL: fptrunc_float_to_f16:
@@ -411,8 +410,7 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: movw %ax, (%rdi)
+; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
;
; X86-LABEL: fsqrt_f16:
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 0d8290b120fa4f..3af8b1aec1feb2 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -12,8 +12,7 @@ define void @test_half_ceil(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_ceil:
@@ -108,8 +107,7 @@ define void @test_half_cos(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq cosf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -167,8 +165,7 @@ define void @test_half_exp(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq expf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -226,8 +223,7 @@ define void @test_half_exp2(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq exp2f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -285,8 +281,7 @@ define void @test_half_exp10(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq exp10f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -342,8 +337,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_fabs:
@@ -383,8 +377,7 @@ define void @test_half_floor(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_floor:
@@ -438,8 +431,7 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
; F16C-NEXT: callq fmaf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -525,8 +517,7 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_fneg:
@@ -568,8 +559,7 @@ define void @test_half_log(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq logf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -627,8 +617,7 @@ define void @test_half_log2(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq log2f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -686,8 +675,7 @@ define void @test_half_log10(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq log10f@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -743,8 +731,7 @@ define void @test_half_nearbyint(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_nearbyint:
@@ -797,8 +784,7 @@ define void @test_half_pow(half %a0, half %a1, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: callq powf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -876,8 +862,7 @@ define void @test_half_powi(half %a0, i32 %a1, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq __powisf2@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -943,8 +928,7 @@ define void @test_half_rint(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_rint:
@@ -996,8 +980,7 @@ define void @test_half_sin(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq sinf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -1053,8 +1036,7 @@ define void @test_half_sqrt(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_sqrt:
@@ -1107,8 +1089,7 @@ define void @test_half_tan(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: callq tanf@PLT
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rbx)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
;
@@ -1164,8 +1145,7 @@ define void @test_half_trunc(half %a0, ptr %p0) nounwind {
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; F16C-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; FP16-LABEL: test_half_trunc:
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index eae9b25e43e06f..0f73129d984bd9 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -176,8 +176,7 @@ define void @float_to_half(float %0) strictfp {
; X86-F16C: # %bb.0:
; X86-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X86-F16C-NEXT: vmovd %xmm0, %eax
-; X86-F16C-NEXT: movw %ax, a
+; X86-F16C-NEXT: vpextrw $0, %xmm0, a
; X86-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: float_to_half:
@@ -197,9 +196,8 @@ define void @float_to_half(float %0) strictfp {
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT: vmovd %xmm0, %eax
-; X64-F16C-NEXT: movq a@GOTPCREL(%rip), %rcx
-; X64-F16C-NEXT: movw %ax, (%rcx)
+; X64-F16C-NEXT: movq a@GOTPCREL(%rip), %rax
+; X64-F16C-NEXT: vpextrw $0, %xmm0, (%rax)
; X64-F16C-NEXT: retq
%2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
store half %2, ptr @a, align 2
@@ -354,8 +352,7 @@ define void @add() strictfp {
; X86-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X86-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X86-F16C-NEXT: vmovd %xmm0, %eax
-; X86-F16C-NEXT: movw %ax, c
+; X86-F16C-NEXT: vpextrw $0, %xmm0, c
; X86-F16C-NEXT: retl
;
; X64-NOF16C-LABEL: add:
@@ -392,9 +389,8 @@ define void @add() strictfp {
; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; X64-F16C-NEXT: vmovd %xmm0, %eax
-; X64-F16C-NEXT: movq c@GOTPCREL(%rip), %rcx
-; X64-F16C-NEXT: movw %ax, (%rcx)
+; X64-F16C-NEXT: movq c@GOTPCREL(%rip), %rax
+; X64-F16C-NEXT: vpextrw $0, %xmm0, (%rax)
; X64-F16C-NEXT: retq
%1 = load half, ptr @a, align 2
%2 = tail call float @llvm.experimental.constrained.fpext.f32.f16(half %1, metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll
index 7388429143df56..3cbf5c11235ea8 100644
--- a/llvm/test/CodeGen/X86/half-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-darwin.ll
@@ -16,8 +16,7 @@ define void @truncsfhf(float %in, ptr %ptr) nounwind {
; CHECK-F16C-LABEL: truncsfhf:
; CHECK-F16C: ## %bb.0:
; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-F16C-NEXT: vmovd %xmm0, %eax
-; CHECK-F16C-NEXT: movw %ax, (%rdi)
+; CHECK-F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; CHECK-F16C-NEXT: retq
;
; CHECK-FP16-LABEL: truncsfhf:
@@ -108,8 +107,7 @@ define void @strict_truncsfhf(float %in, ptr %ptr) nounwind strictfp {
; CHECK-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-F16C-NEXT: vmovd %xmm0, %eax
-; CHECK-F16C-NEXT: movw %ax, (%rdi)
+; CHECK-F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; CHECK-F16C-NEXT: retq
;
; CHECK-FP16-LABEL: strict_truncsfhf:
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 7bac075e486809..6e7f109a5da5c2 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -146,8 +146,7 @@ define void @test_trunc32(float %in, ptr %addr) #0 {
; BWON-F16C-LABEL: test_trunc32:
; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: movw %ax, (%rdi)
+; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc32:
@@ -265,8 +264,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 {
; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: movw %ax, (%rsi)
+; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi)
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_sitofp_i64:
@@ -398,8 +396,7 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 {
; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: .LBB10_3:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: movw %ax, (%rsi)
+; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi)
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_uitofp_i64:
@@ -1075,8 +1072,7 @@ define void @main.158() #0 {
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; BWON-F16C-NEXT: .LBB20_2: # %entry
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: movw %ax, (%rax)
+; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rax)
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: main.158:
diff --git a/llvm/test/CodeGen/X86/pr91005.ll b/llvm/test/CodeGen/X86/pr91005.ll
index 97fd1ce4568826..d73cd7482c3904 100644
--- a/llvm/test/CodeGen/X86/pr91005.ll
+++ b/llvm/test/CodeGen/X86/pr91005.ll
@@ -16,8 +16,7 @@ define void @PR91005(ptr %0) minsize {
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: movw %ax, (%rdi)
+; CHECK-NEXT: vpextrw $0, %xmm0, (%rdi)
; CHECK-NEXT: .LBB0_2: # %common.ret
; CHECK-NEXT: retq
%2 = bitcast <2 x half> poison to <2 x i16>
diff --git a/llvm/test/CodeGen/X86/pr95278.ll b/llvm/test/CodeGen/X86/pr95278.ll
index 32783696f4692a..104fc04d68cdb2 100644
--- a/llvm/test/CodeGen/X86/pr95278.ll
+++ b/llvm/test/CodeGen/X86/pr95278.ll
@@ -8,8 +8,7 @@ define void @PR95278(ptr %p0, ptr %p1) {
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: vpextrw $0, %xmm0, (%rsi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%load = load <1024 x half>, ptr %p0, align 2
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index ec916060563a75..4e50b56323311f 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -2596,15 +2596,13 @@ define void @store_cvt_f32_to_i16(float %a0, ptr %a1) nounwind {
; F16C-LABEL: store_cvt_f32_to_i16:
; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
-; F16C-NEXT: movw %ax, (%rdi)
+; F16C-NEXT: vpextrw $0, %xmm0, (%rdi)
; F16C-NEXT: retq
;
; AVX512-LABEL: store_cvt_f32_to_i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
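All of the test updates above show the same win: a vmovd plus movw pair collapsing into a single vpextrw once the store becomes visible through the bitcast. As a recap of the decision rule from the summary, here is a hypothetical, simplified sketch of the index-0 extract choice (not the verbatim X86ISelLowering logic, which involves more conditions):

// Hypothetical sketch of the index-0 EXTRACT_VECTOR_ELT lowering choice
// described in the summary (simplified, not the real X86ISelLowering code).
enum class Elt0Lowering { MOVD, PEXTR };

Elt0Lowering chooseElt0Lowering(bool HasSSE41, bool FoldsIntoStore) {
  // On SSE4.1+ targets PEXTRW/PEXTRB can write straight to memory, so they
  // are only preferred when the extracted value feeds a store; otherwise
  // MOVD moves element 0 into a GPR more cheaply.
  if (HasSSE41 && FoldsIntoStore)
    return Elt0Lowering::PEXTR;
  return Elt0Lowering::MOVD;
}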
LGTM
LGTM.
@abhishek-kaushik22 Thanks for the merge, but I'd prefer to handle these myself in future - I only merge upstream when I know I'm going to have access to my PC for an hour or so afterward in the case of buildbot failures etc.