[DAG] Don't split f64 constant stores if the fp imm is legal #74622
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If the target can generate a specific fp immediate constant, then don't split the store into 2 x i32 stores.

Another cleanup step for #74304

Full diff: https://github.com/llvm/llvm-project/pull/74622.diff

11 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2a3425a42607e..0c5b2894a2e76 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20911,7 +20911,8 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
// transform should not be done in this case.
SDValue Tmp;
- switch (CFP->getSimpleValueType(0).SimpleTy) {
+ MVT SimpleVT = CFP->getSimpleValueType(0);
+ switch (SimpleVT.SimpleTy) {
default:
llvm_unreachable("Unknown FP type");
case MVT::f16: // We don't do this for these yet.
@@ -20940,7 +20941,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
Ptr, ST->getMemOperand());
}
- if (ST->isSimple() &&
+ if (ST->isSimple() && !TLI.isFPImmLegal(CFP->getValueAPF(), SimpleVT) &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
// Many FP stores are not made apparent until after legalize, e.g. for
// argument passing. Since this is so common, custom legalize the
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7a54141fa711a..5e1f9fbcdde0a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -461,7 +461,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
ST->getOriginalAlign(), MMOFlags, AAInfo);
}
- if (CFP->getValueType(0) == MVT::f64) {
+ if (CFP->getValueType(0) == MVT::f64 &&
+ !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) {
// If this target supports 64-bit registers, do a single 64-bit store.
if (TLI.isTypeLegal(MVT::i64)) {
SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
diff --git a/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll b/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
index e32f19ef67452..dabbb1e38a86b 100644
--- a/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
+++ b/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
@@ -104,10 +104,8 @@ define arm_aapcs_vfpcc void @test_1double_nosplit([4 x float], [4 x double], [3
; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, sp, #8
-; CHECK-NEXT: movw r1, #0
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: movt r1, #16368
-; CHECK-NEXT: strd r0, r1, [sp]
+; CHECK-NEXT: vmov.f64 d16, #1.000000e+00
+; CHECK-NEXT: vstr d16, [sp]
; CHECK-NEXT: bl test_1double_nosplit
; CHECK-NEXT: add sp, sp, #8
; CHECK-NEXT: pop {r11, pc}
@@ -138,10 +136,8 @@ define arm_aapcs_vfpcc void @test_1double_misaligned([4 x double], [4 x double],
; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: movw r1, #0
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: movt r1, #16368
-; CHECK-NEXT: strd r0, r1, [sp, #8]
+; CHECK-NEXT: vmov.f64 d16, #1.000000e+00
+; CHECK-NEXT: vstr d16, [sp, #8]
; CHECK-NEXT: bl test_1double_misaligned
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: pop {r11, pc}
diff --git a/llvm/test/CodeGen/ARM/ha-alignstack-call.ll b/llvm/test/CodeGen/ARM/ha-alignstack-call.ll
index e861fe397f849..7e2a911c89281 100644
--- a/llvm/test/CodeGen/ARM/ha-alignstack-call.ll
+++ b/llvm/test/CodeGen/ARM/ha-alignstack-call.ll
@@ -300,16 +300,16 @@ entry:
ret double %call
}
; CHECK-LABEL: g2_1_call:
-; CHECK: movw r0, #0
-; CHECK: mov r1, #0
-; CHECK: movt r0, #16352
-; CHECK: str r1, [sp]
-; CHECK: stmib sp, {r0, r1}
-; CHECK: str r1, [sp, #12]
-; CHECK: str r1, [sp, #16]
-; CHECK: str r1, [sp, #20]
-; CHECK: str r1, [sp, #24]
-; CHECK: str r1, [sp, #28]
+; CHECK: vmov.f64 d16, #5.000000e-01
+; CHECK: mov r0, #0
+; CHECK: str r0, [sp, #8]
+; CHECK: str r0, [sp, #12]
+; CHECK: str r0, [sp, #16]
+; CHECK: vmov.i32 d0, #0x0
+; CHECK: str r0, [sp, #20]
+; CHECK: str r0, [sp, #24]
+; CHECK: str r0, [sp, #28]
+; CHECK: vstr d16, [sp]
; CHECK: bl g2_1
; pass in memory, alignment 8
diff --git a/llvm/test/CodeGen/Mips/pr49200.ll b/llvm/test/CodeGen/Mips/pr49200.ll
index 80a2bdd4e95ee..2a9f207b29e58 100644
--- a/llvm/test/CodeGen/Mips/pr49200.ll
+++ b/llvm/test/CodeGen/Mips/pr49200.ll
@@ -11,11 +11,10 @@ define dso_local void @foo() #0 {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addiusp -24
-; CHECK-NEXT: li16 $2, 0
-; CHECK-NEXT: sw $2, 4($sp)
-; CHECK-NEXT: sw $2, 0($sp)
-; CHECK-NEXT: sw $2, 12($sp)
-; CHECK-NEXT: sw $2, 8($sp)
+; CHECK-NEXT: mtc1 $zero, $f0
+; CHECK-NEXT: mthc1 $zero, $f0
+; CHECK-NEXT: sdc1 $f0, 0($sp)
+; CHECK-NEXT: sdc1 $f0, 8($sp)
; CHECK-NEXT: ldc1 $f0, 0($sp)
; CHECK-NEXT: sdc1 $f0, 16($sp)
; CHECK-NEXT: addiusp 24
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 32e45adcb94d7..5f77e2cb46cbf 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -865,9 +865,9 @@ define double @f19() #0 {
; X87-NEXT: .cfi_def_cfa_offset 32
; X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
; X87-NEXT: fstpl {{[0-9]+}}(%esp)
+; X87-NEXT: fld1
+; X87-NEXT: fstpl (%esp)
; X87-NEXT: wait
-; X87-NEXT: movl $1072693248, {{[0-9]+}}(%esp) # imm = 0x3FF00000
-; X87-NEXT: movl $0, (%esp)
; X87-NEXT: calll fmod
; X87-NEXT: addl $28, %esp
; X87-NEXT: .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll
index 44c57c54ba023..ec128fc6686c8 100644
--- a/llvm/test/CodeGen/X86/ldexp.ll
+++ b/llvm/test/CodeGen/X86/ldexp.ll
@@ -91,10 +91,11 @@ define double @ldexp_f64(i8 zeroext %x) {
;
; WIN32-LABEL: ldexp_f64:
; WIN32: # %bb.0:
+; WIN32-NEXT: subl $12, %esp
; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: pushl $1072693248 # imm = 0x3FF00000
-; WIN32-NEXT: pushl $0
+; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT: fld1
+; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _ldexp
; WIN32-NEXT: addl $12, %esp
; WIN32-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
index c6eecdcdf99cc..480a0970bd39d 100644
--- a/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/llvm/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -18,26 +18,17 @@ define void @bork(ptr nocapture align 4 %dst) nounwind {
; SLOW_32-LABEL: bork:
; SLOW_32: # %bb.0:
; SLOW_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SLOW_32-NEXT: movl $0, 4(%eax)
-; SLOW_32-NEXT: movl $0, (%eax)
-; SLOW_32-NEXT: movl $0, 12(%eax)
-; SLOW_32-NEXT: movl $0, 8(%eax)
-; SLOW_32-NEXT: movl $0, 20(%eax)
-; SLOW_32-NEXT: movl $0, 16(%eax)
-; SLOW_32-NEXT: movl $0, 28(%eax)
-; SLOW_32-NEXT: movl $0, 24(%eax)
-; SLOW_32-NEXT: movl $0, 36(%eax)
-; SLOW_32-NEXT: movl $0, 32(%eax)
-; SLOW_32-NEXT: movl $0, 44(%eax)
-; SLOW_32-NEXT: movl $0, 40(%eax)
-; SLOW_32-NEXT: movl $0, 52(%eax)
-; SLOW_32-NEXT: movl $0, 48(%eax)
-; SLOW_32-NEXT: movl $0, 60(%eax)
-; SLOW_32-NEXT: movl $0, 56(%eax)
-; SLOW_32-NEXT: movl $0, 68(%eax)
-; SLOW_32-NEXT: movl $0, 64(%eax)
-; SLOW_32-NEXT: movl $0, 76(%eax)
-; SLOW_32-NEXT: movl $0, 72(%eax)
+; SLOW_32-NEXT: xorps %xmm0, %xmm0
+; SLOW_32-NEXT: movsd %xmm0, 72(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 64(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 56(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 48(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 40(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 32(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 24(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 16(%eax)
+; SLOW_32-NEXT: movsd %xmm0, 8(%eax)
+; SLOW_32-NEXT: movsd %xmm0, (%eax)
; SLOW_32-NEXT: retl
;
; SLOW_64-LABEL: bork:
diff --git a/llvm/test/CodeGen/X86/pr38738.ll b/llvm/test/CodeGen/X86/pr38738.ll
index 753b7ce33d2be..205849e7d05db 100644
--- a/llvm/test/CodeGen/X86/pr38738.ll
+++ b/llvm/test/CodeGen/X86/pr38738.ll
@@ -130,22 +130,15 @@ define void @tryset(ptr nocapture %x) {
; X86SSE2-LABEL: tryset:
; X86SSE2: # %bb.0:
; X86SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86SSE2-NEXT: movl $0, 4(%eax)
-; X86SSE2-NEXT: movl $0, (%eax)
-; X86SSE2-NEXT: movl $0, 12(%eax)
-; X86SSE2-NEXT: movl $0, 8(%eax)
-; X86SSE2-NEXT: movl $0, 20(%eax)
-; X86SSE2-NEXT: movl $0, 16(%eax)
-; X86SSE2-NEXT: movl $0, 28(%eax)
-; X86SSE2-NEXT: movl $0, 24(%eax)
-; X86SSE2-NEXT: movl $0, 36(%eax)
-; X86SSE2-NEXT: movl $0, 32(%eax)
-; X86SSE2-NEXT: movl $0, 44(%eax)
-; X86SSE2-NEXT: movl $0, 40(%eax)
-; X86SSE2-NEXT: movl $0, 52(%eax)
-; X86SSE2-NEXT: movl $0, 48(%eax)
-; X86SSE2-NEXT: movl $0, 60(%eax)
-; X86SSE2-NEXT: movl $0, 56(%eax)
+; X86SSE2-NEXT: xorps %xmm0, %xmm0
+; X86SSE2-NEXT: movsd %xmm0, 56(%eax)
+; X86SSE2-NEXT: movsd %xmm0, 48(%eax)
+; X86SSE2-NEXT: movsd %xmm0, 40(%eax)
+; X86SSE2-NEXT: movsd %xmm0, 32(%eax)
+; X86SSE2-NEXT: movsd %xmm0, 24(%eax)
+; X86SSE2-NEXT: movsd %xmm0, 16(%eax)
+; X86SSE2-NEXT: movsd %xmm0, 8(%eax)
+; X86SSE2-NEXT: movsd %xmm0, (%eax)
; X86SSE2-NEXT: retl
;
; X64AVX-LABEL: tryset:
diff --git a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
index 85afa83e3f08e..d74d195439bda 100644
--- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -1,16 +1,16 @@
; Intel chips with slow unaligned memory accesses
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=core2 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=penryn 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefixes=SLOW
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=core2 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=penryn 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
; Intel chips with fast unaligned memory accesses
@@ -26,15 +26,15 @@
; AMD chips with slow unaligned memory accesses
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-fx 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-fx 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
; AMD chips with fast unaligned memory accesses
@@ -67,26 +67,40 @@
; SLOW-NOT: not a recognized processor
; FAST-NOT: not a recognized processor
define void @store_zeros(ptr %a) {
-; SLOW-LABEL: store_zeros:
-; SLOW: # %bb.0:
-; SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NEXT: movl $0
-; SLOW-NOT: movl
+; SLOW-SCALAR-LABEL: store_zeros:
+; SLOW-SCALAR: # %bb.0:
+; SLOW-SCALAR-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NEXT: movl $0
+; SLOW-SCALAR-NOT: movl
+;
+; SLOW-SSE-LABEL: store_zeros:
+; SLOW-SSE: # %bb.0:
+; SLOW-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SLOW-SSE-NEXT: xorps %xmm0, %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NEXT: movsd %xmm0
+; SLOW-SSE-NOT: movsd
;
; FAST-SSE-LABEL: store_zeros:
; FAST-SSE: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/zero-remat.ll b/llvm/test/CodeGen/X86/zero-remat.ll
index 60bb2c420cda4..000e0d14b711f 100644
--- a/llvm/test/CodeGen/X86/zero-remat.ll
+++ b/llvm/test/CodeGen/X86/zero-remat.ll
@@ -19,11 +19,12 @@ define double @foo() nounwind {
;
; CHECK-32-LABEL: foo:
; CHECK-32: # %bb.0:
-; CHECK-32-NEXT: pushl $0
-; CHECK-32-NEXT: pushl $0
+; CHECK-32-NEXT: subl $8, %esp
+; CHECK-32-NEXT: fldz
+; CHECK-32-NEXT: fstpl (%esp)
; CHECK-32-NEXT: calll bar@PLT
-; CHECK-32-NEXT: addl $8, %esp
; CHECK-32-NEXT: fldz
+; CHECK-32-NEXT: addl $8, %esp
; CHECK-32-NEXT: retl
call void @bar(double 0.0)
ret double 0.0
Force-pushed e6adfbc to bbd9cde
LGTM
if (ST->isSimple() && !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64) &&
    TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
Maybe add this as the last condition?
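A sketch of the reordering being suggested (hypothetical; the operand names follow the quoted snippet, and only the order of the checks changes):

```cpp
// Hedged sketch: run the cheap ST->isSimple() and i32 store-legality checks
// first, and query isFPImmLegal last.
if (ST->isSimple() &&
    TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32) &&
    !TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64)) {
  // ... existing path that splits the f64 constant store into i32 stores ...
}
```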
If the target can generate a specific fp immediate constant, then don't split the store into 2 x i32 stores. Another cleanup step for llvm#74304.
Force-pushed bbd9cde to a6da0b9
If the target can generate a specific fp immediate constant, then don't split the store into 2 x i32 stores.
Another cleanup step for #74304
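For readers skimming the diff, a minimal, hedged sketch of the decision this patch changes is shown below. The helper name shouldSplitF64ConstantStore is hypothetical; the real logic lives inline in DAGCombiner::replaceStoreOfFPConstant and SelectionDAGLegalize::OptimizeFloatStore, and only the isFPImmLegal guard is new.

```cpp
// Hedged sketch, not the committed code: should an f64 constant store be
// rewritten as two i32 stores of the split bit pattern?
#include "llvm/CodeGen/SelectionDAGNodes.h" // StoreSDNode, ConstantFPSDNode, ISD::STORE, MVT
#include "llvm/CodeGen/TargetLowering.h"    // TargetLowering

using namespace llvm;

static bool shouldSplitF64ConstantStore(const TargetLowering &TLI,
                                        const StoreSDNode *ST,
                                        const ConstantFPSDNode *CFP) {
  // New guard from this patch: if the target can materialize the immediate
  // directly (e.g. ARM VFP vmov.f64, x87 fld1/fldz, MIPS mtc1/mthc1 of
  // zero, as seen in the updated tests), keep the single f64 store.
  if (TLI.isFPImmLegal(CFP->getValueAPF(), MVT::f64))
    return false;
  // Pre-existing conditions: only simple stores, and only when the target
  // can actually perform i32 stores.
  return ST->isSimple() &&
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32);
}
```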