[X86] combineStore - fold scalar float store(fabs/fneg(load())) -> store(and/xor(load(),c)) #118680
Conversation
[X86] combineStore - fold scalar float store(fabs/fneg(load())) -> store(and/xor(load(),c))

As noted on llvm#117557 - it's not worth performing scalar float fabs/fneg on the FPU if we're not doing any other FP ops.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

As noted on #117557 - it's not worth performing scalar float fabs/fneg on the FPU if we're not doing any other FP ops.

This is currently limited to store + load pairs - I could try to extend this further if necessary, but we need to be careful that we don't end up in an infinite loop with the DAGCombiner foldBitcastedFPLogic combine.

Fixes #117557

Full diff: https://github.com/llvm/llvm-project/pull/118680.diff

4 Files Affected:
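In IR terms the fold is easy to picture (a hand-rolled sketch for illustration only - the actual transform runs on SelectionDAG nodes, not on IR, and the function names here are made up):

; Before: the fabs forces a round trip through an FP register.
define void @fabs_store_f32(ptr %src, ptr %dst) {
  %v = load float, ptr %src
  %a = call float @llvm.fabs.f32(float %v)
  store float %a, ptr %dst
  ret void
}
declare float @llvm.fabs.f32(float)

; After (equivalent integer form): fabs clears the sign bit with AND;
; fneg would instead flip it with XOR against 0x80000000.
define void @fabs_store_i32(ptr %src, ptr %dst) {
  %v = load i32, ptr %src
  %a = and i32 %v, 2147483647 ; 0x7FFFFFFF keeps everything but the sign bit
  store i32 %a, ptr %dst
  ret void
}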
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7b46b109de3b09..a23a4b67df5424 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52661,6 +52661,29 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getMemOperand()->getFlags());
}
+ // Convert scalar fabs/fneg load-store to integer equivalents.
+ if ((VT == MVT::f32 || VT == MVT::f64) &&
+ (StoredVal.getOpcode() == ISD::FABS ||
+ StoredVal.getOpcode() == ISD::FNEG) &&
+ ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
+ StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
+ MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
+ if (TLI.isTypeLegal(IntVT)) {
+ APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
+ unsigned SignOp = ISD::XOR;
+ if (StoredVal.getOpcode() == ISD::FABS) {
+ SignMask = ~SignMask;
+ SignOp = ISD::AND;
+ }
+ SDValue LogicOp = DAG.getNode(
+ SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
+ DAG.getConstant(SignMask, dl, IntVT));
+ return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
+ }
+ }
+
// If we are saving a 32-byte vector and 32-byte stores are slow, such as on
// Sandy Bridge, perform two 16-byte stores.
unsigned Fast;
diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll
index d337c7693ff7d6..b9aad9075261b1 100644
--- a/llvm/test/CodeGen/X86/combine-fabs.ll
+++ b/llvm/test/CodeGen/X86/combine-fabs.ll
@@ -135,20 +135,16 @@ define <4 x float> @combine_vec_fabs_fcopysign(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %2
}
-; TODO: store(fabs(load())) - convert scalar to integer
+; store(fabs(load())) - convert scalar to integer
define void @combine_fabs_int_rmw_f64(ptr %ptr) {
; SSE-LABEL: combine_fabs_int_rmw_f64:
; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movlps %xmm0, (%rdi)
+; SSE-NEXT: andb $127, 7(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: combine_fabs_int_rmw_f64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovlps %xmm0, (%rdi)
+; AVX-NEXT: andb $127, 7(%rdi)
; AVX-NEXT: retq
%1 = load double, ptr %ptr
%2 = call double @llvm.fabs.f64(double %1)
@@ -159,17 +155,16 @@ define void @combine_fabs_int_rmw_f64(ptr %ptr) {
define void @combine_fabs_int_f32(ptr %src, ptr %dst) {
; SSE-LABEL: combine_fabs_int_f32:
; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movss %xmm0, (%rsi)
+; SSE-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; SSE-NEXT: andl (%rdi), %eax
+; SSE-NEXT: movl %eax, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: combine_fabs_int_f32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
-; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovss %xmm0, (%rsi)
+; AVX-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
+; AVX-NEXT: andl (%rdi), %eax
+; AVX-NEXT: movl %eax, (%rsi)
; AVX-NEXT: retq
%1 = load float, ptr %src
%2 = call float @llvm.fabs.f32(float %1)
diff --git a/llvm/test/CodeGen/X86/combine-fneg.ll b/llvm/test/CodeGen/X86/combine-fneg.ll
index e8e3465c99383d..855b64229a9c33 100644
--- a/llvm/test/CodeGen/X86/combine-fneg.ll
+++ b/llvm/test/CodeGen/X86/combine-fneg.ll
@@ -205,21 +205,17 @@ define <4 x float> @fneg(<4 x float> %Q) nounwind {
ret <4 x float> %tmp
}
-; TODO: store(fneg(load())) - convert scalar to integer
+; store(fneg(load())) - convert scalar to integer
define void @fneg_int_rmw_f32(ptr %ptr) {
; X86-SSE-LABEL: fneg_int_rmw_f32:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT: movss %xmm0, (%eax)
+; X86-SSE-NEXT: xorb $-128, 3(%eax)
; X86-SSE-NEXT: retl
;
; X64-SSE-LABEL: fneg_int_rmw_f32:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT: movss %xmm0, (%rdi)
+; X64-SSE-NEXT: xorb $-128, 3(%rdi)
; X64-SSE-NEXT: retq
%1 = load float, ptr %ptr
%2 = fneg float %1
@@ -246,19 +242,12 @@ define void @fneg_int_f64(ptr %src, ptr %dst) {
; X86-SSE2-NEXT: movlps %xmm0, (%eax)
; X86-SSE2-NEXT: retl
;
-; X64-SSE1-LABEL: fneg_int_f64:
-; X64-SSE1: # %bb.0:
-; X64-SSE1-NEXT: fldl (%rdi)
-; X64-SSE1-NEXT: fchs
-; X64-SSE1-NEXT: fstpl (%rsi)
-; X64-SSE1-NEXT: retq
-;
-; X64-SSE2-LABEL: fneg_int_f64:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: movlps %xmm0, (%rsi)
-; X64-SSE2-NEXT: retq
+; X64-SSE-LABEL: fneg_int_f64:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; X64-SSE-NEXT: xorq (%rdi), %rax
+; X64-SSE-NEXT: movq %rax, (%rsi)
+; X64-SSE-NEXT: retq
%1 = load double, ptr %src
%2 = fneg double %1
store double %2, ptr %dst
diff --git a/llvm/test/CodeGen/X86/fsxor-alignment.ll b/llvm/test/CodeGen/X86/fsxor-alignment.ll
index 1b9d6c91ad2b23..6fa4a310956dc6 100644
--- a/llvm/test/CodeGen/X86/fsxor-alignment.ll
+++ b/llvm/test/CodeGen/X86/fsxor-alignment.ll
@@ -8,15 +8,16 @@
define void @foo(ptr %p, ptr %q, float %s, float %y) nounwind {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
+; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
-; CHECK-NEXT: xorps %xmm1, %xmm0
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: xorps %xmm1, %xmm2
-; CHECK-NEXT: movss %xmm0, (%ecx)
-; CHECK-NEXT: movss %xmm2, (%eax)
+; CHECK-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT: xorl %edx, %esi
+; CHECK-NEXT: movl %esi, (%ecx)
+; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl %edx, (%eax)
+; CHECK-NEXT: popl %esi
; CHECK-NEXT: retl
%ss = fsub float -0.0, %s
%yy = fsub float -0.0, %y
LGTM.
@@ -52661,6 +52661,29 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getMemOperand()->getFlags());
}
+ // Convert scalar fabs/fneg load-store to integer equivalents.
+ if ((VT == MVT::f32 || VT == MVT::f64) &&
Should we consider more FP types like half, bfloat, fp128?
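If so, a hypothetical f16 test along these lines (sketch only - not covered by this patch, which is limited to f32/f64) would exercise that path:

define void @combine_fabs_int_f16(ptr %src, ptr %dst) {
  %1 = load half, ptr %src
  %2 = call half @llvm.fabs.f16(half %1)
  store half %2, ptr %dst
  ret void
}
declare half @llvm.fabs.f16(half)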