[X86] Combine LRINT/LLRINT and TRUNC when TRUNC has nsw flag #126217
Conversation
Try to improve performance after llvm#125848
@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes: Try to improve performance after #125848

Full diff: https://github.com/llvm/llvm-project/pull/126217.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb2102..182cdd90c9d680a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53919,6 +53919,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
+ if ((N->getFlags().hasNoUnsignedWrap() || N->getFlags().hasNoSignedWrap()) &&
+ (Src.getOpcode() == ISD::LRINT || Src.getOpcode() == ISD::LLRINT) &&
+ VT.getScalarType() == MVT::i32 && Src.hasOneUse())
+ return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/X86/llrint-conv.ll b/llvm/test/CodeGen/X86/llrint-conv.ll
index 402daf80a15e873..2f69824c5f61573 100644
--- a/llvm/test/CodeGen/X86/llrint-conv.ll
+++ b/llvm/test/CodeGen/X86/llrint-conv.ll
@@ -183,6 +183,94 @@ entry:
ret i64 %0
}
+define i32 @combine_f32_trunc(float %x) nounwind {
+; SSE-LABEL: combine_trunc:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_trunc:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: retq
+; X86-NOSSE-LABEL: combine_f32_trunc:
+; X86-NOSSE: # %bb.0: # %entry
+; X86-NOSSE-NEXT: pushl %eax
+; X86-NOSSE-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fistpl (%esp)
+; X86-NOSSE-NEXT: movl (%esp), %eax
+; X86-NOSSE-NEXT: popl %ecx
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: combine_f32_trunc:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: cvtss2si {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: combine_f32_trunc:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: vcvtss2si {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: combine_f32_trunc:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: cvtss2si %xmm0, %eax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: combine_f32_trunc:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: vcvtss2si %xmm0, %eax
+; X64-AVX-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.llrint.f32(float %x)
+ %1 = trunc nsw i64 %0 to i32
+ ret i32 %1
+}
+
+define i32 @combine_f64_trunc(double %x) nounwind {
+; SSE-LABEL: combine_trunc:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_trunc:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: retq
+; X86-NOSSE-LABEL: combine_f64_trunc:
+; X86-NOSSE: # %bb.0: # %entry
+; X86-NOSSE-NEXT: pushl %eax
+; X86-NOSSE-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: fistpl (%esp)
+; X86-NOSSE-NEXT: movl (%esp), %eax
+; X86-NOSSE-NEXT: popl %ecx
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: combine_f64_trunc:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: cvtsd2si {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-AVX-LABEL: combine_f64_trunc:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: vcvtsd2si {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: combine_f64_trunc:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: cvtsd2si %xmm0, %eax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: combine_f64_trunc:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: vcvtsd2si %xmm0, %eax
+; X64-AVX-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.llrint.f64(double %x)
+ %1 = trunc nuw i64 %0 to i32
+ ret i32 %1
+}
+
declare i64 @llvm.llrint.f32(float) nounwind readnone
declare i64 @llvm.llrint.f64(double) nounwind readnone
declare i64 @llvm.llrint.f80(x86_fp80) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i64.ll b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
index 38fa09085e1898d..6b9acc02ad7c983 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
@@ -63,6 +63,38 @@ entry:
ret i32 %1
}
+define i32 @combine_f32_trunc(float %x) {
+; SSE-LABEL: combine_f32_trunc:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtss2si %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_f32_trunc:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtss2si %xmm0, %eax
+; AVX-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.lrint.i64.f32(float %x)
+ %1 = trunc nuw i64 %0 to i32
+ ret i32 %1
+}
+
+define i32 @combine_f64_trunc(double %x) {
+; SSE-LABEL: combine_f64_trunc:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtsd2si %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_f64_trunc:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsd2si %xmm0, %eax
+; AVX-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.lrint.i64.f64(double %x)
+ %1 = trunc nsw i64 %0 to i32
+ ret i32 %1
+}
+
declare i64 @llvm.lrint.i64.f32(float) nounwind readnone
declare i64 @llvm.lrint.i64.f64(double) nounwind readnone
declare i64 @llvm.lrint.i64.f80(x86_fp80) nounwind readnone
Review thread on llvm/test/CodeGen/X86/llrint-conv.ll (outdated):
; AVX-LABEL: combine_trunc:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtss2si %xmm0, %eax
; AVX-NEXT: retq
cleanup: fix check prefixes
Oh, I copy&pasted them. Done, thanks!
Do you have examples where we are able to infer a nuw/nsw flag for a truncate after lrint?
I was wondering that too. It doesn't look like clang ever sets these flags on trunc, though I suppose some other frontend might. I was also wondering if it would be reasonable to apply this transformation in InstCombine -- for instance, converting a flagged trunc of llvm.llrint.i64 directly into llvm.lrint.i32 (sketched below).
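A rough sketch of what such a fold could look like -- hypothetical, not part of this patch, and assuming the 32-bit overload of the lrint intrinsic is acceptable to the target:

; Before: 64-bit rounding intrinsic followed by a flagged trunc.
define i32 @src(float %x) {
entry:
  %r = tail call i64 @llvm.llrint.i64.f32(float %x)
  %t = trunc nsw i64 %r to i32
  ret i32 %t
}

; After: fold to the 32-bit overload; nsw asserted the value fits.
define i32 @tgt(float %x) {
entry:
  %t = tail call i32 @llvm.lrint.i32.f32(float %x)
  ret i32 %t
}

declare i64 @llvm.llrint.i64.f32(float)
declare i32 @llvm.lrint.i32.f32(float)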
Not yet. The idea was inspired by the discussions from @andykaylor and @jcranmer-intel in #125324, i.e., we need a flag that works for this purpose.

For now, I only have some draft ideas, like generating nuw/nsw flags when users specify it through a Clang option. This may bring unexpected results if we apply it globally, so I also have an idea that users could specify it through a builtin like __builtin_assume for only those cases they want. But I don't know if that's easy to implement. The last idea is to provide a new builtin, e.g., __builtin_trunc_nuw/nsw, for the purpose. That is the easiest to implement.

In short, I'm not worried about the user scenario, and it looks reasonable to me from the backend's perspective.
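Whichever frontend route is taken, the IR it would need to emit is just the flagged trunc this patch keys on (the builtin names above are proposals, not existing Clang builtins):

define i32 @flagged(float %x) {
entry:
  %r = tail call i64 @llvm.llrint.f32(float %x)
  ; nuw/nsw assert the 64-bit result fits in i32, which is what
  ; licenses the backend fold to a single 32-bit cvt instruction.
  %t = trunc nsw i64 %r to i32
  ret i32 %t
}

declare i64 @llvm.llrint.f32(float) nounwind readnone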
The concern with putting it in the middle end is that we don't know whether a target prefers a smaller-size lrint, and which size it prefers most. I'm afraid the backend may generate suboptimal code if we arbitrarily combine to i32.
Review thread on llvm/lib/Target/X86/X86ISelLowering.cpp:

if ((N->getFlags().hasNoUnsignedWrap() || N->getFlags().hasNoSignedWrap()) &&
    (Src.getOpcode() == ISD::LRINT || Src.getOpcode() == ISD::LLRINT) &&
    VT.getScalarType() == MVT::i32 && Src.hasOneUse())
  return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
Do we need to check SSE and not soft float?
No SSE might be OK, falling back to x87, but soft float may result in calling the lrint/llrint library function expecting an i32 result when the result is really i64.
I think it's OK. It is already UB if the high 32 bits are not 0 when nuw/nsw is set.
The call lowering code will read from a 32-bit register instead of the real 64-bit register that is written. The upper bits of the 64-bit value could be all 1s if the value is negative. If the i32 result value is used by a zero extend, SelectionDAG will incorrectly remove the zero extend because it thinks the call wrote a 32-bit register which would automatically zero the upper bits. But since the call really wrote 64 bits this would be wrong.
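A rough sketch of the hazard on a hypothetical soft-float target, after the combine has narrowed the libcall's declared result to i32:

define i64 @hazard(float %x) {
entry:
  ; Lowering now emits the libcall as if it returned i32, but the
  ; real lrintf writes a full 64-bit result on an LP64 target.
  %t = call i32 @lrintf(float %x)
  ; SelectionDAG may delete this zext, assuming the 32-bit register
  ; write already zeroed the upper bits; for a negative result the
  ; callee actually left the upper 32 bits all ones.
  %z = zext i32 %t to i64
  ret i64 %z
}

declare i32 @lrintf(float)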
Fortunately, SelectionDAG combines zext + trunc first, so we end up generating the zero-upper-bits instruction. Added a zero_upperbits_softfloat regression test.
What if the zext becomes adjacent to the trunc after some other combine or legalization step? There's no guarantee the trunc hasn't already been combined with the lrint.
Makes sense, done.
@phoebewang have you looked into compiling …
Do you mean the FE proposals I posted before? No, it's not on my priority list. But I don't think we need to wait for it; there are many nsw/nuw trunc transformations if you search in the tests.
(int)rintf(x) compiles to llvm.rintf+fptosi. The fptosi has undefined behavior if the fp value doesn't fit into an integer. This idiom is something that could be optimized without needing any new frontend support. Maybe it's an alternative sequence we could propose to users instead of trying to make this lrint+trunc work?
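In IR terms, the suggested idiom looks roughly like this (what clang emits for (int)rintf(x), modulo fast-math and errno settings):

define i32 @rint_to_int(float %x) {
entry:
  %r = tail call float @llvm.rint.f32(float %x) ; round to integral in the current mode
  %i = fptosi float %r to i32                   ; poison if the result doesn't fit in i32
  ret i32 %i
}

declare float @llvm.rint.f32(float)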
Based on Craig's suggestion on llvm#126217
Good point, thanks! #126477
We could add TTI functions to get additional information if needed. Your suggestion that this could potentially enable better vectorization is the reason I think it would be beneficial to have this in InstCombine. If we leave it to codegen, it may be too late for vectorization. That is, the presence of the trunc instruction may cause the vectorizer to give up.
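For illustration, a minimal loop of the kind at stake -- a sketch, assuming source roughly like out[i] = (int)lrintf(in[i]) and n > 0:

define void @convert(ptr %out, ptr %in, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %pin = getelementptr inbounds float, ptr %in, i64 %i
  %v = load float, ptr %pin, align 4
  ; The i64 intermediate plus the trunc is what the vectorizer must
  ; reason about; if it gives up here, codegen never sees the loop.
  %r = tail call i64 @llvm.lrint.i64.f32(float %v)
  %t = trunc i64 %r to i32
  %pout = getelementptr inbounds i32, ptr %out, i64 %i
  store i32 %t, ptr %pout, align 4
  %i.next = add nuw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

declare i64 @llvm.lrint.i64.f32(float)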
Vectorization is a good point, and I agree we probably fail to vectorize …
Let's pursue #126477.
Based on Craig's suggestion on #126217. Alive2: https://alive2.llvm.org/ce/z/9XNpWt
Try to improve performance after #125848