[X86] Combine LRINT/LLRINT and TRUNC when TRUNC has nsw flag #126217

Closed
wants to merge 5 commits into from

Conversation

phoebewang
Contributor

Try to improve performance after #125848

@llvmbot
Member

llvmbot commented Feb 7, 2025

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Try to improve performance after #125848


Full diff: https://github.com/llvm/llvm-project/pull/126217.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+5)
  • (modified) llvm/test/CodeGen/X86/llrint-conv.ll (+88)
  • (modified) llvm/test/CodeGen/X86/lrint-conv-i64.ll (+32)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb2102..182cdd90c9d680a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -53919,6 +53919,11 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
   }
 
+  if ((N->getFlags().hasNoUnsignedWrap() || N->getFlags().hasNoSignedWrap()) &&
+      (Src.getOpcode() == ISD::LRINT || Src.getOpcode() == ISD::LLRINT) &&
+      VT.getScalarType() == MVT::i32 && Src.hasOneUse())
+    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/llrint-conv.ll b/llvm/test/CodeGen/X86/llrint-conv.ll
index 402daf80a15e873..2f69824c5f61573 100644
--- a/llvm/test/CodeGen/X86/llrint-conv.ll
+++ b/llvm/test/CodeGen/X86/llrint-conv.ll
@@ -183,6 +183,94 @@ entry:
   ret i64 %0
 }
 
+define i32 @combine_f32_trunc(float %x) nounwind {
+; SSE-LABEL: combine_trunc:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    cvtss2si %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_trunc:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvtss2si %xmm0, %eax
+; AVX-NEXT:    retq
+; X86-NOSSE-LABEL: combine_f32_trunc:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fistpl (%esp)
+; X86-NOSSE-NEXT:    movl (%esp), %eax
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: combine_f32_trunc:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    cvtss2si {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: combine_f32_trunc:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    vcvtss2si {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: combine_f32_trunc:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    cvtss2si %xmm0, %eax
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: combine_f32_trunc:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    vcvtss2si %xmm0, %eax
+; X64-AVX-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.llrint.f32(float %x)
+  %1 = trunc nsw i64 %0 to i32
+  ret i32 %1
+}
+
+define i32 @combine_f64_trunc(double %x) nounwind {
+; SSE-LABEL: combine_trunc:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    cvtss2si %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_trunc:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvtss2si %xmm0, %eax
+; AVX-NEXT:    retq
+; X86-NOSSE-LABEL: combine_f64_trunc:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fistpl (%esp)
+; X86-NOSSE-NEXT:    movl (%esp), %eax
+; X86-NOSSE-NEXT:    popl %ecx
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: combine_f64_trunc:
+; X86-SSE2:       # %bb.0: # %entry
+; X86-SSE2-NEXT:    cvtsd2si {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-AVX-LABEL: combine_f64_trunc:
+; X86-AVX:       # %bb.0: # %entry
+; X86-AVX-NEXT:    vcvtsd2si {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: combine_f64_trunc:
+; X64-SSE:       # %bb.0: # %entry
+; X64-SSE-NEXT:    cvtsd2si %xmm0, %eax
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: combine_f64_trunc:
+; X64-AVX:       # %bb.0: # %entry
+; X64-AVX-NEXT:    vcvtsd2si %xmm0, %eax
+; X64-AVX-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.llrint.f64(double %x)
+  %1 = trunc nuw i64 %0 to i32
+  ret i32 %1
+}
+
 declare i64 @llvm.llrint.f32(float) nounwind readnone
 declare i64 @llvm.llrint.f64(double) nounwind readnone
 declare i64 @llvm.llrint.f80(x86_fp80) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/lrint-conv-i64.ll b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
index 38fa09085e1898d..6b9acc02ad7c983 100644
--- a/llvm/test/CodeGen/X86/lrint-conv-i64.ll
+++ b/llvm/test/CodeGen/X86/lrint-conv-i64.ll
@@ -63,6 +63,38 @@ entry:
   ret i32 %1
 }
 
+define i32 @combine_f32_trunc(float %x) {
+; SSE-LABEL: combine_f32_trunc:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    cvtss2si %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_f32_trunc:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvtss2si %xmm0, %eax
+; AVX-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.lrint.i64.f32(float %x)
+  %1 = trunc nuw i64 %0 to i32
+  ret i32 %1
+}
+
+define i32 @combine_f64_trunc(double %x) {
+; SSE-LABEL: combine_f64_trunc:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    cvtsd2si %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_f64_trunc:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvtsd2si %xmm0, %eax
+; AVX-NEXT:    retq
+entry:
+  %0 = tail call i64 @llvm.lrint.i64.f64(double %x)
+  %1 = trunc nsw i64 %0 to i32
+  ret i32 %1
+}
+
 declare i64 @llvm.lrint.i64.f32(float) nounwind readnone
 declare i64 @llvm.lrint.i64.f64(double) nounwind readnone
 declare i64 @llvm.lrint.i64.f80(x86_fp80) nounwind readnone

; AVX-LABEL: combine_trunc:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtss2si %xmm0, %eax
; AVX-NEXT: retq
Collaborator

Cleanup: fix the check prefixes.

Contributor Author

Oh, I copy-and-pasted them. Done, thanks!

@topperc
Collaborator

topperc commented Feb 7, 2025

Do you have examples where we are able to infer an nuw/nsw flag for a truncate after lrint?

@andykaylor
Contributor

Do you have examples where we are able to infer an nuw/nsw flag for a truncate after lrint?

I was wondering that too. It doesn't look like clang ever sets these flags on trunc, though I suppose some other frontend might.

I was also wondering if it would be reasonable to apply this transformation in InstCombine -- for instance, convert llvm.lrint.i64.f64+(nsw/nuw)trunc to llvm.rint.i32? That's assuming we have reason to believe that this is useful.
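(For illustration, a minimal sketch of what such an InstCombine rewrite might look like, assuming the intended narrow form is the i32 overload of lrint; the IR below is illustrative and not part of this patch:)

  %r = tail call i64 @llvm.lrint.i64.f64(double %x)
  %t = trunc nsw i64 %r to i32

; would be rewritten, under the no-overflow assumption carried by nsw, to

  %t = tail call i32 @llvm.lrint.i32.f64(double %x)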

@phoebewang
Contributor Author

Do you have examples where we are able to infer an nuw/nsw flag for a truncate after lrint?

Not yet. The idea was inspired by the discussions from @andykaylor and @jcranmer-intel in #125324, i.e., we need a flag that works for the no-overflow purpose, and the nuw/nsw flags on trunc look like the best solution to me. But I don't think the flags can be inferred by the compiler unless, e.g., we have range data bound to the trunc or lrint instruction. The expected scenario is that the user sets it explicitly with some FE help.

For now, I only have some draft ideas, like generating the nuw/nsw flags when the user requests it through a Clang option. That may bring unexpected results if we apply it globally, so another idea is to let the user specify it through a builtin like __builtin_assume for only the cases they want, but I don't know if that's easy to implement. The last idea is to provide a new builtin, e.g., __builtin_trunc_nuw/nsw, for this purpose. That is the easiest one to implement.

In a word, I'm not worried about the user scenario, and it looks reasonable to me from the backend's perspective.

I was also wondering if it would be reasonable to apply this transformation in InstCombine -- for instance, convert llvm.lrint.i64.f64+(nsw/nuw)trunc to llvm.rint.i32? That's assuming we have reason to believe that this is useful.

The concern with putting it in the middle end is that we don't know whether a target prefers a smaller-size lrint, and which size it prefers the most. I'm afraid the backend may generate suboptimal code if we arbitrarily combine to llvm.rint.i16 or llvm.rint.i8.

  if ((N->getFlags().hasNoUnsignedWrap() || N->getFlags().hasNoSignedWrap()) &&
      (Src.getOpcode() == ISD::LRINT || Src.getOpcode() == ISD::LLRINT) &&
      VT.getScalarType() == MVT::i32 && Src.hasOneUse())
    return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
Collaborator

Do we need to check SSE and not soft float?

Collaborator

No SSE might be OK, falling back to x87, but soft float may result in calling the lrint/llrint library function and expecting an i32 result when the result is really i64.

Contributor Author

I think it's OK. It is already UB if the high 32 bits are not 0 when nuw/nsw is set.

Collaborator

The call lowering code will read from a 32-bit register instead of the real 64-bit register that is written. The upper bits of the 64-bit value could be all 1s if the value is negative. If the i32 result value is used by a zero extend, SelectionDAG will incorrectly remove the zero extend because it thinks the call wrote a 32-bit register which would automatically zero the upper bits. But since the call really wrote 64 bits this would be wrong.
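(Roughly, the hazard described above corresponds to a soft-float pattern like the following; this is a sketch, not code taken from the patch:)

define i64 @zext_of_truncated_llrint(float %x) nounwind {
entry:
  %0 = tail call i64 @llvm.llrint.f32(float %x) ; lowers to an llrint libcall that really writes all 64 bits
  %1 = trunc nsw i64 %0 to i32
  %2 = zext i32 %1 to i64 ; the upper 32 bits must still be explicitly zeroed
  ret i64 %2
}

declare i64 @llvm.llrint.f32(float)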

Contributor Author

Fortunately, SelectionDAG combines zext + trunc first, so we end up generating the instruction that zeroes the upper bits. Added zero_upperbits_softfloat as a regression test.

Collaborator

What if the zext becomes adjacent to the trunc after some other combine or legalization step? There's no guarantee the trunc hasn't already been combined with the lrint.

Contributor Author

Makes sense, done.

@topperc
Collaborator

topperc commented Feb 9, 2025

@phoebewang have you looked into compiling (int)rintf(x) to cvtss2si?

@phoebewang phoebewang changed the title [X86] Combine LRINT/LLRINT and TRUNC when nuw/nsw [X86] Combine LRINT/LLRINT and TRUNC when it has nsw flag Feb 10, 2025
@phoebewang phoebewang changed the title [X86] Combine LRINT/LLRINT and TRUNC when it has nsw flag [X86] Combine LRINT/LLRINT and TRUNC when TRUNC has nsw flag Feb 10, 2025
@phoebewang
Contributor Author

@phoebewang have you looked into compiling (int)rintf(x) to cvtss2si?

Do you mean the FE proposals I posted before? No, that's not on my priority list. But I don't think we need to wait for it; there are many nsw/nuw trunc transformations if you search the tests.

@topperc
Collaborator

topperc commented Feb 10, 2025

@phoebewang have you looked into compiling (int)rintf(x) to cvtss2si?

Do you mean the FE proposals I posted before? No, that's not on my priority list. But I don't think we need to wait for it; there are many nsw/nuw trunc transformations if you search the tests.

(int)rintf(x) compiles to llvm.rintf+fptosi. The fptosi has undefined behavior if the fp value doesn't fit into an integer. This idiom is something that could be optimized without needing any new frontend support. Maybe it's an alternative sequence we could propose to users instead of trying to make this lrint+trunc work?
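(For reference, (int)rintf(x) lowers to roughly the following IR; a sketch, assuming rintf is recognized as the llvm.rint intrinsic, e.g. under -fno-math-errno:)

define i32 @cast_rintf(float %x) nounwind {
entry:
  %0 = tail call float @llvm.rint.f32(float %x)
  %1 = fptosi float %0 to i32 ; UB if the rounded value does not fit in i32, which licenses cvtss2si
  ret i32 %1
}

declare float @llvm.rint.f32(float)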

phoebewang added a commit to phoebewang/llvm-project that referenced this pull request Feb 10, 2025
@phoebewang
Contributor Author

@phoebewang have you looked into compiling (int)rintf(x) to cvtss2si?

Do you mean the FE proposals I posted before? No, that's not on my priority list. But I don't think we need to wait for it; there are many nsw/nuw trunc transformations if you search the tests.

(int)rintf(x) compiles to llvm.rintf+fptosi. The fptosi has undefined behavior if the fp value doesn't fit into an integer. This idiom is something that could be optimized without needing any new frontend support. Maybe it's an alternative sequence we could propose to users instead of trying to make this lrint+trunc work?

Good point, thanks! #126477

@andykaylor
Contributor

I was also wondering if it would be reasonable to apply this transformation in InstCombine -- for instance, convert llvm.lrint.i64.f64+(nsw/nuw)trunc to llvm.rint.i32? That's assuming we have reason to believe that this is useful.

The concern with putting it in the middle end is that we don't know whether a target prefers a smaller-size lrint, and which size it prefers the most. I'm afraid the backend may generate suboptimal code if we arbitrarily combine to llvm.rint.i16 or llvm.rint.i8.

We could add TTI functions to get additional information if needed. Your suggestion that this could potentially enable better vectorization is the reason I think it would be beneficial to have this in InstCombine. If we leave it to codegen, it may be too late for vectorization. That is, the presence of the trunc instruction may cause the vectorizer to give up.

@phoebewang
Contributor Author

I was also wondering if it would be reasonable to apply this transformation in InstCombine -- for instance, convert llvm.lrint.i64.f64+(nsw/nuw)trunc to llvm.rint.i32? That's assuming we have reason to believe that this is useful.

The concern with putting it in the middle end is that we don't know whether a target prefers a smaller-size lrint, and which size it prefers the most. I'm afraid the backend may generate suboptimal code if we arbitrarily combine to llvm.rint.i16 or llvm.rint.i8.

We could add TTI functions to get additional information if needed. Your suggestion that this could potentially enable better vectorization is the reason I think it would be beneficial to have this in InstCombine. If we leave it to codegen, it may be too late for vectorization. That is, the presence of the trunc instruction may cause the vectorizer to give up.

Vectorization is a good point, and I agree we probably fail to vectorize llvm.lrint.i64.fxx due to the cost (we don't have an LRINT cost model for now, but the vector cost is high for pre-AVX512DQ targets if we add one). That makes the solution in #126477 more valuable, because the vector cost of both FRINT and FP_TO_SINT is low, at least for SSE4.1 and later, though we haven't modeled them either.

@phoebewang
Contributor Author

phoebewang commented Feb 11, 2025

Let's pursue #126477.
