-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SDAG][NVPTX] Add TLI check for preferring custom FP_TO_SINT operations to FP_TO_UINT #132470
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s | ||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s | ||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s | ||
|
||
define i8 @cvt_u8_f32(float %x) { | ||
; CHECK-LABEL: cvt_u8_f32( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .b32 %r<2>; | ||
; CHECK-NEXT: .reg .f32 %f<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_u8_f32_param_0]; | ||
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1; | ||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1; | ||
; CHECK-NEXT: ret; | ||
%a = fptoui float %x to i8 | ||
ret i8 %a | ||
} | ||
|
||
define i8 @cvt_u8_f64(double %x) { | ||
; CHECK-LABEL: cvt_u8_f64( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .b32 %r<2>; | ||
; CHECK-NEXT: .reg .f64 %fd<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_u8_f64_param_0]; | ||
; CHECK-NEXT: cvt.rzi.u16.f64 %rs1, %fd1; | ||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1; | ||
; CHECK-NEXT: ret; | ||
%a = fptoui double %x to i8 | ||
ret i8 %a | ||
} | ||
|
||
define float @cvt_f32_i8(i8 %x) { | ||
; CHECK-LABEL: cvt_f32_i8( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .f32 %f<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f32_i8_param_0]; | ||
; CHECK-NEXT: cvt.rn.f32.u16 %f1, %rs1; | ||
; CHECK-NEXT: st.param.f32 [func_retval0], %f1; | ||
; CHECK-NEXT: ret; | ||
%a = uitofp i8 %x to float | ||
ret float %a | ||
} | ||
|
||
define double @cvt_f64_i8(i8 %x) { | ||
; CHECK-LABEL: cvt_f64_i8( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .f64 %fd<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f64_i8_param_0]; | ||
; CHECK-NEXT: cvt.rn.f64.u16 %fd1, %rs1; | ||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1; | ||
; CHECK-NEXT: ret; | ||
%a = uitofp i8 %x to double | ||
ret double %a | ||
} | ||
|
||
define float @cvt_f32_s8(i8 %x) { | ||
; CHECK-LABEL: cvt_f32_s8( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .f32 %f<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f32_s8_param_0]; | ||
; CHECK-NEXT: cvt.rn.f32.s16 %f1, %rs1; | ||
; CHECK-NEXT: st.param.f32 [func_retval0], %f1; | ||
; CHECK-NEXT: ret; | ||
%a = sitofp i8 %x to float | ||
ret float %a | ||
} | ||
|
||
define double @cvt_f64_s8(i8 %x) { | ||
; CHECK-LABEL: cvt_f64_s8( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .f64 %fd<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f64_s8_param_0]; | ||
; CHECK-NEXT: cvt.rn.f64.s16 %fd1, %rs1; | ||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1; | ||
; CHECK-NEXT: ret; | ||
%a = sitofp i8 %x to double | ||
ret double %a | ||
} | ||
|
||
define i8 @cvt_s8_f32(float %x) { | ||
; CHECK-LABEL: cvt_s8_f32( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .b32 %r<3>; | ||
; CHECK-NEXT: .reg .f32 %f<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_s8_f32_param_0]; | ||
; CHECK-NEXT: cvt.rzi.s16.f32 %rs1, %f1; | ||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; | ||
; CHECK-NEXT: and.b32 %r2, %r1, 255; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2; | ||
; CHECK-NEXT: ret; | ||
%a = fptosi float %x to i8 | ||
ret i8 %a | ||
} | ||
|
||
define i8 @cvt_s8_f64(double %x) { | ||
; CHECK-LABEL: cvt_s8_f64( | ||
; CHECK: { | ||
; CHECK-NEXT: .reg .b16 %rs<2>; | ||
; CHECK-NEXT: .reg .b32 %r<3>; | ||
; CHECK-NEXT: .reg .f64 %fd<2>; | ||
; CHECK-EMPTY: | ||
; CHECK-NEXT: // %bb.0: | ||
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_s8_f64_param_0]; | ||
; CHECK-NEXT: cvt.rzi.s16.f64 %rs1, %fd1; | ||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; | ||
; CHECK-NEXT: and.b32 %r2, %r1, 255; | ||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2; | ||
; CHECK-NEXT: ret; | ||
%a = fptosi double %x to i8 | ||
ret i8 %a | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does this change buy us?
It appears that on this test LLVM currently generates the same output (AFAICT): https://godbolt.org/z/svM1Kns79
It's possible that I've missed something. Can you point me at the suboptimal code LLVM generates now?
Perhaps we just need a few more test cases to cover the changes introduced by the patch..
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The first two test cases should have different PTX with the change.
If we look at line 21 in the PTX of that godbolt link:
cvt.rzi.s16.f32 %rs1, %f1;
Compare that to the expected output in the test, line 10:
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
The difference is the s16 vs u16. Current LLVM is generating a signed cvt instruction for what should be an unsigned value.
You can see the SASS differences here: https://godbolt.org/z/GjqYzejPz
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it matter in practice?
The change affects only conversions to
u8
and given that we always actually convert to the 16-bit integer, for the valid range of inputs[0f .. 255f]
, the low 8 bits of the result will be correct regardless of whether we used signed or unsigned conversion. For the inputs outside of the valid input range LLVM IR considers results to bepoison
.If the goal is to have upper bits of the result always be 0, even for out of range inputs, then this patch is insufficient. It will still fill in upper bits for some out-of range input values, because we're converting to a 16-bit int. E.g converting
65535.0
will give us0xffff
. If all-zero MSBs are indeed the goal here, we'd need to explicitly mask the low 8 bits after conversion, instead. Considering that this code has been in place in the current shape for a very long time, I do not think this is necessary in practice.Perhaps I'm still missing something. Can you elaborate on what motivates this change and what exactly is the issue it is intended to solve. Simply using PTX instruction with a better matching name but no effect on the valid inputs is not worth plumbing special case into generic LLVM code. IMO.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I generally agree with your reasoning as to why this seems irrelevant on the PTX level. However...
Can you provide your opinion on the SASS diff I linked?
Without change:
With change:
It's definitely something of a corner case, but it does seem better. The biggest thing for me is that this implies that the signed/unsigned distinction is relevant for ptxas and is it not just a better name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There's obviously one less instruction. However, considering that float to
u8
conversion happens approximately never, and that even when it does, it would be dwarfed by whatever operations we perform in FP, I'd be really surprised if you were to see any measurable differences on anything other than an artificial benchmark targeting just this instruction.On the other hand, it adds non-zero complexity to LLVM which will need to be maintained ~forever. Considering near-zero benefit vs non-zero maintenance cost, I'd say it's not worth it. The impact is minor either way, so if the change could be plausibly used for other targets, I'd be fine with it.
@arsenm -- would AMDGPU benefit from the distinction in signedness in FP-to-int conversions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the cases that require larger expansions, unsigned is always preferable