Skip to content

[SDAG][NVPTX] Add TLI check for preferring custom FP_TO_SINT operations to FP_TO_UINT #132470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -3464,6 +3464,34 @@ class TargetLoweringBase {
return false;
}

// Get the preferred opcode for FP_TO_XINT nodes.
// By default, this checks if the provded operation is an illegal FP_TO_UINT
// and if so, checks if FP_TO_SINT is legal or custom for use as a
// replacement. If both UINT and SINT conversions are Custom, we choose SINT
// by default because that's the right thing on PPC.
virtual unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const {
if (isOperationLegal(Op, ToVT))
return Op;
switch (Op) {
case ISD::FP_TO_UINT:
if (isOperationLegalOrCustom(ISD::FP_TO_SINT, ToVT))
return ISD::FP_TO_SINT;
break;
case ISD::STRICT_FP_TO_UINT:
if (isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, ToVT))
return ISD::STRICT_FP_TO_SINT;
break;
case ISD::VP_FP_TO_UINT:
if (isOperationLegalOrCustom(ISD::VP_FP_TO_SINT, ToVT))
return ISD::VP_FP_TO_SINT;
break;
default:
break;
}
return Op;
}

/// Create the IR node for the given complex deinterleaving operation.
/// If one cannot be created using all the given inputs, nullptr should be
/// returned.
Expand Down
22 changes: 2 additions & 20 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -849,28 +849,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {

SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned NewOpc = N->getOpcode();
unsigned NewOpc =
TLI.getPreferredFPToIntOpcode(N->getOpcode(), N->getValueType(0), NVT);
SDLoc dl(N);

// If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
// not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT
// and SINT conversions are Custom, there is no way to tell which is
// preferable. We choose SINT because that's the right thing on PPC.)
if (N->getOpcode() == ISD::FP_TO_UINT &&
!TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
NewOpc = ISD::FP_TO_SINT;

if (N->getOpcode() == ISD::STRICT_FP_TO_UINT &&
!TLI.isOperationLegal(ISD::STRICT_FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT))
NewOpc = ISD::STRICT_FP_TO_SINT;

if (N->getOpcode() == ISD::VP_FP_TO_UINT &&
!TLI.isOperationLegal(ISD::VP_FP_TO_UINT, NVT) &&
TLI.isOperationLegalOrCustom(ISD::VP_FP_TO_SINT, NVT))
NewOpc = ISD::VP_FP_TO_SINT;

SDValue Res;
if (N->isStrictFPOpcode()) {
Res = DAG.getNode(NewOpc, dl, {NVT, MVT::Other},
Expand Down
27 changes: 27 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6214,6 +6214,33 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
return nullptr;
}

// Rather than default to SINT when both UINT and SINT are custom, we only
// change the opcode when UINT is not legal and SINT is. UINT is preferred when
// both are custom since unsigned CVT instructions can lead to slightly better
// SASS code with fewer instructions.
unsigned NVPTXTargetLowering::getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const {
if (isOperationLegal(Op, ToVT))
return Op;
switch (Op) {
case ISD::FP_TO_UINT:
if (isOperationLegal(ISD::FP_TO_SINT, ToVT))
return ISD::FP_TO_SINT;
break;
case ISD::STRICT_FP_TO_UINT:
if (isOperationLegal(ISD::STRICT_FP_TO_SINT, ToVT))
return ISD::STRICT_FP_TO_SINT;
break;
case ISD::VP_FP_TO_UINT:
if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
return ISD::VP_FP_TO_SINT;
break;
default:
break;
}
return Op;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,9 @@ class NVPTXTargetLowering : public TargetLowering {
Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
AtomicOrdering Ord) const override;

unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT,
EVT ToVT) const override;

private:
const NVPTXSubtarget &STI; // cache the subtarget here
mutable unsigned GlobalUniqueCallSite;
Expand Down
134 changes: 134 additions & 0 deletions llvm/test/CodeGen/NVPTX/convert-fp-i8.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this change buy us?
It appears that on this test LLVM currently generates the same output (AFAICT): https://godbolt.org/z/svM1Kns79

It's possible that I've missed something. Can you point me at the suboptimal code LLVM generates now?
Perhaps we just need a few more test cases to cover the changes introduced by the patch..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first two test cases should have different PTX with the change.
If we look at line 21 in the PTX of that godbolt link:
cvt.rzi.s16.f32 %rs1, %f1;
Compare that to the expected output in the test, line 10:
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
The difference is the s16 vs u16. Current LLVM is generating a signed cvt instruction for what should be an unsigned value.

You can see the SASS differences here: https://godbolt.org/z/GjqYzejPz

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it matter in practice?

The change affects only conversions to u8 and given that we always actually convert to the 16-bit integer, for the valid range of inputs [0f .. 255f], the low 8 bits of the result will be correct regardless of whether we used signed or unsigned conversion. For the inputs outside of the valid input range LLVM IR considers results to be poison.

If the goal is to have upper bits of the result always be 0, even for out of range inputs, then this patch is insufficient. It will still fill in upper bits for some out-of range input values, because we're converting to a 16-bit int. E.g converting 65535.0 will give us 0xffff. If all-zero MSBs are indeed the goal here, we'd need to explicitly mask the low 8 bits after conversion, instead. Considering that this code has been in place in the current shape for a very long time, I do not think this is necessary in practice.

Perhaps I'm still missing something. Can you elaborate on what motivates this change and what exactly is the issue it is intended to solve. Simply using PTX instruction with a better matching name but no effect on the valid inputs is not worth plumbing special case into generic LLVM code. IMO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I generally agree with your reasoning as to why this seems irrelevant on the PTX level. However...

Does it matter in practice?

Can you provide your opinion on the SASS diff I linked?
Without change:

cvt_u8_f32:
 F2I.S16.TRUNC.NTZ R4, R4 
 LOP3.LUT R4, R4, 0xffff, RZ, 0xc0, !PT 
 RET.ABS.NODEC R20 0x0

With change:

cvt_u8_f32:
 F2I.U16.TRUNC.NTZ R4, R4 
 RET.ABS.NODEC R20 0x0

It's definitely something of a corner case, but it does seem better. The biggest thing for me is that this implies that the signed/unsigned distinction is relevant for ptxas and is it not just a better name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide your opinion on the SASS diff I linked?

There's obviously one less instruction. However, considering that float to u8 conversion happens approximately never, and that even when it does, it would be dwarfed by whatever operations we perform in FP, I'd be really surprised if you were to see any measurable differences on anything other than an artificial benchmark targeting just this instruction.

On the other hand, it adds non-zero complexity to LLVM which will need to be maintained ~forever. Considering near-zero benefit vs non-zero maintenance cost, I'd say it's not worth it. The impact is minor either way, so if the change could be plausibly used for other targets, I'd be fine with it.

@arsenm -- would AMDGPU benefit from the distinction in signedness in FP-to-int conversions?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the cases that require larger expansions, unsigned is always preferable

; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s

define i8 @cvt_u8_f32(float %x) {
; CHECK-LABEL: cvt_u8_f32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f32 %f1, [cvt_u8_f32_param_0];
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = fptoui float %x to i8
ret i8 %a
}

define i8 @cvt_u8_f64(double %x) {
; CHECK-LABEL: cvt_u8_f64(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_u8_f64_param_0];
; CHECK-NEXT: cvt.rzi.u16.f64 %rs1, %fd1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = fptoui double %x to i8
ret i8 %a
}

define float @cvt_f32_i8(i8 %x) {
; CHECK-LABEL: cvt_f32_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f32_i8_param_0];
; CHECK-NEXT: cvt.rn.f32.u16 %f1, %rs1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%a = uitofp i8 %x to float
ret float %a
}

define double @cvt_f64_i8(i8 %x) {
; CHECK-LABEL: cvt_f64_i8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f64_i8_param_0];
; CHECK-NEXT: cvt.rn.f64.u16 %fd1, %rs1;
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
; CHECK-NEXT: ret;
%a = uitofp i8 %x to double
ret double %a
}

define float @cvt_f32_s8(i8 %x) {
; CHECK-LABEL: cvt_f32_s8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f32_s8_param_0];
; CHECK-NEXT: cvt.rn.f32.s16 %f1, %rs1;
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
; CHECK-NEXT: ret;
%a = sitofp i8 %x to float
ret float %a
}

define double @cvt_f64_s8(i8 %x) {
; CHECK-LABEL: cvt_f64_s8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f64_s8_param_0];
; CHECK-NEXT: cvt.rn.f64.s16 %fd1, %rs1;
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
; CHECK-NEXT: ret;
%a = sitofp i8 %x to double
ret double %a
}

define i8 @cvt_s8_f32(float %x) {
; CHECK-LABEL: cvt_s8_f32(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f32 %f1, [cvt_s8_f32_param_0];
; CHECK-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: and.b32 %r2, %r1, 255;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = fptosi float %x to i8
ret i8 %a
}

define i8 @cvt_s8_f64(double %x) {
; CHECK-LABEL: cvt_s8_f64(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f64 %fd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_s8_f64_param_0];
; CHECK-NEXT: cvt.rzi.s16.f64 %rs1, %fd1;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
; CHECK-NEXT: and.b32 %r2, %r1, 255;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = fptosi double %x to i8
ret i8 %a
}
Loading