Skip to content

[AMDGPU] Implement 'llvm.get.fpenv' and 'llvm.set.fpenv' #83906

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,9 @@ BUILTIN(__builtin_amdgcn_read_exec_hi, "Ui", "nc")

BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")

BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@spavloff was there supposed to be a generic builtin for these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no builtin as far as I'm aware. I think there might be some pragmas however. Because the intrinsic takes different kinds I figured it was easiest to just specify my use-case. The alternative would be to make this generic and have a LUT by target triple in CGBuiltin that just contains the one entry for AMDGPU.

BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")

//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//
Expand Down
11 changes: 11 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18406,6 +18406,17 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
return Builder.CreateCall(F);
}
case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
{llvm::Type::getInt64Ty(getLLVMContext())});
llvm::Value *Env = EmitScalarExpr(E->getArg(0));
return Builder.CreateCall(F, {Env});
}
case AMDGPU::BI__builtin_amdgcn_read_exec:
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/Sema/SemaChecking.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5307,6 +5307,9 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
// position of memory order and scope arguments in the builtin
unsigned OrderIndex, ScopeIndex;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_get_fpenv:
case AMDGPU::BI__builtin_amdgcn_set_fpenv:
return false;
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
Expand Down
12 changes: 12 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,18 @@ unsigned test_wavefrontsize() {
return __builtin_amdgcn_wavefrontsize();
}

// __builtin_amdgcn_get_fpenv must lower to the generic llvm.get.fpenv
// intrinsic with a 64-bit result.
// CHECK-LABEL: @test_get_fpenv(
unsigned long test_get_fpenv() {
  // CHECK: call i64 @llvm.get.fpenv.i64()
  return __builtin_amdgcn_get_fpenv();
}

// __builtin_amdgcn_set_fpenv must lower to the generic llvm.set.fpenv
// intrinsic taking the 64-bit environment value.
// CHECK-LABEL: @test_set_fpenv(
void test_set_fpenv(unsigned long env) {
  // CHECK: call void @llvm.set.fpenv.i64(i64 %[[ENV:.+]])
  __builtin_amdgcn_set_fpenv(env);
}

// CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
// CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }
7 changes: 7 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,13 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
register do not exactly match the FLT_ROUNDS values,
so a conversion is performed.

:ref:`llvm.get.fpenv<int_get_fpenv>` Returns the current value of the AMDGPU floating point environment.
This stores information related to the current rounding mode,
denormalization mode, enabled traps, and floating point exceptions.
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.

:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.

llvm.amdgcn.wave.reduce.umin Performs an arithmetic unsigned min reduction on the unsigned values
provided by each lane in the wavefront.
Intrinsic takes a hint for reduction strategy using second operand
Expand Down
2 changes: 2 additions & 0 deletions llvm/docs/LangRef.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26432,6 +26432,7 @@ similar to C library function 'fesetround', however this intrinsic does not
return any value and uses platform-independent representation of IEEE rounding
modes.

.. _int_get_fpenv:

'``llvm.get.fpenv``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand All @@ -26455,6 +26456,7 @@ Semantics:
The '``llvm.get.fpenv``' intrinsic reads the current floating-point environment
and returns it as an integer value.

.. _int_set_fpenv:

'``llvm.set.fpenv``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ Changes to the AArch64 Backend
Changes to the AMDGPU Backend
-----------------------------

* Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.

Changes to the ARM Backend
--------------------------

Expand Down
52 changes: 52 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_STACKRESTORE)
.legalFor({PrivatePtr});

getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});

getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));

Expand Down Expand Up @@ -2128,6 +2130,10 @@ bool AMDGPULegalizerInfo::legalizeCustom(
return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
return legalizeStackSave(MI, B);
case TargetOpcode::G_GET_FPENV:
return legalizeGetFPEnv(MI, MRI, B);
case TargetOpcode::G_SET_FPENV:
return legalizeSetFPEnv(MI, MRI, B);
default:
return false;
}
Expand Down Expand Up @@ -6940,6 +6946,52 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
return true;
}

// Encoded hardware-register selector for the MODE register, used as the
// immediate operand of the s_getreg/s_setreg intrinsics when reading or
// writing the MODE part of the floating-point environment.
// NOTE(review): the trailing arguments (0, 23) look like a bit offset and a
// field width -- confirm against HwregEncoding::encode.
static constexpr unsigned FPEnvModeBitField =
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

// Encoded hardware-register selector for the TRAPSTS register, used the same
// way for the TRAPSTS part of the floating-point environment.
// NOTE(review): (0, 5) presumably selects the low five bits -- confirm.
static constexpr unsigned FPEnvTrapBitField =
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);

/// Custom-legalize G_GET_FPENV.
///
/// Replaces \p MI with two amdgcn.s.getreg reads (the MODE field, then the
/// TRAPSTS field) whose 32-bit results are merged, in that order, into the
/// 64-bit destination register of \p MI.
///
/// \returns false (leaving \p MI untouched) when the destination is not S64,
/// true once the replacement has been emitted and \p MI erased.
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register EnvDst = MI.getOperand(0).getReg();
  if (MRI.getType(EnvDst) != S64)
    return false;

  // Emits one s_getreg read of the hardware-register field selected by
  // \p Field and hands back the builder for the new instruction.
  auto ReadHwRegField = [&B](unsigned Field) {
    return B
        .buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                        /*HasSideEffects=*/true, /*isConvergent=*/false)
        .addImm(Field);
  };

  auto ModeBits = ReadHwRegField(FPEnvModeBitField);
  auto TrapBits = ReadHwRegField(FPEnvTrapBitField);

  // MODE is the first merge source, TRAPSTS the second.
  B.buildMergeLikeInstr(EnvDst, {ModeBits, TrapBits});

  MI.eraseFromParent();
  return true;
}

/// Custom-legalize G_SET_FPENV.
///
/// Splits the 64-bit operand of \p MI into two 32-bit pieces and emits one
/// amdgcn.s.setreg per piece: the first piece is written to the MODE field,
/// the second to the TRAPSTS field.
///
/// \returns false (leaving \p MI untouched) when the operand is not S64,
/// true once the replacement has been emitted and \p MI erased.
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  Register EnvSrc = MI.getOperand(0).getReg();
  if (MRI.getType(EnvSrc) != S64)
    return false;

  auto Halves = B.buildUnmerge({S32, S32}, MI.getOperand(0));

  // Emits one s_setreg that stores \p Value into the hardware-register field
  // selected by \p Field. The selector is narrowed to int16_t before being
  // attached as the immediate operand.
  auto WriteHwRegField = [&B](unsigned Field, Register Value) {
    B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                     /*HasSideEffects=*/true, /*isConvergent=*/false)
        .addImm(static_cast<int16_t>(Field))
        .addReg(Value);
  };

  WriteHwRegField(FPEnvModeBitField, Halves.getReg(0));
  WriteHwRegField(FPEnvTrapBitField, Halves.getReg(1));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;

bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B,
GISelChangeObserver &Observer,
Expand Down
72 changes: 72 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);

// TODO: Could move this to custom lowering, could benefit from combines on
// extract of relevant bits.
Expand Down Expand Up @@ -4079,6 +4081,72 @@ SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
}

/// Custom lowering for ISD::GET_FPENV.
///
/// Builds two chained amdgcn.s.getreg reads (MODE field and TRAPSTS field),
/// both hanging off the incoming chain, packs the two i32 results into a
/// v2i32 (MODE in element 0, TRAPSTS in element 1) and bitcasts that to the
/// i64 result. The output chain is a TokenFactor of the two read chains.
///
/// \returns \p Op unchanged when the result type is not i64; otherwise the
/// merged {value, chain} pair.
SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return Op;

  SDLoc DL(Op);

  // Immediate selectors for the two hardware-register fields.
  const uint32_t ModeField =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
  SDValue ModeFieldImm = DAG.getTargetConstant(ModeField, DL, MVT::i32);
  const uint32_t TrapField =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
  SDValue TrapFieldImm = DAG.getTargetConstant(TrapField, DL, MVT::i32);

  SDVTList ReadVTs = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue GetRegID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, DL, MVT::i32);

  // One s_getreg read of the given field, chained on the node's input chain.
  auto ReadHwRegField = [&](SDValue FieldImm) {
    return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, ReadVTs, Op.getOperand(0),
                       GetRegID, FieldImm);
  };
  SDValue ModeBits = ReadHwRegField(ModeFieldImm);
  SDValue TrapBits = ReadHwRegField(TrapFieldImm);

  // Join the chains of the two independent reads.
  SDValue OutChain =
      DAG.getNode(ISD::TokenFactor, DL, MVT::Other, ModeBits.getValue(1),
                  TrapBits.getValue(1));

  SDValue Packed =
      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, ModeBits, TrapBits);
  SDValue Env = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Packed);

  return DAG.getMergeValues({Env, OutChain}, DL);
}

/// Custom lowering for ISD::SET_FPENV.
///
/// Reinterprets the i64 environment operand as v2i32, extracts both elements,
/// passes each through amdgcn.readfirstlane, and then emits two
/// amdgcn.s.setreg nodes: element 0 goes to the MODE field and element 1 to
/// the TRAPSTS field. Both writes hang off the incoming chain and are joined
/// with a TokenFactor, which is the returned chain.
///
/// \returns \p Op unchanged when the environment operand is not i64.
SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  if (Op.getOperand(1).getValueType() != MVT::i64)
    return Op;

  SDValue EnvVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));

  // Pulls one 32-bit element out of the reinterpreted environment value.
  auto ExtractElt = [&](unsigned Idx) {
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, EnvVec,
                       DAG.getConstant(Idx, DL, MVT::i32));
  };
  SDValue ModeBits = ExtractElt(0);
  SDValue TrapBits = ExtractElt(1);

  // NOTE(review): the readfirstlane is presumably here so the value handed to
  // s_setreg is wave-uniform -- confirm.
  SDValue FirstLaneID =
      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
  auto ReadFirstLane = [&](SDValue Value) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, FirstLaneID,
                       Value);
  };
  ModeBits = ReadFirstLane(ModeBits);
  TrapBits = ReadFirstLane(TrapBits);

  // Immediate selectors for the two hardware-register fields.
  unsigned ModeField =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
  SDValue ModeFieldImm = DAG.getTargetConstant(ModeField, DL, MVT::i32);
  unsigned TrapField =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
  SDValue TrapFieldImm = DAG.getTargetConstant(TrapField, DL, MVT::i32);

  SDValue SetRegID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, DL, MVT::i32);

  // One s_setreg write of \p Value into the given field, chained on the
  // node's input chain.
  auto WriteHwRegField = [&](SDValue FieldImm, SDValue Value) {
    return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Op.getOperand(0),
                       SetRegID, FieldImm, Value);
  };
  SDValue ModeWrite = WriteHwRegField(ModeFieldImm, ModeBits);
  SDValue TrapWrite = WriteHwRegField(TrapFieldImm, TrapBits);

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, TrapWrite, ModeWrite);
}

Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
Expand Down Expand Up @@ -5663,6 +5731,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND:
return lowerFP_EXTEND(Op, DAG);
case ISD::GET_FPENV:
return lowerGET_FPENV(Op, DAG);
case ISD::SET_FPENV:
return lowerSET_FPENV(Op, DAG);
}
return SDValue();
}
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {

SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;

Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
Expand Down
Loading