Skip to content

Commit e349a9d

Browse files
committed
[AMDGPU] Implement 'llvm.get.fpenv' and 'llvm.set.fpenv'
Summary: This patch implements the LLVM floating point environment control intrinsics and also exposes it through clang. We encode the floating point environment as a 64-bit value that simply concatenates the values of the mode registers and the current trap status. We only fetch the bits relevant for floating point instructions. That is, rounding mode, denormalization mode, ieee, dx10 clamp, debug, enabled traps, f16 overflow, and active exceptions.
1 parent 32e2294 commit e349a9d

File tree

11 files changed

+269
-0
lines changed

11 files changed

+269
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,9 @@ BUILTIN(__builtin_amdgcn_read_exec_hi, "Ui", "nc")
325325

326326
BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
327327

328+
// Floating-point environment access. Clang emits these as the generic
// llvm.get.fpenv / llvm.set.fpenv intrinsics operating on a 64-bit value.
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
329+
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
330+
328331
//===----------------------------------------------------------------------===//
329332
// R600-NI only builtins.
330333
//===----------------------------------------------------------------------===//

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18406,6 +18406,17 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1840618406
CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
1840718407
return Builder.CreateCall(F, {Addr});
1840818408
}
18409+
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
18410+
Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv,
18411+
{llvm::Type::getInt64Ty(getLLVMContext())});
18412+
return Builder.CreateCall(F);
18413+
}
18414+
case AMDGPU::BI__builtin_amdgcn_set_fpenv: {
18415+
Function *F = CGM.getIntrinsic(Intrinsic::set_fpenv,
18416+
{llvm::Type::getInt64Ty(getLLVMContext())});
18417+
llvm::Value *Env = EmitScalarExpr(E->getArg(0));
18418+
return Builder.CreateCall(F, {Env});
18419+
}
1840918420
case AMDGPU::BI__builtin_amdgcn_read_exec:
1841018421
return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
1841118422
case AMDGPU::BI__builtin_amdgcn_read_exec_lo:

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,18 @@ unsigned test_wavefrontsize() {
839839
return __builtin_amdgcn_wavefrontsize();
840840
}
841841

842+
// Verify that __builtin_amdgcn_get_fpenv is emitted as a call to the generic
// llvm.get.fpenv intrinsic with an i64 result.
// CHECK-LABEL: @test_get_fpenv(
unsigned long test_get_fpenv() {
  // CHECK: call i64 @llvm.get.fpenv.i64()
  return __builtin_amdgcn_get_fpenv();
}
847+
848+
// Verify that __builtin_amdgcn_set_fpenv is emitted as a call to the generic
// llvm.set.fpenv intrinsic taking the i64 environment value.
// CHECK-LABEL: @test_set_fpenv(
void test_set_fpenv(unsigned long env) {
  // CHECK: call void @llvm.set.fpenv.i64(i64 %[[ENV:.+]])
  __builtin_amdgcn_set_fpenv(env);
}
853+
842854
// CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
843855
// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
844856
// CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { convergent mustprogress nocallback nofree nounwind willreturn memory(none) }

llvm/docs/AMDGPUUsage.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,6 +1151,13 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
11511151
register do not exactly match the FLT_ROUNDS values,
11521152
so a conversion is performed.
11531153

1154+
:ref:`llvm.get.fpenv<int_get_fpenv>` Returns the current value of the AMDGPU floating point environment.
1155+
This stores information related to the current rounding mode,
1156+
denormalization mode, enabled traps, and floating point exceptions.
1157+
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
1158+
1159+
:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.
1160+
11541161
llvm.amdgcn.wave.reduce.umin Performs an arithmetic unsigned min reduction on the unsigned values
11551162
provided by each lane in the wavefront.
11561163
Intrinsic takes a hint for reduction strategy using second operand

llvm/docs/LangRef.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26427,6 +26427,7 @@ similar to C library function 'fesetround', however this intrinsic does not
2642726427
return any value and uses platform-independent representation of IEEE rounding
2642826428
modes.
2642926429

26430+
.. _int_get_fpenv:
2643026431

2643126432
'``llvm.get.fpenv``' Intrinsic
2643226433
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -26450,6 +26451,7 @@ Semantics:
2645026451
The '``llvm.get.fpenv``' intrinsic reads the current floating-point environment
2645126452
and returns it as an integer value.
2645226453

26454+
.. _int_set_fpenv:
2645326455

2645426456
'``llvm.set.fpenv``' Intrinsic
2645526457
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

llvm/docs/ReleaseNotes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ Changes to the AArch64 Backend
7070
Changes to the AMDGPU Backend
7171
-----------------------------
7272

73+
* Implemented the ``llvm.get.fpenv`` and ``llvm.set.fpenv`` intrinsics.
74+
7375
Changes to the ARM Backend
7476
--------------------------
7577

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -905,6 +905,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
905905
getActionDefinitionsBuilder(G_STACKRESTORE)
906906
.legalFor({PrivatePtr});
907907

908+
getActionDefinitionsBuilder(G_GET_FPENV).customFor({S64});
909+
getActionDefinitionsBuilder(G_SET_FPENV).customFor({S64});
910+
908911
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
909912
.customIf(typeIsNot(0, PrivatePtr));
910913

@@ -2128,6 +2131,10 @@ bool AMDGPULegalizerInfo::legalizeCustom(
21282131
return legalizeFPTruncRound(MI, B);
21292132
case TargetOpcode::G_STACKSAVE:
21302133
return legalizeStackSave(MI, B);
2134+
case TargetOpcode::G_GET_FPENV:
2135+
return legalizeGetFPEnv(MI, B);
2136+
case TargetOpcode::G_SET_FPENV:
2137+
return legalizeSetFPEnv(MI, B);
21312138
default:
21322139
return false;
21332140
}
@@ -6940,6 +6947,42 @@ bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
69406947
return true;
69416948
}
69426949

6950+
// Encoded hwreg operand selecting the MODE register field used for the FP
// environment. The (0, 23) arguments correspond to the
// hwreg(HW_REG_MODE, 0, 23) operand seen in the generated assembly.
static constexpr unsigned FPEnvModeBitField =
6951+
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
6952+
6953+
// Encoded hwreg operand selecting the TRAPSTS register field used for the FP
// environment; corresponds to hwreg(HW_REG_TRAPSTS, 0, 5) in the generated
// assembly.
static constexpr unsigned FPEnvTrapBitField =
6954+
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
6955+
6956+
/// Custom legalization of G_GET_FPENV.
///
/// Emits one s_getreg intrinsic for the MODE field and one for the TRAPSTS
/// field, then merges the two 32-bit results (MODE first, TRAPSTS second)
/// into the destination register of \p MI. The original instruction is
/// erased afterwards.
///
/// \returns true; this lowering has no failure path.
bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
                                           MachineIRBuilder &B) const {
  const Register EnvDst = MI.getOperand(0).getReg();

  // Both reads are built identically; only the hwreg field immediate differs.
  auto ReadHwRegField = [&](unsigned BitField) {
    return B
        .buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
                        /*HasSideEffects=*/true, /*isConvergent=*/false)
        .addImm(BitField);
  };

  auto ModeBits = ReadHwRegField(FPEnvModeBitField);
  auto TrapBits = ReadHwRegField(FPEnvTrapBitField);

  B.buildMergeLikeInstr(EnvDst, {ModeBits, TrapBits});
  MI.eraseFromParent();
  return true;
}
6970+
6971+
/// Custom legalization of G_SET_FPENV.
///
/// Splits the 64-bit environment operand of \p MI into two 32-bit pieces and
/// writes them with two s_setreg intrinsics: piece 0 to the MODE field, piece
/// 1 to the TRAPSTS field. The original instruction is erased afterwards.
///
/// \returns true; this lowering has no failure path.
bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
                                           MachineIRBuilder &B) const {
  auto EnvHalves = B.buildUnmerge({S32, S32}, MI.getOperand(0));

  // Both writes are built identically; only the hwreg field and the source
  // register differ. The field encoding is passed as a 16-bit immediate.
  auto WriteHwRegField = [&](unsigned BitField, Register Value) {
    B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
                     /*HasSideEffects=*/true, /*isConvergent=*/false)
        .addImm(static_cast<int16_t>(BitField))
        .addReg(Value);
  };

  WriteHwRegField(FPEnvModeBitField, EnvHalves.getReg(0));
  WriteHwRegField(FPEnvTrapBitField, EnvHalves.getReg(1));

  MI.eraseFromParent();
  return true;
}
6985+
69436986
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
69446987
MachineInstr &MI) const {
69456988
MachineIRBuilder &B = Helper.MIRBuilder;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
214214
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
215215
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
216216

217+
bool legalizeGetFPEnv(MachineInstr &MI, MachineIRBuilder &B) const;
218+
bool legalizeSetFPEnv(MachineInstr &MI, MachineIRBuilder &B) const;
219+
217220
bool legalizeImageIntrinsic(
218221
MachineInstr &MI, MachineIRBuilder &B,
219222
GISelChangeObserver &Observer,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
876876

877877
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
878878
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
879+
setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
880+
setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
879881

880882
// TODO: Could move this to custom lowering, could benefit from combines on
881883
// extract of relevant bits.
@@ -4079,6 +4081,70 @@ SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
40794081
return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
40804082
}
40814083

4084+
/// Custom lowering of ISD::GET_FPENV with an i64 result.
///
/// Reads the MODE and TRAPSTS hardware register fields with two chained
/// s_getreg intrinsics and packs the two i32 results into a v2i32 (MODE in
/// element 0, TRAPSTS in element 1) that is bitcast to the i64 result.
///
/// \param Op  The GET_FPENV node; operand 0 is the incoming chain.
/// \param DAG The selection DAG being built.
/// \returns A merge of {i64 environment value, output chain}.
SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  assert(Op.getValueType() == MVT::i64);

  uint32_t ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
  uint32_t TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);

  // Each read produces {i32 value, chain}; both hang off the incoming chain.
  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue IntrinID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
  SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
                                   Op.getOperand(0), IntrinID, ModeHwRegImm);
  SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
                                   Op.getOperand(0), IntrinID, TrapHwRegImm);

  // Join both output chains with a TokenFactor so that anything chained after
  // this node is ordered after BOTH register reads. Merging the chains with
  // getMergeValues would hand back only the first result (the MODE read's
  // chain) and leave the TRAPSTS read unordered against later chained nodes.
  SDValue TokenReg =
      DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
                  GetTrapReg.getValue(1));

  SDValue CvtPtr =
      DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);

  return DAG.getMergeValues({Result, TokenReg}, SL);
}
4111+
4112+
/// Custom lowering of ISD::SET_FPENV with an i64 environment operand.
///
/// The environment is viewed as a v2i32: element 0 is written to the MODE
/// field and element 1 to the TRAPSTS field, each through a chained s_setreg
/// intrinsic. Both values are first passed through readfirstlane.
///
/// \param Op  The SET_FPENV node; operand 0 is the chain, operand 1 the i64
///            environment value.
/// \param DAG The selection DAG being built.
/// \returns A TokenFactor joining the chains of the two register writes.
SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const SDValue Chain = Op.getOperand(0);
  const SDValue Env = Op.getOperand(1);
  assert(Env.getValueType() == MVT::i64);

  // Split the 64-bit environment into its two 32-bit register images.
  SDValue EnvVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Env);
  SDValue ModeBits = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, EnvVec,
                                 DAG.getConstant(0, SL, MVT::i32));
  SDValue TrapBits = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, EnvVec,
                                 DAG.getConstant(1, SL, MVT::i32));

  // Route both halves through readfirstlane before they reach s_setreg; in
  // the generated code this shows up as v_readfirstlane_b32 feeding
  // s_setreg_b32.
  SDValue FirstLaneID =
      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
  ModeBits = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, FirstLaneID,
                         ModeBits);
  TrapBits = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, FirstLaneID,
                         TrapBits);

  // Encoded hwreg operands naming the fields to be written.
  const unsigned ModeField =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
  SDValue ModeFieldImm = DAG.getTargetConstant(ModeField, SL, MVT::i32);
  const unsigned TrapField =
      AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
  SDValue TrapFieldImm = DAG.getTargetConstant(TrapField, SL, MVT::i32);

  // The writes only produce a chain; both start from the incoming chain.
  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
  SDValue SetRegID =
      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
  SDValue WriteMode = DAG.getNode(ISD::INTRINSIC_VOID, SL, ChainOnly, Chain,
                                  SetRegID, ModeFieldImm, ModeBits);
  SDValue WriteTrap = DAG.getNode(ISD::INTRINSIC_VOID, SL, ChainOnly, Chain,
                                  SetRegID, TrapFieldImm, TrapBits);

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, WriteTrap, WriteMode);
}
4147+
40824148
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
40834149
const MachineFunction &MF) const {
40844150
Register Reg = StringSwitch<Register>(RegName)
@@ -5663,6 +5729,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
56635729
case ISD::FP_EXTEND:
56645730
case ISD::STRICT_FP_EXTEND:
56655731
return lowerFP_EXTEND(Op, DAG);
5732+
case ISD::GET_FPENV:
5733+
return lowerGET_FPENV(Op, DAG);
5734+
case ISD::SET_FPENV:
5735+
return lowerSET_FPENV(Op, DAG);
56665736
}
56675737
return SDValue();
56685738
}

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
425425

426426
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
427427
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
428+
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
429+
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
428430

429431
Register getRegisterByName(const char* RegName, LLT VT,
430432
const MachineFunction &MF) const override;
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6 %s
3+
; RUN: llc -mtriple=amdgcn -global-isel -mcpu=tahiti < %s | FileCheck -check-prefixes=GFX6 %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8 %s
5+
; RUN: llc -mtriple=amdgcn -global-isel -mcpu=fiji < %s | FileCheck -check-prefixes=GFX8 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
7+
; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
8+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s
9+
; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s
10+
; RUN: llc -mtriple=amdgcn -amdgpu-enable-delay-alu=0 -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
11+
; RUN: llc -mtriple=amdgcn -amdgpu-enable-delay-alu=0 -global-isel -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
12+
13+
declare i64 @llvm.get.fpenv.i64()
14+
15+
; Reads the FP environment through llvm.get.fpenv.i64. Per the assertions
; below, every tested subtarget is expected to issue two s_getreg_b32 reads
; (MODE bits 0..22 and TRAPSTS bits 0..4) and return them in v0 and v1.
define i64 @get_fpenv() {
16+
; GFX6-LABEL: get_fpenv:
17+
; GFX6: ; %bb.0: ; %entry
18+
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19+
; GFX6-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 23)
20+
; GFX6-NEXT: s_getreg_b32 s5, hwreg(HW_REG_TRAPSTS, 0, 5)
21+
; GFX6-NEXT: v_mov_b32_e32 v0, s4
22+
; GFX6-NEXT: v_mov_b32_e32 v1, s5
23+
; GFX6-NEXT: s_setpc_b64 s[30:31]
24+
;
25+
; GFX8-LABEL: get_fpenv:
26+
; GFX8: ; %bb.0: ; %entry
27+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
28+
; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 23)
29+
; GFX8-NEXT: s_getreg_b32 s5, hwreg(HW_REG_TRAPSTS, 0, 5)
30+
; GFX8-NEXT: v_mov_b32_e32 v0, s4
31+
; GFX8-NEXT: v_mov_b32_e32 v1, s5
32+
; GFX8-NEXT: s_setpc_b64 s[30:31]
33+
;
34+
; GFX9-LABEL: get_fpenv:
35+
; GFX9: ; %bb.0: ; %entry
36+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37+
; GFX9-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 23)
38+
; GFX9-NEXT: s_getreg_b32 s5, hwreg(HW_REG_TRAPSTS, 0, 5)
39+
; GFX9-NEXT: v_mov_b32_e32 v0, s4
40+
; GFX9-NEXT: v_mov_b32_e32 v1, s5
41+
; GFX9-NEXT: s_setpc_b64 s[30:31]
42+
;
43+
; GFX10-LABEL: get_fpenv:
44+
; GFX10: ; %bb.0: ; %entry
45+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46+
; GFX10-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 23)
47+
; GFX10-NEXT: s_getreg_b32 s5, hwreg(HW_REG_TRAPSTS, 0, 5)
48+
; GFX10-NEXT: v_mov_b32_e32 v0, s4
49+
; GFX10-NEXT: v_mov_b32_e32 v1, s5
50+
; GFX10-NEXT: s_setpc_b64 s[30:31]
51+
;
52+
; GFX11-LABEL: get_fpenv:
53+
; GFX11: ; %bb.0: ; %entry
54+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
55+
; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 23)
56+
; GFX11-NEXT: s_getreg_b32 s1, hwreg(HW_REG_TRAPSTS, 0, 5)
57+
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
58+
; GFX11-NEXT: s_setpc_b64 s[30:31]
59+
entry:
60+
%0 = tail call i64 @llvm.get.fpenv.i64()
61+
ret i64 %0
62+
}
63+
64+
declare void @llvm.set.fpenv.i64(i64)
65+
66+
; Writes the FP environment through llvm.set.fpenv.i64. Per the assertions
; below, every tested subtarget is expected to move each half of %env (v0, v1)
; to a scalar register with v_readfirstlane_b32 and write it with s_setreg_b32
; to MODE bits 0..22 and TRAPSTS bits 0..4 respectively.
define void @set_fpenv(i64 %env) {
67+
; GFX6-LABEL: set_fpenv:
68+
; GFX6: ; %bb.0: ; %entry
69+
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70+
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
71+
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 23), s4
72+
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
73+
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_TRAPSTS, 0, 5), s4
74+
; GFX6-NEXT: s_setpc_b64 s[30:31]
75+
;
76+
; GFX8-LABEL: set_fpenv:
77+
; GFX8: ; %bb.0: ; %entry
78+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79+
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
80+
; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 23), s4
81+
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
82+
; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_TRAPSTS, 0, 5), s4
83+
; GFX8-NEXT: s_setpc_b64 s[30:31]
84+
;
85+
; GFX9-LABEL: set_fpenv:
86+
; GFX9: ; %bb.0: ; %entry
87+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88+
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
89+
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 23), s4
90+
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
91+
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_TRAPSTS, 0, 5), s4
92+
; GFX9-NEXT: s_setpc_b64 s[30:31]
93+
;
94+
; GFX10-LABEL: set_fpenv:
95+
; GFX10: ; %bb.0: ; %entry
96+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97+
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
98+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 23), s4
99+
; GFX10-NEXT: v_readfirstlane_b32 s4, v1
100+
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_TRAPSTS, 0, 5), s4
101+
; GFX10-NEXT: s_setpc_b64 s[30:31]
102+
;
103+
; GFX11-LABEL: set_fpenv:
104+
; GFX11: ; %bb.0: ; %entry
105+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
107+
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 23), s0
108+
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
109+
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_TRAPSTS, 0, 5), s0
110+
; GFX11-NEXT: s_setpc_b64 s[30:31]
111+
entry:
112+
tail call void @llvm.set.fpenv.i64(i64 %env)
113+
ret void
114+
}

0 commit comments

Comments
 (0)