Skip to content

Commit 17bd806

Browse files
committed
AMDGPU: Implement llvm.get.fpmode
Currently s_getreg_b32 is missing the possible mode use. Really we need separate pseudos for mode-only accesses, but leave this as a pre-existing issue. https://reviews.llvm.org/D152710
1 parent 21ac457 commit 17bd806

File tree

5 files changed

+771
-0
lines changed

5 files changed

+771
-0
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -999,6 +999,13 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
999999
:ref:`llvm.stacksave.p5 <int_stacksave>` Implemented, must use the alloca address space.
10001000
:ref:`llvm.stackrestore.p5 <int_stackrestore>` Implemented, must use the alloca address space.
10011001

1002+
:ref:`llvm.get.fpmode.i32 <int_get_fpmode>` The natural floating-point mode type is i32. This
1003+
is implemented by extracting relevant bits out of the MODE
1004+
register with s_getreg_b32. The first 10 bits are the
1005+
core floating-point mode. Bits 12:18 are the exception
1006+
mask. On gfx9+, bit 23 is FP16_OVFL. Bitfields not
1007+
relevant to floating-point instructions are 0s.
1008+
10021009
:ref:`llvm.get.rounding<int_get_rounding>` AMDGPU supports two separately controllable rounding
10031010
modes depending on the floating-point type. One
10041011
controls float, and the other controls both double and

llvm/docs/LangRef.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25660,6 +25660,7 @@ The '``llvm.reset.fpenv``' intrinsic sets the current floating-point environment
2566025660
to default state. It is similar to the call 'fesetenv(FE_DFL_ENV)', except it
2566125661
does not return any value.
2566225662

25663+
.. _int_get_fpmode:
2566325664

2566425665
'``llvm.get.fpmode``' Intrinsic
2566525666
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
757757
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
758758
setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
759759

760+
// TODO: Could move this to custom lowering, could benefit from combines on
761+
// extract of relevant bits.
762+
setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
763+
760764
setTargetDAGCombine({ISD::ADD,
761765
ISD::UADDO_CARRY,
762766
ISD::SUB,

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,8 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
870870

871871
// This is hasSideEffects to allow its use in readcyclecounter selection.
872872
// FIXME: Need to truncate immediate to 16-bits.
873+
// FIXME: Missing mode register use. Should have separate pseudos for
874+
// accesses that may read MODE and accesses that only read MODE.
873875
def S_GETREG_B32 : SOPK_Pseudo <
874876
"s_getreg_b32",
875877
(outs SReg_32:$sdst), (ins hwreg:$simm16),
@@ -1424,6 +1426,66 @@ def : GCNPat <
14241426
(S_WAIT_EVENT (i16 0))
14251427
>;
14261428

1429+
// The first 10 bits of the mode register are the core FP mode on all
1430+
// subtargets.
1431+
//
1432+
// The high bits include additional fields, intermixed with some
1433+
// non-floating point environment information. We extract the full
1434+
// register and clear non-relevant bits.
1435+
//
1436+
// EXCP_EN covers floating point exceptions, but also some other
1437+
// non-FP exceptions.
1438+
//
1439+
// Bits 12-18 cover the relevant exception mask on all subtargets.
1440+
//
1441+
// FIXME: Bit 18 is int_div0, should this be in the FP environment? I
1442+
// think the only source is v_rcp_iflag_i32.
1443+
//
1444+
// On GFX9+:
1445+
// Bit 23 is the additional FP16_OVFL mode.
1446+
//
1447+
// Bits 19, 20, and 21 cover non-FP exceptions and differ between
1448+
// gfx9/10/11, so we ignore them here.
1449+
1450+
// TODO: Would it be cheaper to emit multiple s_getreg_b32 instructions for
1451+
// the ranges and combine the results?
1452+
1453+
defvar fp_round_mask = !add(!shl(1, 4), -1);
1454+
defvar fp_denorm_mask = !shl(!add(!shl(1, 4), -1), 4);
1455+
defvar dx10_clamp_mask = !shl(1, 8);
1456+
defvar ieee_mode_mask = !shl(1, 9);
1457+
1458+
// Covers fp_round, fp_denorm, dx10_clamp, and IEEE bit.
1459+
defvar fpmode_mask =
1460+
!or(fp_round_mask, fp_denorm_mask, dx10_clamp_mask, ieee_mode_mask);
1461+
1462+
defvar fp_excp_en_mask = !shl(!add(!shl(1, 7), -1), 12);
1463+
defvar fp16_ovfl = !shl(1, 23);
1464+
defvar fpmode_mask_gfx6plus = !or(fpmode_mask, fp_excp_en_mask);
1465+
defvar fpmode_mask_gfx9plus = !or(fpmode_mask_gfx6plus, fp16_ovfl);
1466+
1467+
class GetFPModePat<int fpmode_mask> : GCNPat<
1468+
(i32 get_fpmode),
1469+
(S_AND_B32 (i32 fpmode_mask),
1470+
(S_GETREG_B32 getHwRegImm<
1471+
HWREG.MODE, 0,
1472+
!add(!logtwo(fpmode_mask), 1)>.ret))
1473+
>;
1474+
1475+
// TODO: Might be worth moving to custom lowering so the AND is
1476+
// exposed to demanded bits optimizations. Most users probably only
1477+
// care about the rounding or denorm mode bits. We also can reduce the
1478+
// demanded read from the getreg immediate.
1479+
let SubtargetPredicate = isGFX9Plus in {
1480+
// Last bit = FP16_OVFL
1481+
def : GetFPModePat<fpmode_mask_gfx9plus>;
1482+
}
1483+
1484+
// Last bit = EXCP_EN.int_div0
1485+
let SubtargetPredicate = isNotGFX9Plus in {
1486+
def : GetFPModePat<fpmode_mask_gfx6plus>;
1487+
}
1488+
14271489
//===----------------------------------------------------------------------===//
14281490
// SOP2 Patterns
14291491
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)