Skip to content

Commit 3e51451

Browse files
committed
[libc] Implement basic 'fenv.h' utilities on the AMD GPU
Summary: This patch implements a basic floating point environment on the AMDGPU. Users should be able to check rounding modes or certain floating point exceptions using the standard functions. This patch implements the basic set, but only exposes the `fegetround` and `fesetround` utilities. This is because getting the exceptions to work is difficult due to the behavior with the DX10_CLAMP bit that is always set. It is worth noting that this utility is not strictly standards conformant because we can only control this behavior on individual warps. Whether or not we can say it's truly implemented then is an exercise for the reader.
1 parent 23f3651 commit 3e51451

File tree

6 files changed

+336
-18
lines changed

6 files changed

+336
-18
lines changed

libc/config/gpu/api.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
5454
}
5555

5656
def FenvAPI: PublicAPI<"fenv.h"> {
57-
let Types = ["fenv_t"];
57+
let Types = ["fenv_t", "fexcept_t"];
5858
}
5959

6060
def StdIOAPI : PublicAPI<"stdio.h"> {

libc/config/gpu/entrypoints.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,19 @@ set(TARGET_LIBC_ENTRYPOINTS
183183
libc.src.gpu.rpc_host_call
184184
)
185185

186+
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
187+
set(extra_entrypoints
188+
# fenv.h entrypoints
189+
libc.src.fenv.fegetenv
190+
libc.src.fenv.fegetround
191+
libc.src.fenv.fesetenv
192+
libc.src.fenv.fesetround
193+
)
194+
endif()
195+
186196
set(TARGET_LIBM_ENTRYPOINTS
197+
${extra_entrypoints}
198+
187199
# math.h entrypoints
188200
libc.src.math.acos
189201
libc.src.math.acosf

libc/include/llvm-libc-macros/math-macros.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
#define math_errhandling 0
3333
#elif defined(__NO_MATH_ERRNO__)
3434
#define math_errhandling (MATH_ERREXCEPT)
35+
#elif defined(__AMDGPU__)
36+
#define math_errhandling (MATH_ERREXCEPT)
3537
#elif defined(__NVPTX__) || defined(__AMDGPU__)
3638
#define math_errhandling (MATH_ERRNO)
3739
#else

libc/src/__support/FPUtil/FEnvImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
#include "arm/FEnvImpl.h"
3434
#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
3535
#include "riscv/FEnvImpl.h"
36+
#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
37+
#include "amdgpu/FEnvImpl.h"
3638
#else
3739

3840
namespace LIBC_NAMESPACE::fputil {
Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
//===-- amdgpu floating point env manipulation functions --------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
10+
#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
11+
12+
#include "src/__support/GPU/utils.h"
13+
#include "src/__support/macros/attributes.h"
14+
#include "src/__support/macros/properties/architectures.h"
15+
16+
#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
17+
#error "Invalid include"
18+
#endif
19+
20+
#include <fenv.h>
21+
#include <stdint.h>
22+
23+
namespace LIBC_NAMESPACE {
24+
namespace fputil {
25+
26+
namespace internal {
27+
28+
// Builds the 16-bit immediate operand that selects a bit field of an AMDGPU
// hardware register for the s_getreg / s_setreg builtins. The immediate is
// laid out as follows.
//
// ┌──────────────────┬──────────────┬───────────────┐
// │ SIZE - 1 [15:11] │ OFFSET[10:6] │    ID[5:0]    │
// └──────────────────┴──────────────┴───────────────┘
//
//   id:     identifier of the hardware register (six bits).
//   offset: index of the lowest bit of the field, 0 to 31.
//   size:   width of the field in bits, 1 to 32.
//
// The width is stored biased by one so that every width from 1 to 32 fits in
// the five-bit field. Encoding the raw width instead selects one bit more than
// requested, which makes a write spill into the neighbouring field, and makes
// a full 32-bit access impossible to express. Each argument is masked to the
// width of its field so an out-of-range value cannot corrupt another field.
LIBC_INLINE constexpr uint16_t get_register(uint32_t id, uint32_t offset,
                                            uint32_t size) {
  return static_cast<uint16_t>(((size - 1) & 0x1f) << 11 |
                               (offset & 0x1f) << 6 | (id & 0x3f));
}
42+
43+
// Integral identifiers for the relevant hardware registers.
44+
enum Register : uint16_t {
45+
// The mode register controls the floating point behaviour of the device. It
46+
// can be read or written to by the kernel during runtime It is laid out as a
47+
// bit field with the following offsets and sizes listed for the relevant
48+
// entries.
49+
//
50+
// ┌─────┬─────────────┬─────┬─────────┬──────────┬─────────────┬────────────┐
51+
// │ ... │ EXCP[20:12] │ ... │ IEEE[9] │ CLAMP[8] │ DENORM[7:4] │ ROUND[3:0] │
52+
// └─────┴─────────────┴─────┴─────────┴──────────┴─────────────┴────────────┘
53+
//
54+
// The rounding mode and denormal modes both control f64/f16 and f32 precision
55+
// operations separately with two bits. The accepted values for the rounding
56+
// mode are nearest, upward, downward, and toward given 0, 1, 2, and 3
57+
// respectively.
58+
//
59+
// The CLAMP bit indicates that DirectX 10 handling of NaNs is enabled in the
60+
// vector ALU. When set this will clamp NaN values to zero and pass them
61+
// otherwise. A hardware bug causes this bit to prevent floating exceptions
62+
// from being recorded if this bit is set on all generations before GFX12.
63+
//
64+
// The IEEE bit controls whether or not floating point operations supporting
65+
// exception gathering are IEEE 754-2008 compliant.
66+
//
67+
// The EXCP field indicates which exceptions will cause the instruction to
68+
// take a trap if traps are enabled, see the status register. The bit layout
69+
// is identical to that in the trap status register. We are only concerned
70+
// with the first six bits and ignore the other three.
71+
HW_REG_MODE = 1,
72+
HW_REG_MODE_ROUND = get_register(HW_REG_MODE, 0, 4),
73+
HW_REG_MODE_CLAMP = get_register(HW_REG_MODE, 8, 1),
74+
HW_REG_MODE_EXCP = get_register(HW_REG_MODE, 12, 6),
75+
76+
// The status register is a read-only register that contains information about
77+
// how the kernel was launched. The sixth bit TRAP_EN[6] indicates whether or
78+
// not traps are enabled for this kernel. If this bit is set along with the
79+
// corresponding bit in the mode register then a trap will be taken.
80+
HW_REG_STATUS = 2,
81+
HW_REG_STATUS_TRAP_EN = get_register(HW_REG_STATUS, 6, 1),
82+
83+
// The trap status register contains information about the status of the
84+
// exceptions. These bits are accumulated regarless of trap handling statuss
85+
// and are sticky until cleared.
86+
//
87+
// 5 4 3 2 1 0
88+
// ┌─────────┬───────────┬──────────┬────────────────┬──────────┬─────────┐
89+
// │ Inexact │ Underflow │ Overflow │ Divide by zero │ Denormal │ Invalid │
90+
// └─────────┴───────────┴──────────┴────────────────┴──────────┴─────────┘
91+
//
92+
// These exceptions indicate that at least one lane in the current wavefront
93+
// signalled an floating point exception. There is no way to increase the
94+
// granularity.
95+
HW_REG_TRAPSTS = 3,
96+
HW_REG_TRAPSTS_EXCP = get_register(HW_REG_TRAPSTS, 0, 6),
97+
};
98+
99+
// Bit masks for the six standard floating point exceptions as they appear in
// the trap status register, one bit per exception starting at bit zero.
enum ExceptionFlags : uint32_t {
  EXCP_INVALID_F = 1u << 0,
  EXCP_DENORMAL_F = 1u << 1,
  EXCP_DIV_BY_ZERO_F = 1u << 2,
  EXCP_OVERFLOW_F = 1u << 3,
  EXCP_UNDERFLOW_F = 1u << 4,
  EXCP_INEXACT_F = 1u << 5,
};
109+
110+
// Two-bit encodings of the rounding modes as stored in the mode register.
enum RoundingFlags : uint32_t {
  ROUND_TO_NEAREST = 0,
  ROUND_UPWARD = 1,
  ROUND_DOWNWARD = 2,
  ROUND_TOWARD_ZERO = 3,
};
117+
118+
// Exception flags are individual bits in the corresponding hardware register.
119+
// This converts between the exported C standard values and the hardware values.
120+
LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
121+
return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
122+
#ifdef __FE_DENORM
123+
(excepts & __FE_DENORM ? EXCP_DENORMAL_F : 0) |
124+
#endif // __FE_DENORM
125+
(excepts & FE_DIVBYZERO ? EXCP_DIV_BY_ZERO_F : 0) |
126+
(excepts & FE_OVERFLOW ? EXCP_OVERFLOW_F : 0) |
127+
(excepts & FE_UNDERFLOW ? EXCP_UNDERFLOW_F : 0) |
128+
(excepts & FE_INEXACT ? EXCP_INEXACT_F : 0);
129+
}
130+
131+
// Translates a mask of hardware exception bits (ExceptionFlags) into the
// matching mask of C standard FE_* exception macros. This is the inverse of
// get_status_value_for_except. The denormal bit is only reported when the
// headers provide __FE_DENORM.
LIBC_INLINE uint32_t get_except_value_for_status(uint32_t status) {
  uint32_t fe_bits = 0;
  if (status & EXCP_INVALID_F)
    fe_bits |= FE_INVALID;
#ifdef __FE_DENORM
  if (status & EXCP_DENORMAL_F)
    fe_bits |= __FE_DENORM;
#endif // __FE_DENORM
  if (status & EXCP_DIV_BY_ZERO_F)
    fe_bits |= FE_DIVBYZERO;
  if (status & EXCP_OVERFLOW_F)
    fe_bits |= FE_OVERFLOW;
  if (status & EXCP_UNDERFLOW_F)
    fe_bits |= FE_UNDERFLOW;
  if (status & EXCP_INEXACT_F)
    fe_bits |= FE_INEXACT;
  return fe_bits;
}
141+
142+
// FIXME: These require the 'noinline' attribute to pessimistically flush the
143+
// state. Otherwise, reading from the register may return stale results.
144+
145+
// Access the six bits in the trap status register for the floating point
146+
// exceptions.
147+
[[gnu::noinline]] LIBC_INLINE void set_trap_status(uint32_t status) {
148+
uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), status);
149+
__builtin_amdgcn_s_setreg(HW_REG_TRAPSTS_EXCP, val);
150+
}
151+
152+
// Reads the floating point exception bits of the trap status register, encoded
// as ExceptionFlags.
[[gnu::noinline]] LIBC_INLINE uint32_t get_trap_status() {
  const uint32_t raised = __builtin_amdgcn_s_getreg(HW_REG_TRAPSTS_EXCP);
  return raised;
}
155+
156+
// Access the six bits in the mode register that control which exceptions will
157+
// result in a trap being taken. Uses the same flags as the status register.
158+
[[gnu::noinline]] LIBC_INLINE void set_enabled_trap(uint32_t flags) {
159+
uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), flags);
160+
__builtin_amdgcn_s_setreg(HW_REG_MODE_EXCP, val);
161+
}
162+
163+
// Reads the bits of the mode register that choose which exceptions cause a
// trap to be taken, encoded as ExceptionFlags.
[[gnu::noinline]] LIBC_INLINE uint32_t get_enabled_trap() {
  const uint32_t enabled = __builtin_amdgcn_s_getreg(HW_REG_MODE_EXCP);
  return enabled;
}
166+
167+
// Access the four bits in the mode register's ROUND[3:0] field. The hardware
168+
// supports setting the f64/f16 and f32 precision rounding modes separately but
169+
// we will assume that these always match.
170+
[[gnu::noinline]] LIBC_INLINE void set_rounding_mode(uint32_t flags) {
171+
uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), flags);
172+
__builtin_amdgcn_s_setreg(HW_REG_MODE_ROUND, val << 2 | val);
173+
}
174+
175+
// Reads the rounding mode from the mode register. Only the low two bits of the
// ROUND field are returned (a RoundingFlags value), which relies on both
// halves of the field holding the same mode.
[[gnu::noinline]] LIBC_INLINE uint32_t get_rounding_mode() {
  const uint32_t round_field = __builtin_amdgcn_s_getreg(HW_REG_MODE_ROUND);
  return round_field & 0x3;
}
178+
179+
// NOTE: On architectures before GFX12 the DX10_CLAMP bit supresses all floating
180+
// point exceptions. In order to get them to be presented we need to
181+
// manually set if off.
182+
[[gnu::noinline]] LIBC_INLINE void set_clamp_low() {
183+
__builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, 0);
184+
}
185+
186+
// Sets the CLAMP bit of the mode register, the counterpart of set_clamp_low.
[[gnu::noinline]] LIBC_INLINE void set_clamp_high() {
  constexpr uint32_t CLAMP_ON = 1;
  __builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, CLAMP_ON);
}
189+
190+
} // namespace internal
191+
192+
// Clears the sticky exception flags named by 'excepts' (a mask of FE_* macros)
// in the trap status register and leaves the other flags as they are. Always
// returns 0.
LIBC_INLINE int clear_except(int excepts) {
  const uint32_t to_clear = internal::get_status_value_for_except(excepts);
  const uint32_t keep_mask = ~to_clear & 0x3f;
  const uint32_t raised = internal::get_trap_status();
  internal::set_trap_status(raised & keep_mask);
  return 0;
}
199+
200+
// Reports which of the exceptions named by 'excepts' (a mask of FE_* macros)
// are currently recorded in the trap status register. The result is a mask of
// FE_* macros.
//
// The requested set is intersected with the recorded set while both are in the
// hardware encoding, and only the result is converted back to FE_* values. The
// two encodings need not share bit positions, so they must not be mixed in one
// bitwise operation.
LIBC_INLINE int test_except(int excepts) {
  uint32_t requested = internal::get_status_value_for_except(excepts);
  uint32_t raised = internal::get_trap_status();
  return internal::get_except_value_for_status(raised & requested);
}
205+
206+
// Returns the set of exceptions for which trapping is currently enabled, as a
// mask of FE_* macros. This is the query counterpart of enable_except and
// disable_except.
//
// The hardware bits are converted before they are returned so that callers
// never see the register encoding, which need not match the FE_* values.
LIBC_INLINE int get_except() {
  uint32_t enabled = internal::get_enabled_trap() & 0x3f;
  return internal::get_except_value_for_status(enabled);
}
207+
208+
// Sets the sticky exception flags named by 'excepts' (a mask of FE_* macros)
// in the trap status register. Flags that are already recorded are preserved;
// writing only the requested bits would silently clear every other pending
// exception. Always returns 0.
LIBC_INLINE int set_except(int excepts) {
  uint32_t requested = internal::get_status_value_for_except(excepts);
  uint32_t raised = internal::get_trap_status();
  // Restrict to the six exception bits that the status field holds.
  internal::set_trap_status((raised | requested) & 0x3f);
  return 0;
}
212+
213+
// Enables trapping for the exceptions named by 'excepts' (a mask of FE_*
// macros) in addition to those already enabled. Returns the set of exceptions
// that were enabled before the call, as a mask of FE_* macros.
//
// The previous set is read from the enabled-trap field of the mode register,
// not from the sticky trap status, and is merged with the request so that
// enabling one exception does not disable the others.
LIBC_INLINE int enable_except(int excepts) {
  uint32_t requested = internal::get_status_value_for_except(excepts);
  uint32_t previous = internal::get_enabled_trap() & 0x3f;
  internal::set_enabled_trap(previous | requested);
  return internal::get_except_value_for_status(previous);
}
219+
220+
// Disables trapping for the exceptions named by 'excepts' (a mask of FE_*
// macros) and leaves the other enabled exceptions untouched. Returns the set
// of exceptions that were enabled before the call, as a mask of FE_* macros.
//
// The previous set is converted out of the hardware encoding before it is
// returned, because the register bits need not match the FE_* values.
LIBC_INLINE int disable_except(int excepts) {
  uint32_t requested = internal::get_status_value_for_except(excepts);
  uint32_t previous = internal::get_enabled_trap() & 0x3f;
  internal::set_enabled_trap(previous & ~requested);
  return internal::get_except_value_for_status(previous);
}
227+
228+
// Raises the exceptions named by 'excepts' (a mask of FE_* macros): trapping
// is enabled for them and their sticky flags are set in the trap status
// register. Always returns 0.
//
// enable_except takes FE_* values, so it is given 'excepts' itself and not the
// converted hardware mask. The requested flags are added to the ones already
// recorded so that raising one exception does not clear another.
// NOTE(review): whether writing the status bits actually makes the hardware
// take a trap is not established by this file; confirm on hardware.
LIBC_INLINE int raise_except(int excepts) {
  uint32_t status = internal::get_status_value_for_except(excepts);
  enable_except(excepts);
  uint32_t raised = internal::get_trap_status();
  // Restrict to the six exception bits that the status field holds.
  internal::set_trap_status((raised | status) & 0x3f);
  return 0;
}
234+
235+
// Returns the current rounding mode as one of the FE_* rounding macros.
LIBC_INLINE int get_round() {
  const uint32_t hw_mode = internal::get_rounding_mode();
  if (hw_mode == internal::ROUND_TO_NEAREST)
    return FE_TONEAREST;
  if (hw_mode == internal::ROUND_UPWARD)
    return FE_UPWARD;
  if (hw_mode == internal::ROUND_DOWNWARD)
    return FE_DOWNWARD;
  if (hw_mode == internal::ROUND_TOWARD_ZERO)
    return FE_TOWARDZERO;
  // get_rounding_mode only yields two-bit values, all of which are handled.
  __builtin_unreachable();
}
248+
249+
// Sets the rounding mode from one of the FE_* rounding macros. Returns 0 on
// success and 1, without touching the hardware, when 'rounding_mode' is not a
// recognised macro.
LIBC_INLINE int set_round(int rounding_mode) {
  uint32_t hw_mode = internal::ROUND_TO_NEAREST;
  if (rounding_mode == FE_TONEAREST)
    hw_mode = internal::ROUND_TO_NEAREST;
  else if (rounding_mode == FE_UPWARD)
    hw_mode = internal::ROUND_UPWARD;
  else if (rounding_mode == FE_DOWNWARD)
    hw_mode = internal::ROUND_DOWNWARD;
  else if (rounding_mode == FE_TOWARDZERO)
    hw_mode = internal::ROUND_TOWARD_ZERO;
  else
    return 1;

  internal::set_rounding_mode(hw_mode);
  return 0;
}
268+
269+
// The fenv_t struct for the AMD GPU is simply a 32-bit integer field of the
270+
// current state. We combine the four bits for the rounding mode with the six
271+
// bits for the exception state and the six bits for the enabled exceptions.
272+
//
273+
// ┌────────────────────────────┬─────────────────┬─────────────┬─────────────┐
274+
// │ UNUSED[31:16] │ ENABLED[15:10] │ STATUS[9:4] │ ROUND[3:0] │
275+
// └────────────────────────────┴─────────────────┴─────────────┴─────────────┘
276+
//
277+
// The top sixteen bits are currently unused and should be zero.
278+
LIBC_INLINE int get_env(fenv_t *env) {
279+
if (!env)
280+
return 1;
281+
282+
uint32_t rounding = internal::get_rounding_mode();
283+
uint32_t status = internal::get_trap_status();
284+
uint32_t enabled = internal::get_enabled_trap();
285+
env->__fpc = enabled << 10 | status << 4 | rounding;
286+
return 0;
287+
}
288+
289+
// Restores the floating point environment from 'env': the rounding mode comes
// from bits [3:0], the exception status from bits [9:4], and the enabled
// exceptions from bits [15:10] of the packed field. Returns 0 on success and
// 1 if 'env' is null.
LIBC_INLINE int set_env(const fenv_t *env) {
  if (env == nullptr)
    return 1;

  const auto packed = env->__fpc;
  internal::set_rounding_mode(packed & 0xf);
  internal::set_trap_status((packed >> 4) & 0x3f);
  internal::set_enabled_trap((packed >> 10) & 0x3f);
  return 0;
}
298+
299+
} // namespace fputil
300+
} // namespace LIBC_NAMESPACE
301+
302+
#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H

0 commit comments

Comments
 (0)