Skip to content

Commit 7585511

Browse files
committed
[libc] Implement basic 'fenv.h' utilities on the AMD GPU
Summary: This patch implements a basic floating point environment on the AMDGPU. Users should be able to check rounding modes or certain floating point exceptions using the standard functions. This patch implements the basic set, but only exposes the `fegetround` and `fesetround` utilities. This is because getting the exceptions to work is difficult due to the behavior with the DX10_CLAMP bit that is always set. It is worth noting that this utility is not strictly standards conformant because we can only control this behavior on individual warps. Whether or not we can say it's truly implemented then is an exercise for the reader.
1 parent 70b79a9 commit 7585511

File tree

8 files changed

+323
-25
lines changed

8 files changed

+323
-25
lines changed

libc/config/gpu/api.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
5555
}
5656

5757
def FenvAPI: PublicAPI<"fenv.h"> {
58-
let Types = ["fenv_t"];
58+
let Types = ["fenv_t", "fexcept_t"];
5959
}
6060

6161
def StdIOAPI : PublicAPI<"stdio.h"> {

libc/config/gpu/entrypoints.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,19 @@ set(TARGET_LIBC_ENTRYPOINTS
214214
libc.src.gpu.rpc_fprintf
215215
)
216216

217+
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
218+
set(extra_entrypoints
219+
# fenv.h entrypoints
220+
libc.src.fenv.fegetenv
221+
libc.src.fenv.fegetround
222+
libc.src.fenv.fesetenv
223+
libc.src.fenv.fesetround
224+
)
225+
endif()
226+
217227
set(TARGET_LIBM_ENTRYPOINTS
228+
${extra_entrypoints}
229+
218230
# math.h entrypoints
219231
libc.src.math.acos
220232
libc.src.math.acosf

libc/include/llvm-libc-macros/math-macros.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
#define math_errhandling 0
4646
#elif defined(__NO_MATH_ERRNO__)
4747
#define math_errhandling (MATH_ERREXCEPT)
48+
#elif defined(__AMDGPU__)
49+
#define math_errhandling (MATH_ERREXCEPT)
4850
#elif defined(__NVPTX__) || defined(__AMDGPU__)
4951
#define math_errhandling (MATH_ERRNO)
5052
#else

libc/include/llvm-libc-types/fenv_t.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ typedef struct {
2727
typedef unsigned int fenv_t;
2828
#elif defined(__AMDGPU__) || defined(__NVPTX__)
2929
typedef struct {
30-
unsigned int __fpc;
30+
unsigned long long __fpc;
3131
} fenv_t;
3232
#else
3333
#error "fenv_t not defined for your platform"

libc/src/__support/FPUtil/FEnvImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
#include "arm/FEnvImpl.h"
3333
#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
3434
#include "riscv/FEnvImpl.h"
35+
#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
36+
#include "amdgpu/FEnvImpl.h"
3537
#else
3638

3739
namespace LIBC_NAMESPACE::fputil {
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
//===-- amdgpu floating point env manipulation functions --------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
10+
#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
11+
12+
#include "src/__support/GPU/utils.h"
13+
#include "src/__support/macros/attributes.h"
14+
#include "src/__support/macros/properties/architectures.h"
15+
16+
#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
17+
#error "Invalid include"
18+
#endif
19+
20+
#include "hdr/types/fenv_t.h"
21+
#include <stdint.h>
22+
23+
namespace LIBC_NAMESPACE {
24+
namespace fputil {
25+
26+
namespace internal {
27+
// Returns the current status of the AMDGPU floating point environment. In
28+
// practice this is simply a 64-bit concatenation of the mode register and the
29+
// trap status register.
30+
//
31+
// The mode register controls the floating point behaviour of the device. It
32+
// can be read or written to by the kernel during runtime. It is laid out as a
33+
// bit field with the following offsets and sizes listed for the relevant
34+
// entries.
35+
//
36+
// ┌─────┬─────────────┬─────┬─────────┬──────────┬─────────────┬────────────┐
37+
// │ ... │ EXCP[20:12] │ ... │ IEEE[9] │ CLAMP[8] │ DENORM[7:4] │ ROUND[3:0] │
38+
// └─────┴─────────────┴─────┴─────────┴──────────┴─────────────┴────────────┘
39+
//
40+
// The rounding mode and denormal modes both control f64/f16 and f32 precision
41+
// operations separately with two bits. The accepted values for the rounding
42+
// mode are nearest, upward, downward, and toward zero given 0, 1, 2, and 3
43+
// respectively.
44+
//
45+
// The CLAMP bit indicates that DirectX 10 handling of NaNs is enabled in the
46+
// vector ALU. When set this will clamp NaN values to zero and pass them
47+
// otherwise. A hardware bug causes this bit to prevent floating exceptions
48+
// from being recorded if this bit is set on all generations before GFX12.
49+
//
50+
// The IEEE bit controls whether or not floating point operations supporting
51+
// exception gathering are IEEE 754-2008 compliant.
52+
//
53+
// The EXCP field indicates which exceptions will cause the instruction to
54+
// take a trap if traps are enabled, see the status register. The bit layout
55+
// is identical to that in the trap status register. We are only concerned
56+
// with the first six bits and ignore the other three.
57+
//
58+
// The trap status register contains information about the status of the
59+
// exceptions. These bits are accumulated regardless of trap handling status
60+
// and are sticky until cleared.
61+
//
62+
// 5 4 3 2 1 0
63+
// ┌─────────┬───────────┬──────────┬────────────────┬──────────┬─────────┐
64+
// │ Inexact │ Underflow │ Overflow │ Divide by zero │ Denormal │ Invalid │
65+
// └─────────┴───────────┴──────────┴────────────────┴──────────┴─────────┘
66+
//
67+
// These exceptions indicate that at least one lane in the current wavefront
68+
// signalled a floating point exception. There is no way to increase the
69+
// granularity.
70+
//
71+
// The returned value has the following layout.
72+
//
73+
// ┌────────────────────┬─────────────────────┐
74+
// │ Trap Status[38:32] │ Mode Register[31:0] │
75+
// └────────────────────┴─────────────────────┘
76+
// Forwards directly to the `__builtin_amdgcn_get_fpenv` compiler builtin.
LIBC_INLINE uint64_t get_fpenv() { return __builtin_amdgcn_get_fpenv(); }
77+
78+
// Set the floating point environment using the same layout as above.
79+
// Forwards `env` unchanged to the `__builtin_amdgcn_set_fpenv` builtin.
LIBC_INLINE void set_fpenv(uint64_t env) { __builtin_amdgcn_set_fpenv(env); }
80+
81+
// Single-bit masks for the six standard floating point exceptions as they are
// encoded in the trap status register. Bit 0 is "invalid" and bit 5 is
// "inexact"; each enumerator is the mask for exactly one bit position.
enum ExceptionFlags : uint32_t {
  EXCP_INVALID_F = 1u << 0,
  EXCP_DENORMAL_F = 1u << 1,
  EXCP_DIV_BY_ZERO_F = 1u << 2,
  EXCP_OVERFLOW_F = 1u << 3,
  EXCP_UNDERFLOW_F = 1u << 4,
  EXCP_INEXACT_F = 1u << 5,
};
91+
92+
// Rounding mode encodings exchanged with the `llvm.get.rounding` intrinsic on
// the AMDGPU backend. The enumeration is documented at:
// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-rounding-mode-enumeration-values-table
enum RoundingFlags : uint32_t {
  ROUND_TOWARD_ZERO = 0,
  ROUND_TO_NEAREST = 1,
  ROUND_UPWARD = 2,
  ROUND_DOWNWARD = 3,
};
101+
102+
// Exception flags are individual bits in the corresponding hardware register.
103+
// This converts between the exported C standard values and the hardware values.
104+
LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
105+
return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
106+
#ifdef __FE_DENORM
107+
(excepts & __FE_DENORM ? EXCP_DENORMAL_F : 0) |
108+
#endif // __FE_DENORM
109+
(excepts & FE_DIVBYZERO ? EXCP_DIV_BY_ZERO_F : 0) |
110+
(excepts & FE_OVERFLOW ? EXCP_OVERFLOW_F : 0) |
111+
(excepts & FE_UNDERFLOW ? EXCP_UNDERFLOW_F : 0) |
112+
(excepts & FE_INEXACT ? EXCP_INEXACT_F : 0);
113+
}
114+
115+
// Translates a mask of hardware EXCP_*_F bits into the equivalent mask of C
// standard exception macros (FE_INVALID, FE_DIVBYZERO, ...). This is the
// reverse mapping of get_status_value_for_except. The denormal bit is only
// translated when `__FE_DENORM` is defined.
LIBC_INLINE uint32_t get_except_value_for_status(uint32_t status) {
  uint32_t excepts = 0;
  if (status & EXCP_INVALID_F)
    excepts |= FE_INVALID;
#ifdef __FE_DENORM
  if (status & EXCP_DENORMAL_F)
    excepts |= __FE_DENORM;
#endif // __FE_DENORM
  if (status & EXCP_DIV_BY_ZERO_F)
    excepts |= FE_DIVBYZERO;
  if (status & EXCP_OVERFLOW_F)
    excepts |= FE_OVERFLOW;
  if (status & EXCP_UNDERFLOW_F)
    excepts |= FE_UNDERFLOW;
  if (status & EXCP_INEXACT_F)
    excepts |= FE_INEXACT;
  return excepts;
}
125+
126+
// Set the hardware rounding mode using the llvm.set.rounding intrinsic
127+
// function.
128+
// FIXME: This requires `noinline` to flush the hardware register in time.
129+
LIBC_INLINE void set_rounding_mode(uint32_t mode) {
130+
__builtin_set_flt_rounds(mode);
131+
}
132+
133+
// Get the hardware rounding mode using the llvm.get.rounding intrinsic
134+
// function.
135+
// FIXME: This requires `noinline` to flush the hardware register in time.
136+
LIBC_INLINE uint32_t get_rounding_mode() {
137+
return __builtin_flt_rounds();
138+
}
139+
140+
} // namespace internal
141+
142+
// TODO: Not implemented yet.
143+
// Placeholder: the argument is ignored and 0 is always returned.
LIBC_INLINE int clear_except(int) { return 0; }
144+
145+
// TODO: Not implemented yet.
146+
// Placeholder: the argument is ignored and 0 is always returned.
LIBC_INLINE int test_except(int) { return 0; }
147+
148+
// TODO: Not implemented yet.
149+
// Placeholder: always returns 0.
LIBC_INLINE int get_except() { return 0; }
150+
151+
// TODO: Not implemented yet.
152+
// Placeholder: the argument is ignored and 0 is always returned.
LIBC_INLINE int set_except(int) { return 0; }
153+
154+
// TODO: Not implemented yet.
155+
// Placeholder: the argument is ignored and 0 is always returned.
LIBC_INLINE int enable_except(int) { return 0; }
156+
157+
// TODO: Not implemented yet.
158+
// Placeholder: the argument is ignored and 0 is always returned.
LIBC_INLINE int disable_except(int) { return 0; }
159+
160+
// TODO: Not implemented yet.
161+
// Placeholder: the argument is ignored and 0 is always returned.
LIBC_INLINE int raise_except(int) { return 0; }
162+
163+
// Get the currently set rounding mode from the environment. The AMDGPU backend
164+
// supports an extension for separate f64 / f32 rounding control. If the
165+
// provided value is outside of the standard region we handle it without
166+
// modification.
167+
LIBC_INLINE int get_round() {
168+
uint32_t mode = internal::get_rounding_mode();
169+
switch (mode) {
170+
case internal::ROUND_TO_NEAREST:
171+
return FE_TONEAREST;
172+
case internal::ROUND_UPWARD:
173+
return FE_UPWARD;
174+
case internal::ROUND_DOWNWARD:
175+
return FE_DOWNWARD;
176+
case internal::ROUND_TOWARD_ZERO:
177+
return FE_TOWARDZERO;
178+
default:
179+
return mode;
180+
}
181+
__builtin_unreachable();
182+
}
183+
184+
// Set the rounding mode for the environment. If the provided mode is above the
185+
// expected range we assume it is an extended value to control f32 / f64
186+
// separately.
187+
LIBC_INLINE int set_round(int rounding_mode) {
188+
switch (rounding_mode) {
189+
case FE_TONEAREST:
190+
internal::set_rounding_mode(internal::ROUND_TO_NEAREST);
191+
break;
192+
case FE_UPWARD:
193+
internal::set_rounding_mode(internal::ROUND_UPWARD);
194+
break;
195+
case FE_DOWNWARD:
196+
internal::set_rounding_mode(internal::ROUND_DOWNWARD);
197+
break;
198+
case FE_TOWARDZERO:
199+
internal::set_rounding_mode(internal::ROUND_TOWARD_ZERO);
200+
break;
201+
default:
202+
internal::set_rounding_mode(rounding_mode);
203+
break;
204+
}
205+
return 0;
206+
}
207+
208+
// Stores the value returned by internal::get_fpenv() into `env->__fpc`.
// Returns 0 on success, or 1 when `env` is null (nothing is written).
LIBC_INLINE int get_env(fenv_t *env) {
  if (env != nullptr) {
    env->__fpc = internal::get_fpenv();
    return 0;
  }
  return 1;
}
215+
216+
// Passes `env->__fpc` to internal::set_fpenv().
// Returns 0 on success, or 1 when `env` is null (nothing is changed).
LIBC_INLINE int set_env(const fenv_t *env) {
  if (env != nullptr) {
    internal::set_fpenv(env->__fpc);
    return 0;
  }
  return 1;
}
223+
224+
} // namespace fputil
225+
} // namespace LIBC_NAMESPACE
226+
227+
#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H

0 commit comments

Comments
 (0)