[libc] Implement basic 'fenv.h' utilities on the AMD GPU

jhuber6 · jhuber6 · commit de4dc4a56f44 · 2024-04-02T11:57:15.000-05:00
Summary:
This patch implements a basic floating point environment on the AMDGPU.
Users should be able to check rounding modes or certain floating point
exceptions using the standard functions. This patch implements the basic
set, but only exposes the `fegetround` and `fesetround` utilities. This
is because getting the exceptions to work is difficult due to the
behavior with the DX10_CLAMP bit that is always set.

It is worth noting that this utility is not strictly standards
conformant because we can only control this behavior on individual
warps. Whether or not we can say it's truly implemented then is an
exercise to the reader.
diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td
@@ -55,7 +55,7 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
 }
 
 def FenvAPI: PublicAPI<"fenv.h"> {
-  let Types = ["fenv_t"];
+  let Types = ["fenv_t", "fexcept_t"];
 }
 
 def StdIOAPI : PublicAPI<"stdio.h"> {
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
@@ -213,7 +213,19 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.gpu.rpc_host_call
 )
 
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  set(extra_entrypoints
+      # fenv.h entrypoints
+      libc.src.fenv.fegetenv
+      libc.src.fenv.fegetround
+      libc.src.fenv.fesetenv
+      libc.src.fenv.fesetround
+  )
+endif()
+
 set(TARGET_LIBM_ENTRYPOINTS
+    ${extra_entrypoints}
+
     # math.h entrypoints
     libc.src.math.acos
     libc.src.math.acosf
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
@@ -45,6 +45,8 @@
 #define math_errhandling 0
 #elif defined(__NO_MATH_ERRNO__)
 #define math_errhandling (MATH_ERREXCEPT)
+#elif defined(__AMDGPU__)
+#define math_errhandling (MATH_ERREXCEPT)
 #elif defined(__NVPTX__) || defined(__AMDGPU__)
 #define math_errhandling (MATH_ERRNO)
 #else
diff --git a/libc/include/llvm-libc-types/fenv_t.h b/libc/include/llvm-libc-types/fenv_t.h
@@ -27,7 +27,7 @@ typedef struct {
 typedef unsigned int fenv_t;
 #elif defined(__AMDGPU__) || defined(__NVPTX__)
 typedef struct {
-  unsigned int __fpc;
+  unsigned long long __fpc;
 } fenv_t;
 #else
 #error "fenv_t not defined for your platform"
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -31,6 +31,8 @@
 #include "arm/FEnvImpl.h"
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "riscv/FEnvImpl.h"
+#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#include "amdgpu/FEnvImpl.h"
 #else
 
 namespace LIBC_NAMESPACE::fputil {
diff --git a/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
@@ -0,0 +1,213 @@
+//===-- amdgpu floating point env manipulation functions --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+
+#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "Invalid include"
+#endif
+
+#include <fenv.h>
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace fputil {
+
+namespace internal {
+// Returns the current status of the AMDGPU floating point environment. In
+// practice this is simply a 64-bit concatenation of the mode register and the
+// trap status register.
+//
+// The mode register controls the floating point behaviour of the device. It
+// can be read or written to by the kernel during runtime. It is laid out as a
+// bit field with the following offsets and sizes listed for the relevant
+// entries.
+//
+// ┌─────┬─────────────┬─────┬─────────┬──────────┬─────────────┬────────────┐
+// │ ... │ EXCP[20:12] │ ... │ IEEE[9] │ CLAMP[8] │ DENORM[7:4] │ ROUND[3:0] │
+// └─────┴─────────────┴─────┴─────────┴──────────┴─────────────┴────────────┘
+//
+// The rounding mode and denormal modes both control f64/f16 and f32 precision
+// operations separately with two bits. The accepted values for the rounding
+// mode are nearest, upward, downward, and toward zero, encoded as 0, 1, 2,
+// and 3 respectively.
+//
+// The CLAMP bit indicates that DirectX 10 handling of NaNs is enabled in the
+// vector ALU. When set, NaN values are clamped to zero; otherwise they are
+// passed through. On all generations before GFX12, a hardware bug prevents
+// floating point exceptions from being recorded while this bit is set.
+//
+// The IEEE bit controls whether or not floating point operations supporting
+// exception gathering are IEEE 754-2008 compliant.
+//
+// The EXCP field indicates which exceptions will cause the instruction to
+// take a trap if traps are enabled, see the status register. The bit layout
+// is identical to that in the trap status register. We are only concerned
+// with the first six bits and ignore the other three.
+//
+// The trap status register contains information about the status of the
+// exceptions. These bits are accumulated regardless of trap handling status
+// and are sticky until cleared.
+//
+// 5         4           3          2                1          0
+// ┌─────────┬───────────┬──────────┬────────────────┬──────────┬─────────┐
+// │ Inexact │ Underflow │ Overflow │ Divide by zero │ Denormal │ Invalid │
+// └─────────┴───────────┴──────────┴────────────────┴──────────┴─────────┘
+//
+// These exceptions indicate that at least one lane in the current wavefront
+// signalled a floating point exception. There is no way to increase the
+// granularity.
+//
+// The returned value has the following layout.
+//
+// ┌────────────────────┬─────────────────────┐
+// │ Trap Status[38:32] │ Mode Register[31:0] │
+// └────────────────────┴─────────────────────┘
+LIBC_INLINE uint64_t get_fpenv() { return __builtin_amdgcn_get_fpenv(); }
+
+// Set the floating point environment using the same layout as above.
+LIBC_INLINE void set_fpenv(uint64_t env) { __builtin_amdgcn_set_fpenv(env); }
+
+// The six bits used to encode the standard floating point exceptions in the
+// trap status register.
+enum ExceptionFlags : uint32_t {
+  EXCP_INVALID_F = 0x1,
+  EXCP_DENORMAL_F = 0x2,
+  EXCP_DIV_BY_ZERO_F = 0x4,
+  EXCP_OVERFLOW_F = 0x8,
+  EXCP_UNDERFLOW_F = 0x10,
+  EXCP_INEXACT_F = 0x20,
+};
+
+// The two bit encoded rounding modes used in the mode register.
+enum RoundingFlags : uint32_t {
+  ROUND_TO_NEAREST = 0x0,
+  ROUND_UPWARD = 0x1,
+  ROUND_DOWNWARD = 0x2,
+  ROUND_TOWARD_ZERO = 0x3,
+};
+
+// Exception flags are individual bits in the corresponding hardware register.
+// This converts between the exported C standard values and the hardware values.
+LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
+  return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
+#ifdef __FE_DENORM
+         (excepts & __FE_DENORM ? EXCP_DENORMAL_F : 0) |
+#endif // __FE_DENORM
+         (excepts & FE_DIVBYZERO ? EXCP_DIV_BY_ZERO_F : 0) |
+         (excepts & FE_OVERFLOW ? EXCP_OVERFLOW_F : 0) |
+         (excepts & FE_UNDERFLOW ? EXCP_UNDERFLOW_F : 0) |
+         (excepts & FE_INEXACT ? EXCP_INEXACT_F : 0);
+}
+
+LIBC_INLINE uint32_t get_except_value_for_status(uint32_t status) {
+  return (status & EXCP_INVALID_F ? FE_INVALID : 0) |
+#ifdef __FE_DENORM
+         (status & EXCP_DENORMAL_F ? __FE_DENORM : 0) |
+#endif // __FE_DENORM
+         (status & EXCP_DIV_BY_ZERO_F ? FE_DIVBYZERO : 0) |
+         (status & EXCP_OVERFLOW_F ? FE_OVERFLOW : 0) |
+         (status & EXCP_UNDERFLOW_F ? FE_UNDERFLOW : 0) |
+         (status & EXCP_INEXACT_F ? FE_INEXACT : 0);
+}
+
+// Access the four bits in the mode register's ROUND[3:0] field. The hardware
+// supports setting the f64/f16 and f32 precision rounding modes separately but
+// we will assume that these always match.
+LIBC_INLINE void set_rounding_mode(uint32_t flags) {
+  uint64_t old = get_fpenv() & 0xfffffffffffffff0;
+  set_fpenv(old | flags << 2 | flags);
+}
+
+// The control register can modify f32/f64 rounding modes individually. For our
+// purposes we assume that these always match as we do not expose this through
+// the C interface.
+LIBC_INLINE uint32_t get_rounding_mode() { return get_fpenv() & 0x3; }
+
+} // namespace internal
+
+// TODO: Not implemented yet.
+LIBC_INLINE int clear_except(int) { return 0; }
+
+// TODO: Not implemented yet.
+LIBC_INLINE int test_except(int) { return 0; }
+
+// TODO: Not implemented yet.
+LIBC_INLINE int get_except() { return 0; }
+
+// TODO: Not implemented yet.
+LIBC_INLINE int set_except(int) { return 0; }
+
+// TODO: Not implemented yet.
+LIBC_INLINE int enable_except(int) { return 0; }
+
+// TODO: Not implemented yet.
+LIBC_INLINE int disable_except(int) { return 0; }
+
+// TODO: Not implemented yet.
+LIBC_INLINE int raise_except(int) { return 0; }
+
+LIBC_INLINE int get_round() {
+  switch (internal::get_rounding_mode()) {
+  case internal::ROUND_TO_NEAREST:
+    return FE_TONEAREST;
+  case internal::ROUND_UPWARD:
+    return FE_UPWARD;
+  case internal::ROUND_DOWNWARD:
+    return FE_DOWNWARD;
+  case internal::ROUND_TOWARD_ZERO:
+    return FE_TOWARDZERO;
+  }
+  __builtin_unreachable();
+}
+
+LIBC_INLINE int set_round(int rounding_mode) {
+  switch (rounding_mode) {
+  case FE_TONEAREST:
+    internal::set_rounding_mode(internal::ROUND_TO_NEAREST);
+    break;
+  case FE_UPWARD:
+    internal::set_rounding_mode(internal::ROUND_UPWARD);
+    break;
+  case FE_DOWNWARD:
+    internal::set_rounding_mode(internal::ROUND_DOWNWARD);
+    break;
+  case FE_TOWARDZERO:
+    internal::set_rounding_mode(internal::ROUND_TOWARD_ZERO);
+    break;
+  default:
+    return 1;
+  }
+  return 0;
+}
+
+LIBC_INLINE int get_env(fenv_t *env) {
+  if (!env)
+    return 1;
+
+  env->__fpc = internal::get_fpenv();
+  return 0;
+}
+
+LIBC_INLINE int set_env(const fenv_t *env) {
+  if (!env)
+    return 1;
+
+  internal::set_fpenv(env->__fpc);
+  return 0;
+}
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt
@@ -1,20 +1,20 @@
-add_custom_target(libc_fenv_unittests)
+add_custom_target(libc_fenv_tests)
 
-add_libc_unittest(
+add_libc_test(
   rounding_mode_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     rounding_mode_test.cpp
   DEPENDS
     libc.src.fenv.fegetround
     libc.src.fenv.fesetround
 )
 
-add_libc_unittest(
+add_libc_test(
   exception_status_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     exception_status_test.cpp
   DEPENDS
@@ -24,24 +24,27 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   getenv_and_setenv_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     getenv_and_setenv_test.cpp
   DEPENDS
     libc.src.fenv.fegetenv
     libc.src.fenv.fegetround
     libc.src.fenv.fesetenv
     libc.src.fenv.fesetround
+    libc.src.fenv.feclearexcept
+    libc.src.fenv.feraiseexcept
+    libc.src.fenv.fetestexcept
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   exception_flags_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     exception_flags_test.cpp
   DEPENDS
@@ -50,10 +53,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feupdateenv_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feupdateenv_test.cpp
   DEPENDS
@@ -62,21 +65,21 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feclearexcept_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feclearexcept_test.cpp
   DEPENDS
     libc.src.fenv.feclearexcept
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feenableexcept_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feenableexcept_test.cpp
   DEPENDS
@@ -96,7 +99,7 @@ if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows")
     enabled_exceptions_test
     UNIT_TEST_ONLY
     SUITE
-      libc_fenv_unittests
+      libc_fenv_tests
     SRCS
       enabled_exceptions_test.cpp
     DEPENDS
@@ -113,7 +116,7 @@ if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows")
     feholdexcept_test
     UNIT_TEST_ONLY
     SUITE
-      libc_fenv_unittests
+      libc_fenv_tests
     SRCS
       feholdexcept_test.cpp
     DEPENDS

Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,7 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {`
`55`	`55`	`}`
`56`	`56`
`57`	`57`	`def FenvAPI: PublicAPI<"fenv.h"> {`
`58`		`- let Types = ["fenv_t"];`
	`58`	`+ let Types = ["fenv_t", "fexcept_t"];`
`59`	`59`	`}`
`60`	`60`
`61`	`61`	`def StdIOAPI : PublicAPI<"stdio.h"> {`