[libc] Implement basic 'fenv.h' utilities on the AMD GPU #83500

jhuber6 · 2024-02-29T23:06:57Z

Summary:
This patch implements a basic floating point environment on the AMDGPU.
Users should be able to check rounding modes or certain floating point
exceptions using the standard functions. This patch implements the basic
set, but only exposes the fegetround and fesetround utilities. This
is because getting the exceptions to work is difficult due to the
behavior with the DX10_CLAMP bit that is always set.

It is worth noting that this utility is not strictly standards
conformant because we can only control this behavior on individual
warps. Whether or not we can say it's truly implemented then is an
exercise to the reader.

llvmbot · 2024-02-29T23:07:32Z

@llvm/pr-subscribers-backend-amdgpu

Author: Joseph Huber (jhuber6)

Changes

Summary:
This patch implements a basic floating point environment on the AMDGPU.
Users should be able to check rounding modes or certain floating point
exceptions using the standard functions. This patch implements the basic
set, but only exposes the fegetround and fesetround utilities. This
is because getting the exceptions to work is difficult due to the
behavior with the DX10_CLAMP bit that is always set.

It is worth noting that this utility is not strictly standards
conformant because we can only control this behavior on individual
warps. Whether or not we can say it's truly implemented then is an
exercise to the reader.

Full diff: https://github.com/llvm/llvm-project/pull/83500.diff

6 Files Affected:

(modified) libc/config/gpu/api.td (+1-1)
(modified) libc/config/gpu/entrypoints.txt (+12)
(modified) libc/include/llvm-libc-macros/math-macros.h (+2)
(modified) libc/src/__support/FPUtil/FEnvImpl.h (+2)
(added) libc/src/__support/FPUtil/amdgpu/FEnvImpl.h (+301)
(modified) libc/test/src/fenv/CMakeLists.txt (+17-17)

diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td
index dbd212be56a3f1..26886d32f394fb 100644
--- a/libc/config/gpu/api.td
+++ b/libc/config/gpu/api.td
@@ -54,7 +54,7 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
 }
 
 def FenvAPI: PublicAPI<"fenv.h"> {
-  let Types = ["fenv_t"];
+  let Types = ["fenv_t", "fexcept_t"];
 }
 
 def StdIOAPI : PublicAPI<"stdio.h"> {
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index fca5315fc4f0a8..8f181b85cab5b0 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -183,7 +183,19 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.gpu.rpc_host_call
 )
 
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  set(extra_entrypoints
+      # fenv.h entrypoints
+      libc.src.fenv.fegetenv
+      libc.src.fenv.fegetround
+      libc.src.fenv.fesetenv
+      libc.src.fenv.fesetround
+  )
+endif()
+
 set(TARGET_LIBM_ENTRYPOINTS
+    ${extra_entrypoints}
+
     # math.h entrypoints
     libc.src.math.acos
     libc.src.math.acosf
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index e67fe4d11b4493..4336b66617f77d 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -32,6 +32,8 @@
 #define math_errhandling 0
 #elif defined(__NO_MATH_ERRNO__)
 #define math_errhandling (MATH_ERREXCEPT)
+#elif defined(__AMDGPU__)
+#define math_errhandling (MATH_ERREXCEPT)
 #elif defined(__NVPTX__) || defined(__AMDGPU__)
 #define math_errhandling (MATH_ERRNO)
 #else
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 6810659733de2c..78c48f25009d41 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -33,6 +33,8 @@
 #include "arm/FEnvImpl.h"
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "riscv/FEnvImpl.h"
+#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#include "amdgpu/FEnvImpl.h"
 #else
 
 namespace LIBC_NAMESPACE::fputil {
diff --git a/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
new file mode 100644
index 00000000000000..a7551dc66d0541
--- /dev/null
+++ b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
@@ -0,0 +1,301 @@
+//===-- amdgpu floating point env manipulation functions --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+
+#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "Invalid include"
+#endif
+
+#include <fenv.h>
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace fputil {
+
+namespace internal {
+
+// Gets the immediate argument to access the AMDGPU hardware register. The
+// register access is encoded in a 16-bit immediate value according to the
+// following layout, where the SIZE field stores the bit count minus one.
+//
+// ┌──────────────┬──────────────┬───────────────┐
+// │  SIZE[15:11] │ OFFSET[10:6] │    ID[5:0]    │
+// └──────────────┴──────────────┴───────────────┘
+//
+// The access covers 'size' bits (1 to 32) starting at bit 'offset' of the
+// hardware register 'id'.
+constexpr uint16_t get_register(uint8_t id, uint8_t offset, uint8_t size) {
+  return static_cast<uint16_t>(((size - 1) & 0x1f) << 11 | offset << 6 | id);
+}
+
+// Integral identifiers for the relevant hardware registers.
+enum Register : uint16_t {
+  // The mode register controls the floating point behaviour of the device. It
+  // can be read or written to by the kernel at runtime. It is laid out as a
+  // bit field with the following offsets and sizes listed for the relevant
+  // entries.
+  //
+  // ┌─────┬─────────────┬─────┬─────────┬──────────┬─────────────┬────────────┐
+  // │ ... │ EXCP[20:12] │ ... │ IEEE[9] │ CLAMP[8] │ DENORM[7:4] │ ROUND[3:0] │
+  // └─────┴─────────────┴─────┴─────────┴──────────┴─────────────┴────────────┘
+  //
+  // The rounding mode and denormal modes both control f64/f16 and f32 precision
+  // operations separately with two bits. The accepted values for the rounding
+  // mode are nearest, upward, downward, and toward zero given 0, 1, 2, and 3
+  // respectively.
+  //
+  // The CLAMP bit indicates that DirectX 10 handling of NaNs is enabled in the
+  // vector ALU. When set this will clamp NaN values to zero and pass them
+  // otherwise. A hardware bug causes this bit to prevent floating exceptions
+  // from being recorded if this bit is set on all generations before GFX12.
+  //
+  // The IEEE bit controls whether or not floating point operations supporting
+  // exception gathering are IEEE 754-2008 compliant.
+  //
+  // The EXCP field indicates which exceptions will cause the instruction to
+  // take a trap if traps are enabled, see the status register. The bit layout
+  // is identical to that in the trap status register. We are only concerned
+  // with the first six bits and ignore the other three.
+  HW_REG_MODE = 1,
+  HW_REG_MODE_ROUND = get_register(HW_REG_MODE, 0, 4),
+  HW_REG_MODE_CLAMP = get_register(HW_REG_MODE, 8, 1),
+  HW_REG_MODE_EXCP = get_register(HW_REG_MODE, 12, 6),
+
+  // The status register is a read-only register that contains information about
+  // how the kernel was launched. The sixth bit TRAP_EN[6] indicates whether or
+  // not traps are enabled for this kernel. If this bit is set along with the
+  // corresponding bit in the mode register then a trap will be taken.
+  HW_REG_STATUS = 2,
+  HW_REG_STATUS_TRAP_EN = get_register(HW_REG_STATUS, 6, 1),
+
+  // The trap status register contains information about the status of the
+  // exceptions. These bits are accumulated regardless of trap handling status
+  // and are sticky until cleared.
+  //
+  // 5         4           3          2                1          0
+  // ┌─────────┬───────────┬──────────┬────────────────┬──────────┬─────────┐
+  // │ Inexact │ Underflow │ Overflow │ Divide by zero │ Denormal │ Invalid │
+  // └─────────┴───────────┴──────────┴────────────────┴──────────┴─────────┘
+  //
+  // These exceptions indicate that at least one lane in the current wavefront
+  // signalled a floating point exception. There is no way to increase the
+  // granularity.
+  HW_REG_TRAPSTS = 3,
+  HW_REG_TRAPSTS_EXCP = get_register(HW_REG_TRAPSTS, 0, 6),
+};
+
+// The six bits used to encode the standard floating point exceptions in the
+// trap status register.
+enum ExceptionFlags : uint32_t {
+  EXCP_INVALID_F = 0x1,
+  EXCP_DENORMAL_F = 0x2,
+  EXCP_DIV_BY_ZERO_F = 0x4,
+  EXCP_OVERFLOW_F = 0x8,
+  EXCP_UNDERFLOW_F = 0x10,
+  EXCP_INEXACT_F = 0x20,
+};
+
+// The two bit encoded rounding modes used in the mode register.
+enum RoundingFlags : uint32_t {
+  ROUND_TO_NEAREST = 0x0,
+  ROUND_UPWARD = 0x1,
+  ROUND_DOWNWARD = 0x2,
+  ROUND_TOWARD_ZERO = 0x3,
+};
+
+// Exception flags are individual bits in the corresponding hardware register.
+// This converts between the exported C standard values and the hardware values.
+LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
+  return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
+#ifdef __FE_DENORM
+         (excepts & __FE_DENORM ? EXCP_DENORMAL_F : 0) |
+#endif // __FE_DENORM
+         (excepts & FE_DIVBYZERO ? EXCP_DIV_BY_ZERO_F : 0) |
+         (excepts & FE_OVERFLOW ? EXCP_OVERFLOW_F : 0) |
+         (excepts & FE_UNDERFLOW ? EXCP_UNDERFLOW_F : 0) |
+         (excepts & FE_INEXACT ? EXCP_INEXACT_F : 0);
+}
+
+LIBC_INLINE uint32_t get_except_value_for_status(uint32_t status) {
+  return (status & EXCP_INVALID_F ? FE_INVALID : 0) |
+#ifdef __FE_DENORM
+         (status & EXCP_DENORMAL_F ? __FE_DENORM : 0) |
+#endif // __FE_DENORM
+         (status & EXCP_DIV_BY_ZERO_F ? FE_DIVBYZERO : 0) |
+         (status & EXCP_OVERFLOW_F ? FE_OVERFLOW : 0) |
+         (status & EXCP_UNDERFLOW_F ? FE_UNDERFLOW : 0) |
+         (status & EXCP_INEXACT_F ? FE_INEXACT : 0);
+}
+
+// FIXME: These require the 'noinline' attribute to pessimistically flush the
+//        state. Otherwise, reading from the register may return stale results.
+
+// Access the six bits in the trap status register for the floating point
+// exceptions.
+[[gnu::noinline]] LIBC_INLINE void set_trap_status(uint32_t status) {
+  uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), status);
+  __builtin_amdgcn_s_setreg(HW_REG_TRAPSTS_EXCP, val);
+}
+
+[[gnu::noinline]] LIBC_INLINE uint32_t get_trap_status() {
+  return __builtin_amdgcn_s_getreg(HW_REG_TRAPSTS_EXCP);
+}
+
+// Access the six bits in the mode register that control which exceptions will
+// result in a trap being taken. Uses the same flags as the status register.
+[[gnu::noinline]] LIBC_INLINE void set_enabled_trap(uint32_t flags) {
+  uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), flags);
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_EXCP, val);
+}
+
+[[gnu::noinline]] LIBC_INLINE uint32_t get_enabled_trap() {
+  return __builtin_amdgcn_s_getreg(HW_REG_MODE_EXCP);
+}
+
+// Access the four bits in the mode register's ROUND[3:0] field. The hardware
+// supports setting the f64/f16 and f32 precision rounding modes separately but
+// we will assume that these always match.
+[[gnu::noinline]] LIBC_INLINE void set_rounding_mode(uint32_t flags) {
+  uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), flags);
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_ROUND, val << 2 | val);
+}
+
+[[gnu::noinline]] LIBC_INLINE uint32_t get_rounding_mode() {
+  return __builtin_amdgcn_s_getreg(HW_REG_MODE_ROUND) & 0x3;
+}
+
+// NOTE: On architectures before GFX12 the DX10_CLAMP bit suppresses all
+//       floating point exceptions. In order to get them to be presented we
+//       need to manually turn it off.
+[[gnu::noinline]] LIBC_INLINE void set_clamp_low() {
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, 0);
+}
+
+[[gnu::noinline]] LIBC_INLINE void set_clamp_high() {
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, 1);
+}
+
+} // namespace internal
+
+// Clears the sticky flags selected by 'excepts' in the trap status register
+// and leaves every other flag as it was. Always returns zero.
+LIBC_INLINE int clear_except(int excepts) {
+  const uint32_t to_clear = internal::get_status_value_for_except(excepts);
+  const uint32_t raised = internal::get_trap_status();
+  // Only the low six bits carry exception flags.
+  internal::set_trap_status(raised & (~to_clear & 0x3f));
+  return 0;
+}
+
+// Reports which of the exceptions selected by 'excepts' are currently raised
+// in the trap status register. The result uses the FE_* macro encoding.
+LIBC_INLINE int test_except(int excepts) {
+  uint32_t active = internal::get_trap_status();
+  // Mask in the FE_* domain: the converted register value must not be combined
+  // with hardware encoded bits, whose layout is not guaranteed to match the
+  // FE_* values.
+  return internal::get_except_value_for_status(active) & excepts;
+}
+
+// Returns the exceptions that currently have trapping enabled, expressed with
+// the FE_* macros rather than the raw hardware bits.
+LIBC_INLINE int get_except() {
+  return internal::get_except_value_for_status(internal::get_enabled_trap());
+}
+
+// Marks the exceptions in 'excepts' as raised without disturbing the flags
+// that are already set. Always returns zero.
+LIBC_INLINE int set_except(int excepts) {
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  // Merge with the current sticky bits; writing 'status' alone would clear
+  // every flag that is not part of 'excepts'.
+  internal::set_trap_status(internal::get_trap_status() | status);
+  return 0;
+}
+
+// Enables trapping for the exceptions in 'excepts' in addition to the ones
+// that are already enabled. Returns the previously enabled set using the FE_*
+// macros.
+LIBC_INLINE int enable_except(int excepts) {
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  // The previous state is the enabled-trap field of the mode register, not the
+  // sticky flags in the trap status register.
+  uint32_t active = internal::get_enabled_trap();
+  internal::set_enabled_trap(active | status);
+  return internal::get_except_value_for_status(active);
+}
+
+// Disables trapping for the exceptions in 'excepts' and returns the set that
+// was enabled before the call, expressed with the FE_* macros.
+LIBC_INLINE int disable_except(int excepts) {
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  uint32_t invert = ~status & 0x3f;
+  uint32_t active = internal::get_enabled_trap();
+  internal::set_enabled_trap(active & invert);
+  // Convert back to the FE_* encoding instead of leaking the hardware bits.
+  return internal::get_except_value_for_status(active);
+}
+
+// Raises the exceptions in 'excepts' by enabling trapping for them and
+// recording them in the trap status register. Always returns zero.
+LIBC_INLINE int raise_except(int excepts) {
+  // enable_except() converts from the FE_* macros itself, so it must receive
+  // the caller's value rather than the already converted hardware bits.
+  enable_except(excepts);
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  // Keep flags that were raised earlier; they are sticky until cleared.
+  internal::set_trap_status(internal::get_trap_status() | status);
+  return 0;
+}
+
+LIBC_INLINE int get_round() {
+  switch (internal::get_rounding_mode()) {
+  case internal::ROUND_TO_NEAREST:
+    return FE_TONEAREST;
+  case internal::ROUND_UPWARD:
+    return FE_UPWARD;
+  case internal::ROUND_DOWNWARD:
+    return FE_DOWNWARD;
+  case internal::ROUND_TOWARD_ZERO:
+    return FE_TOWARDZERO;
+  }
+  __builtin_unreachable();
+}
+
+LIBC_INLINE int set_round(int rounding_mode) {
+  switch (rounding_mode) {
+  case FE_TONEAREST:
+    internal::set_rounding_mode(internal::ROUND_TO_NEAREST);
+    break;
+  case FE_UPWARD:
+    internal::set_rounding_mode(internal::ROUND_UPWARD);
+    break;
+  case FE_DOWNWARD:
+    internal::set_rounding_mode(internal::ROUND_DOWNWARD);
+    break;
+  case FE_TOWARDZERO:
+    internal::set_rounding_mode(internal::ROUND_TOWARD_ZERO);
+    break;
+  default:
+    return 1;
+  }
+  return 0;
+}
+
+// The fenv_t struct for the AMD GPU is simply a 32-bit integer field of the
+// current state. We combine the four bits for the rounding mode with the six
+// bits for the exception state and the six bits for the enabled exceptions.
+//
+// ┌────────────────────────────┬─────────────────┬─────────────┬─────────────┐
+// │       UNUSED[31:16]        │ ENABLED[15:10]  │ STATUS[9:4] │  ROUND[3:0] │
+// └────────────────────────────┴─────────────────┴─────────────┴─────────────┘
+//
+// The top sixteen bits are currently unused and should be zero.
+LIBC_INLINE int get_env(fenv_t *env) {
+  if (!env)
+    return 1;
+
+  uint32_t rounding = internal::get_rounding_mode();
+  uint32_t status = internal::get_trap_status();
+  uint32_t enabled = internal::get_enabled_trap();
+  env->__fpc = enabled << 10 | status << 4 | rounding;
+  return 0;
+}
+
+LIBC_INLINE int set_env(const fenv_t *env) {
+  if (!env)
+    return 1;
+
+  internal::set_rounding_mode(env->__fpc & 0xf);
+  internal::set_trap_status((env->__fpc >> 4) & 0x3f);
+  internal::set_enabled_trap((env->__fpc >> 10) & 0x3f);
+  return 0;
+}
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt
index ba338bb6c73189..fe6a8135008723 100644
--- a/libc/test/src/fenv/CMakeLists.txt
+++ b/libc/test/src/fenv/CMakeLists.txt
@@ -1,9 +1,9 @@
-add_custom_target(libc_fenv_unittests)
+add_custom_target(libc_fenv_tests)
 
-add_libc_unittest(
+add_libc_test(
   rounding_mode_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     rounding_mode_test.cpp
   DEPENDS
@@ -11,10 +11,10 @@ add_libc_unittest(
     libc.src.fenv.fesetround
 )
 
-add_libc_unittest(
+add_libc_test(
   exception_status_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     exception_status_test.cpp
   DEPENDS
@@ -24,10 +24,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   getenv_and_setenv_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     getenv_and_setenv_test.cpp
   DEPENDS
@@ -38,10 +38,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   exception_flags_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     exception_flags_test.cpp
   DEPENDS
@@ -50,10 +50,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feupdateenv_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feupdateenv_test.cpp
   DEPENDS
@@ -62,10 +62,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feclearexcept_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feclearexcept_test.cpp
   DEPENDS
@@ -73,10 +73,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feenableexcept_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feenableexcept_test.cpp
   DEPENDS
@@ -96,7 +96,7 @@ if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows")
     enabled_exceptions_test
     UNIT_TEST_ONLY
     SUITE
-      libc_fenv_unittests
+      libc_fenv_tests
     SRCS
       enabled_exceptions_test.cpp
     DEPENDS
@@ -113,7 +113,7 @@ if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows")
     feholdexcept_test
     UNIT_TEST_ONLY
     SUITE
-      libc_fenv_unittests
+      libc_fenv_tests
     SRCS
       feholdexcept_test.cpp
     DEPENDS

llvmbot · 2024-02-29T23:07:33Z

@llvm/pr-subscribers-libc

Author: Joseph Huber (jhuber6)

Changes

Summary:
This patch implements a basic floating point environment on the AMDGPU.
Users should be able to check rounding modes or certain floating point
exceptions using the standard functions. This patch implements the basic
set, but only exposes the fegetround and fesetround utilities. This
is because getting the exceptions to work is difficult due to the
behavior with the DX10_CLAMP bit that is always set.

It is worth noting that this utility is not strictly standards
conformant because we can only control this behavior on individual
warps. Whether or not we can say it's truly implemented then is an
exercise to the reader.

Full diff: https://github.com/llvm/llvm-project/pull/83500.diff

6 Files Affected:

(modified) libc/config/gpu/api.td (+1-1)
(modified) libc/config/gpu/entrypoints.txt (+12)
(modified) libc/include/llvm-libc-macros/math-macros.h (+2)
(modified) libc/src/__support/FPUtil/FEnvImpl.h (+2)
(added) libc/src/__support/FPUtil/amdgpu/FEnvImpl.h (+301)
(modified) libc/test/src/fenv/CMakeLists.txt (+17-17)

diff --git a/libc/config/gpu/api.td b/libc/config/gpu/api.td
index dbd212be56a3f1..26886d32f394fb 100644
--- a/libc/config/gpu/api.td
+++ b/libc/config/gpu/api.td
@@ -54,7 +54,7 @@ def StdlibAPI : PublicAPI<"stdlib.h"> {
 }
 
 def FenvAPI: PublicAPI<"fenv.h"> {
-  let Types = ["fenv_t"];
+  let Types = ["fenv_t", "fexcept_t"];
 }
 
 def StdIOAPI : PublicAPI<"stdio.h"> {
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index fca5315fc4f0a8..8f181b85cab5b0 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -183,7 +183,19 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.gpu.rpc_host_call
 )
 
+if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
+  set(extra_entrypoints
+      # fenv.h entrypoints
+      libc.src.fenv.fegetenv
+      libc.src.fenv.fegetround
+      libc.src.fenv.fesetenv
+      libc.src.fenv.fesetround
+  )
+endif()
+
 set(TARGET_LIBM_ENTRYPOINTS
+    ${extra_entrypoints}
+
     # math.h entrypoints
     libc.src.math.acos
     libc.src.math.acosf
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index e67fe4d11b4493..4336b66617f77d 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -32,6 +32,8 @@
 #define math_errhandling 0
 #elif defined(__NO_MATH_ERRNO__)
 #define math_errhandling (MATH_ERREXCEPT)
+#elif defined(__AMDGPU__)
+#define math_errhandling (MATH_ERREXCEPT)
 #elif defined(__NVPTX__) || defined(__AMDGPU__)
 #define math_errhandling (MATH_ERRNO)
 #else
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 6810659733de2c..78c48f25009d41 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -33,6 +33,8 @@
 #include "arm/FEnvImpl.h"
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "riscv/FEnvImpl.h"
+#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#include "amdgpu/FEnvImpl.h"
 #else
 
 namespace LIBC_NAMESPACE::fputil {
diff --git a/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
new file mode 100644
index 00000000000000..a7551dc66d0541
--- /dev/null
+++ b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
@@ -0,0 +1,301 @@
+//===-- amdgpu floating point env manipulation functions --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
+
+#include "src/__support/GPU/utils.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+
+#if !defined(LIBC_TARGET_ARCH_IS_AMDGPU)
+#error "Invalid include"
+#endif
+
+#include <fenv.h>
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace fputil {
+
+namespace internal {
+
+// Gets the immediate argument to access the AMDGPU hardware register. The
+// register access is encoded in a 16-bit immediate value according to the
+// following layout, where the SIZE field stores the bit count minus one.
+//
+// ┌──────────────┬──────────────┬───────────────┐
+// │  SIZE[15:11] │ OFFSET[10:6] │    ID[5:0]    │
+// └──────────────┴──────────────┴───────────────┘
+//
+// The access covers 'size' bits (1 to 32) starting at bit 'offset' of the
+// hardware register 'id'.
+constexpr uint16_t get_register(uint8_t id, uint8_t offset, uint8_t size) {
+  return static_cast<uint16_t>(((size - 1) & 0x1f) << 11 | offset << 6 | id);
+}
+
+// Integral identifiers for the relevant hardware registers.
+enum Register : uint16_t {
+  // The mode register controls the floating point behaviour of the device. It
+  // can be read or written to by the kernel at runtime. It is laid out as a
+  // bit field with the following offsets and sizes listed for the relevant
+  // entries.
+  //
+  // ┌─────┬─────────────┬─────┬─────────┬──────────┬─────────────┬────────────┐
+  // │ ... │ EXCP[20:12] │ ... │ IEEE[9] │ CLAMP[8] │ DENORM[7:4] │ ROUND[3:0] │
+  // └─────┴─────────────┴─────┴─────────┴──────────┴─────────────┴────────────┘
+  //
+  // The rounding mode and denormal modes both control f64/f16 and f32 precision
+  // operations separately with two bits. The accepted values for the rounding
+  // mode are nearest, upward, downward, and toward zero given 0, 1, 2, and 3
+  // respectively.
+  //
+  // The CLAMP bit indicates that DirectX 10 handling of NaNs is enabled in the
+  // vector ALU. When set this will clamp NaN values to zero and pass them
+  // otherwise. A hardware bug causes this bit to prevent floating exceptions
+  // from being recorded if this bit is set on all generations before GFX12.
+  //
+  // The IEEE bit controls whether or not floating point operations supporting
+  // exception gathering are IEEE 754-2008 compliant.
+  //
+  // The EXCP field indicates which exceptions will cause the instruction to
+  // take a trap if traps are enabled, see the status register. The bit layout
+  // is identical to that in the trap status register. We are only concerned
+  // with the first six bits and ignore the other three.
+  HW_REG_MODE = 1,
+  HW_REG_MODE_ROUND = get_register(HW_REG_MODE, 0, 4),
+  HW_REG_MODE_CLAMP = get_register(HW_REG_MODE, 8, 1),
+  HW_REG_MODE_EXCP = get_register(HW_REG_MODE, 12, 6),
+
+  // The status register is a read-only register that contains information about
+  // how the kernel was launched. The sixth bit TRAP_EN[6] indicates whether or
+  // not traps are enabled for this kernel. If this bit is set along with the
+  // corresponding bit in the mode register then a trap will be taken.
+  HW_REG_STATUS = 2,
+  HW_REG_STATUS_TRAP_EN = get_register(HW_REG_STATUS, 6, 1),
+
+  // The trap status register contains information about the status of the
+  // exceptions. These bits are accumulated regardless of trap handling status
+  // and are sticky until cleared.
+  //
+  // 5         4           3          2                1          0
+  // ┌─────────┬───────────┬──────────┬────────────────┬──────────┬─────────┐
+  // │ Inexact │ Underflow │ Overflow │ Divide by zero │ Denormal │ Invalid │
+  // └─────────┴───────────┴──────────┴────────────────┴──────────┴─────────┘
+  //
+  // These exceptions indicate that at least one lane in the current wavefront
+  // signalled a floating point exception. There is no way to increase the
+  // granularity.
+  HW_REG_TRAPSTS = 3,
+  HW_REG_TRAPSTS_EXCP = get_register(HW_REG_TRAPSTS, 0, 6),
+};
+
+// The six bits used to encode the standard floating point exceptions in the
+// trap status register.
+enum ExceptionFlags : uint32_t {
+  EXCP_INVALID_F = 0x1,
+  EXCP_DENORMAL_F = 0x2,
+  EXCP_DIV_BY_ZERO_F = 0x4,
+  EXCP_OVERFLOW_F = 0x8,
+  EXCP_UNDERFLOW_F = 0x10,
+  EXCP_INEXACT_F = 0x20,
+};
+
+// The two bit encoded rounding modes used in the mode register.
+enum RoundingFlags : uint32_t {
+  ROUND_TO_NEAREST = 0x0,
+  ROUND_UPWARD = 0x1,
+  ROUND_DOWNWARD = 0x2,
+  ROUND_TOWARD_ZERO = 0x3,
+};
+
+// Exception flags are individual bits in the corresponding hardware register.
+// This converts between the exported C standard values and the hardware values.
+LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
+  return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
+#ifdef __FE_DENORM
+         (excepts & __FE_DENORM ? EXCP_DENORMAL_F : 0) |
+#endif // __FE_DENORM
+         (excepts & FE_DIVBYZERO ? EXCP_DIV_BY_ZERO_F : 0) |
+         (excepts & FE_OVERFLOW ? EXCP_OVERFLOW_F : 0) |
+         (excepts & FE_UNDERFLOW ? EXCP_UNDERFLOW_F : 0) |
+         (excepts & FE_INEXACT ? EXCP_INEXACT_F : 0);
+}
+
+LIBC_INLINE uint32_t get_except_value_for_status(uint32_t status) {
+  return (status & EXCP_INVALID_F ? FE_INVALID : 0) |
+#ifdef __FE_DENORM
+         (status & EXCP_DENORMAL_F ? __FE_DENORM : 0) |
+#endif // __FE_DENORM
+         (status & EXCP_DIV_BY_ZERO_F ? FE_DIVBYZERO : 0) |
+         (status & EXCP_OVERFLOW_F ? FE_OVERFLOW : 0) |
+         (status & EXCP_UNDERFLOW_F ? FE_UNDERFLOW : 0) |
+         (status & EXCP_INEXACT_F ? FE_INEXACT : 0);
+}
+
+// FIXME: These require the 'noinline' attribute to pessimistically flush the
+//        state. Otherwise, reading from the register may return stale results.
+
+// Access the six bits in the trap status register for the floating point
+// exceptions.
+[[gnu::noinline]] LIBC_INLINE void set_trap_status(uint32_t status) {
+  uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), status);
+  __builtin_amdgcn_s_setreg(HW_REG_TRAPSTS_EXCP, val);
+}
+
+[[gnu::noinline]] LIBC_INLINE uint32_t get_trap_status() {
+  return __builtin_amdgcn_s_getreg(HW_REG_TRAPSTS_EXCP);
+}
+
+// Access the six bits in the mode register that control which exceptions will
+// result in a trap being taken. Uses the same flags as the status register.
+[[gnu::noinline]] LIBC_INLINE void set_enabled_trap(uint32_t flags) {
+  uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), flags);
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_EXCP, val);
+}
+
+[[gnu::noinline]] LIBC_INLINE uint32_t get_enabled_trap() {
+  return __builtin_amdgcn_s_getreg(HW_REG_MODE_EXCP);
+}
+
+// Access the four bits in the mode register's ROUND[3:0] field. The hardware
+// supports setting the f64/f16 and f32 precision rounding modes separately but
+// we will assume that these always match.
+[[gnu::noinline]] LIBC_INLINE void set_rounding_mode(uint32_t flags) {
+  uint32_t val = gpu::broadcast_value(gpu::get_lane_mask(), flags);
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_ROUND, val << 2 | val);
+}
+
+[[gnu::noinline]] LIBC_INLINE uint32_t get_rounding_mode() {
+  return __builtin_amdgcn_s_getreg(HW_REG_MODE_ROUND) & 0x3;
+}
+
+// NOTE: On architectures before GFX12 the DX10_CLAMP bit suppresses all
+//       floating point exceptions. In order to get them to be presented we
+//       need to manually turn it off.
+[[gnu::noinline]] LIBC_INLINE void set_clamp_low() {
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, 0);
+}
+
+[[gnu::noinline]] LIBC_INLINE void set_clamp_high() {
+  __builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, 1);
+}
+
+} // namespace internal
+
+// Clears the sticky flags selected by 'excepts' in the trap status register
+// and leaves every other flag as it was. Always returns zero.
+LIBC_INLINE int clear_except(int excepts) {
+  const uint32_t to_clear = internal::get_status_value_for_except(excepts);
+  const uint32_t raised = internal::get_trap_status();
+  // Only the low six bits carry exception flags.
+  internal::set_trap_status(raised & (~to_clear & 0x3f));
+  return 0;
+}
+
+// Reports which of the exceptions selected by 'excepts' are currently raised
+// in the trap status register. The result uses the FE_* macro encoding.
+LIBC_INLINE int test_except(int excepts) {
+  uint32_t active = internal::get_trap_status();
+  // Mask in the FE_* domain: the converted register value must not be combined
+  // with hardware encoded bits, whose layout is not guaranteed to match the
+  // FE_* values.
+  return internal::get_except_value_for_status(active) & excepts;
+}
+
+// Returns the exceptions that currently have trapping enabled, expressed with
+// the FE_* macros rather than the raw hardware bits.
+LIBC_INLINE int get_except() {
+  return internal::get_except_value_for_status(internal::get_enabled_trap());
+}
+
+// Marks the exceptions in 'excepts' as raised without disturbing the flags
+// that are already set. Always returns zero.
+LIBC_INLINE int set_except(int excepts) {
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  // Merge with the current sticky bits; writing 'status' alone would clear
+  // every flag that is not part of 'excepts'.
+  internal::set_trap_status(internal::get_trap_status() | status);
+  return 0;
+}
+
+// Enables trapping for the exceptions in 'excepts' in addition to the ones
+// that are already enabled. Returns the previously enabled set using the FE_*
+// macros.
+LIBC_INLINE int enable_except(int excepts) {
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  // The previous state is the enabled-trap field of the mode register, not the
+  // sticky flags in the trap status register.
+  uint32_t active = internal::get_enabled_trap();
+  internal::set_enabled_trap(active | status);
+  return internal::get_except_value_for_status(active);
+}
+
+// Disables trapping for the exceptions in 'excepts' and returns the set that
+// was enabled before the call, expressed with the FE_* macros.
+LIBC_INLINE int disable_except(int excepts) {
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  uint32_t invert = ~status & 0x3f;
+  uint32_t active = internal::get_enabled_trap();
+  internal::set_enabled_trap(active & invert);
+  // Convert back to the FE_* encoding instead of leaking the hardware bits.
+  return internal::get_except_value_for_status(active);
+}
+
+// Raises the exceptions in 'excepts' by enabling trapping for them and
+// recording them in the trap status register. Always returns zero.
+LIBC_INLINE int raise_except(int excepts) {
+  // enable_except() converts from the FE_* macros itself, so it must receive
+  // the caller's value rather than the already converted hardware bits.
+  enable_except(excepts);
+  uint32_t status = internal::get_status_value_for_except(excepts);
+  // Keep flags that were raised earlier; they are sticky until cleared.
+  internal::set_trap_status(internal::get_trap_status() | status);
+  return 0;
+}
+
+LIBC_INLINE int get_round() {
+  switch (internal::get_rounding_mode()) {
+  case internal::ROUND_TO_NEAREST:
+    return FE_TONEAREST;
+  case internal::ROUND_UPWARD:
+    return FE_UPWARD;
+  case internal::ROUND_DOWNWARD:
+    return FE_DOWNWARD;
+  case internal::ROUND_TOWARD_ZERO:
+    return FE_TOWARDZERO;
+  }
+  __builtin_unreachable();
+}
+
+LIBC_INLINE int set_round(int rounding_mode) {
+  switch (rounding_mode) {
+  case FE_TONEAREST:
+    internal::set_rounding_mode(internal::ROUND_TO_NEAREST);
+    break;
+  case FE_UPWARD:
+    internal::set_rounding_mode(internal::ROUND_UPWARD);
+    break;
+  case FE_DOWNWARD:
+    internal::set_rounding_mode(internal::ROUND_DOWNWARD);
+    break;
+  case FE_TOWARDZERO:
+    internal::set_rounding_mode(internal::ROUND_TOWARD_ZERO);
+    break;
+  default:
+    return 1;
+  }
+  return 0;
+}
+
+// The fenv_t struct for the AMD GPU is simply a 32-bit integer field of the
+// current state. We combine the four bits for the rounding mode with the six
+// bits for the exception state and the six bits for the enabled exceptions.
+//
+// ┌────────────────────────────┬─────────────────┬─────────────┬─────────────┐
+// │       UNUSED[31:16]        │ ENABLED[15:10]  │ STATUS[9:4] │  ROUND[3:0] │
+// └────────────────────────────┴─────────────────┴─────────────┴─────────────┘
+//
+// The top sixteen bits are currently unused and should be zero.
+LIBC_INLINE int get_env(fenv_t *env) {
+  if (!env)
+    return 1;
+
+  uint32_t rounding = internal::get_rounding_mode();
+  uint32_t status = internal::get_trap_status();
+  uint32_t enabled = internal::get_enabled_trap();
+  env->__fpc = enabled << 10 | status << 4 | rounding;
+  return 0;
+}
+
+LIBC_INLINE int set_env(const fenv_t *env) {
+  if (!env)
+    return 1;
+
+  internal::set_rounding_mode(env->__fpc & 0xf);
+  internal::set_trap_status((env->__fpc >> 4) & 0x3f);
+  internal::set_enabled_trap((env->__fpc >> 10) & 0x3f);
+  return 0;
+}
+
+} // namespace fputil
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_AMDGPU_FENVIMPL_H
diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt
index ba338bb6c73189..fe6a8135008723 100644
--- a/libc/test/src/fenv/CMakeLists.txt
+++ b/libc/test/src/fenv/CMakeLists.txt
@@ -1,9 +1,9 @@
-add_custom_target(libc_fenv_unittests)
+add_custom_target(libc_fenv_tests)
 
-add_libc_unittest(
+add_libc_test(
   rounding_mode_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     rounding_mode_test.cpp
   DEPENDS
@@ -11,10 +11,10 @@ add_libc_unittest(
     libc.src.fenv.fesetround
 )
 
-add_libc_unittest(
+add_libc_test(
   exception_status_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     exception_status_test.cpp
   DEPENDS
@@ -24,10 +24,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   getenv_and_setenv_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     getenv_and_setenv_test.cpp
   DEPENDS
@@ -38,10 +38,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   exception_flags_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     exception_flags_test.cpp
   DEPENDS
@@ -50,10 +50,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feupdateenv_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feupdateenv_test.cpp
   DEPENDS
@@ -62,10 +62,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feclearexcept_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feclearexcept_test.cpp
   DEPENDS
@@ -73,10 +73,10 @@ add_libc_unittest(
     libc.src.__support.FPUtil.fenv_impl
 )
 
-add_libc_unittest(
+add_libc_test(
   feenableexcept_test
   SUITE
-    libc_fenv_unittests
+    libc_fenv_tests
   SRCS
     feenableexcept_test.cpp
   DEPENDS
@@ -96,7 +96,7 @@ if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows")
     enabled_exceptions_test
     UNIT_TEST_ONLY
     SUITE
-      libc_fenv_unittests
+      libc_fenv_tests
     SRCS
       enabled_exceptions_test.cpp
     DEPENDS
@@ -113,7 +113,7 @@ if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows")
     feholdexcept_test
     UNIT_TEST_ONLY
     SUITE
-      libc_fenv_unittests
+      libc_fenv_tests
     SRCS
       feholdexcept_test.cpp
     DEPENDS

lntue · 2024-03-01T03:00:34Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// This will read the size number of bits starting at the offset bit from the
+// corresponding hardware register ID.
+constexpr uint16_t get_register(uint8_t id, uint8_t offset, uint8_t size) {
+  return static_cast<uint16_t>(size << 11 | offset << 6 | id);


I think you should cast before shifting:

return static_cast<uint16_t>(size) << 11 | static_cast<uint16_t>(offset) << 6 | static_cast<uint16_t>(id);

I think that's just too noisy. The function is constexpr because the argument needs to be a compile time constant. I could change the arguments to uint32_t if you'd prefer.

b/c technically uint8_t(x) << 11 will always give you 0, right?

lntue · 2024-03-01T03:06:36Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+//
+// This will read the size number of bits starting at the offset bit from the
+// corresponding hardware register ID.
+constexpr uint16_t get_register(uint8_t id, uint8_t offset, uint8_t size) {


This function probably needs to be annotated with LIBC_INLINE.

lntue

LGTM on the libc side. Let's wait for some opinions from GPU side.

lntue · 2024-03-01T03:11:01Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// This will read the size number of bits starting at the offset bit from the
+// corresponding hardware register ID.
+constexpr uint16_t get_register(uint8_t id, uint8_t offset, uint8_t size) {
+  return static_cast<uint16_t>(size << 11 | offset << 6 | id);


b/c technically uint8_t(x) << 11 will always give you 0, right?

lntue · 2024-03-01T03:12:36Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+
+// The six bits used to encode the standard floating point exceptions in the
+// trap status register.
+enum ExceptionFlags : uint32_t {


Do you want to add a link to the documentation of the encodings in the comments?

b/c technically uint8_t(x) << 11 will always give you 0, right?

I think integer promotion makes it take the int32_t argument from the shift.

Do you want to add a link to the documentation of the encodings in the comments?

I can, if needed, but I pretty much copied everything relevant here so I figured it was redundant.

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

arsenm

Many of these are reinventing generic intrinsics (https://llvm.org/docs/LangRef.html#floating-point-environment-manipulation-intrinsics). We implement some of these. The rounding modes also have extended values for the mixed f32/f64 cases

arsenm · 2024-03-01T07:18:02Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// Access the four bits in the mode register's ROUND[3:0] field. The hardware
+// supports setting the f64/f16 and f32 precision rounding modes separately but
+// we will assume that these always match.


The generic intrinsic implementation handles the mismatched values

arsenm · 2024-03-01T07:18:30Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// NOTE: On architectures before GFX12 the DX10_CLAMP bit suppresses all
+//       floating point exceptions. In order to get them to be presented we
+//       need to manually set it off.
+[[gnu::noinline]] LIBC_INLINE void set_clamp_low() {


"low" should be "off"?

arsenm · 2024-03-01T07:18:39Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+  __builtin_amdgcn_s_setreg(HW_REG_MODE_CLAMP, 0);
+}
+
+[[gnu::noinline]] LIBC_INLINE void set_clamp_high() {


"high" should be "on"?

jhuber6 · 2024-03-01T14:44:20Z

Many of these are reinventing generic intrinsics (https://llvm.org/docs/LangRef.html#floating-point-environment-manipulation-intrinsics). We implement some of these. The rounding modes also have extended values for the mixed f32/f64 cases

Okay, so I suppose the plan of attack here is to finish implementing these as it seems we only have setround. Then we can update this patch and make it use those intrinsics. However, I do not see any intrinsics for stuff like enabling / disabling things. Likely we will want to write the intrinsic to get the fpenv and then treat it as an implementation defined bit so doing something like writing the exceptions becomes __builtin_getenv() into __builtin_setenv() after modifying the bits.

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

libc/test/src/fenv/rounding_mode_test.cpp

lntue · 2024-05-03T20:55:56Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+#error "Invalid include"
+#endif
+
+#include <fenv.h>


Can you use "hdr/fenv_macros.h" and "hdr/types/fenv_t.h" instead?

lntue · 2024-05-03T22:10:12Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+#endif
+
+#include "hdr/types/fenv_t.h"
+


Also #include "hdr/fenv_macros.h" for other FE_* macro definitions.

github-actions · 2024-05-04T13:50:12Z

✅ With the latest revision this PR passed the C/C++ code formatter.

github-actions · 2024-05-04T13:50:12Z

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:

git-clang-format --diff 70b79a9ccd03f93fc4c8464a91b6bef3aab322d3 fdfddf90f07488fb9f00e64c6e40160261c65de6 -- libc/src/__support/FPUtil/amdgpu/FEnvImpl.h libc/include/llvm-libc-macros/math-macros.h libc/include/llvm-libc-types/fenv_t.h libc/src/__support/FPUtil/FEnvImpl.h libc/test/src/fenv/rounding_mode_test.cpp

View the diff from clang-format here.

diff --git a/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
index 77f7793fbe..9b64131046 100644
--- a/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h
@@ -133,9 +133,7 @@ LIBC_INLINE void set_rounding_mode(uint32_t mode) {
 // Get the hardware rounding mode using the llvm.get.rounding intrinsic
 // function.
 // FIXME: This requires `noinline` to flush the hardware register in time.
-LIBC_INLINE uint32_t get_rounding_mode() {
-  return __builtin_flt_rounds();
-}
+LIBC_INLINE uint32_t get_rounding_mode() { return __builtin_flt_rounds(); }
 
 } // namespace internal

arsenm · 2024-05-07T10:25:23Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// This converts between the exported C standard values and the hardware values.
+LIBC_INLINE uint32_t get_status_value_for_except(uint32_t excepts) {
+  return (excepts & FE_INVALID ? EXCP_INVALID_F : 0) |
+#ifdef __FE_DENORM


I think we should just assume this is defined

/home/jhuber/Documents/llvm/llvm-project/libc/src/__support/FPUtil/amdgpu/FEnvImpl.h:108:21: error: use of undeclared identifier '__FE_DENORM'
  108 |          (excepts & __FE_DENORM ? EXCP_DENORMAL_F : 0) |
      |                     ^

Seems it's not, I'll probably just remove it then.

well it should be? Where do these come from? We definitely have the Denorm exceptions

X86 defines the flags itself, so not sure why it's using the macro:

struct ExceptionFlags {
  static constexpr uint16_t INVALID_F = 0x1;
  // Some libcs define __FE_DENORM corresponding to the denormal input
  // exception and include it in FE_ALL_EXCEPTS. We define and use it to
  // support compiling against headers provided by such libcs.
  static constexpr uint16_t DENORMAL_F = 0x2;
  static constexpr uint16_t DIV_BY_ZERO_F = 0x4;
  static constexpr uint16_t OVERFLOW_F = 0x8;
  static constexpr uint16_t UNDERFLOW_F = 0x10;
  static constexpr uint16_t INEXACT_F = 0x20;
};

It's to map between the C standard values and the ones the hardware uses right? So there should be some code that converts fegetexcept(__FE_DENORM) to the specific hardware value.

The "C standard values" are implementation defined. We can just define them to the same thing

My guess is that the original implementation thought it would be easier to make it fixed in the header than to have target specific values, but we could probably go that direction. It would be a larger rewrite though so it might be out of scope here.

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

Summary: This patch implements a basic floating point environment on the AMDGPU. Users should be able to check rounding modes or certain floating point exceptions using the standard functions. This patch implements the basic set, but only exposes the `fegetround` and `fesetround` utilities. This is because getting the exceptions to work is difficult due to the behavior with the DX10_CLAMP bit that is always set. It is worth noting that this utility is not strictly standards conformant because we can only control this behavior on individual warps. Whether or not we can say it's truly implemented then is an exercise to the reader.

arsenm · 2024-05-07T18:14:12Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// Set the hardware rounding mode using the llvm.set.rounding intrinsic
+// function.
+LIBC_INLINE void set_rounding_mode(uint32_t mode) {
+  __builtin_set_flt_rounds(mode);
+}
+
+// Get the hardware rounding mode using the llvm.get.rounding intrinsic
+// function.
+LIBC_INLINE uint32_t get_rounding_mode() { return __builtin_flt_rounds(); }


I would assume this would be defined in the generic implementation

arsenm · 2024-05-07T18:14:48Z

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h

+// Set the rounding mode for the environment. If the provided mode is above the
+// expected range we assume it is an extended value to control f32 / f64
+// separately.
+LIBC_INLINE int set_round(int rounding_mode) {


What's the point of this having a return value ?

jhuber6 · 2024-05-07T18:27:56Z

@lntue This seems to make the nearbyint test begin failing, most likely because set_round is no-longer a no-op. Any clue why that would be? I'm surprised it passed before if rounding was required.

arsenm · 2024-05-07T19:06:19Z

@lntue This seems to make the nearbyint test begin failing, most likely because set_round is no-longer a no-op. Any clue why that would be? I'm surprised it passed before if rounding was required.

Where's the test and failing in what way? We only have minimal strictfp support and nearbyint isn't implemented

jhuber6 · 2024-05-07T19:10:38Z

@lntue This seems to make the nearbyint test begin failing, most likely because set_round is no-longer a no-op. Any clue why that would be? I'm surprised it passed before if rounding was required.

Where's the test and failing in what way? We only have minimal strictfp support and nearbyint isn't implemented

https://github.com/llvm/llvm-project/blob/main/libc/test/src/math/smoke/NearbyIntTest.h is the test. The implementation is just a built-in. I can try to narrow it down later.

jhuber6 · 2024-05-07T19:19:13Z

> ninja -C runtimes/runtimes-amdgcn-amd-amdhsa-bins libc.test.src.math.smoke.nearbyint_test.__hermetic__
ninja: Entering directory `runtimes/runtimes-amdgcn-amd-amdhsa-bins'
[24/24] Running hermetic test libc.test.src.math.smoke.nearbyint_test.__hermetic__
FAILED: libc/test/src/math/smoke/CMakeFiles/libc.test.src.math.smoke.nearbyint_test.__hermetic__ /home/jhuber/Documents/llvm/llvm-project/build/runtimes/runtimes-amdgcn-amd-amdhsa-bins/libc/test/src/math/smoke/CMakeFiles/libc.test.src.math.smoke.nearbyint_test.__hermetic__ 
cd /home/jhuber/Documents/llvm/llvm-project/build/runtimes/runtimes-amdgcn-amd-amdhsa-bins/libc/test/src/math/smoke && /home/jhuber/Documents/llvm/llvm-project/build/bin/amdhsa-loader /home/jhuber/Documents/llvm/llvm-project/build/runtimes/runtimes-amdgcn-amd-amdhsa-bins/libc/test/src/math/smoke/libc.test.src.math.smoke.nearbyint_test.__hermetic__.__build__
[ RUN      ] LlvmLibcNearbyIntTest.TestNaN
[       OK ] LlvmLibcNearbyIntTest.TestNaN (took 4 us)
[ RUN      ] LlvmLibcNearbyIntTest.TestInfinities
[       OK ] LlvmLibcNearbyIntTest.TestInfinities (took 6 us)
[ RUN      ] LlvmLibcNearbyIntTest.TestZeroes
[       OK ] LlvmLibcNearbyIntTest.TestZeroes (took 6 us)
[ RUN      ] LlvmLibcNearbyIntTest.TestIntegers
[       OK ] LlvmLibcNearbyIntTest.TestIntegers (took 26 us)
[ RUN      ] LlvmLibcNearbyIntTest.TestSubnormalToNearest
[       OK ] LlvmLibcNearbyIntTest.TestSubnormalToNearest (took 2 us)
[ RUN      ] LlvmLibcNearbyIntTest.TestSubnormalTowardZero
[       OK ] LlvmLibcNearbyIntTest.TestSubnormalTowardZero (took 2 us)
[ RUN      ] LlvmLibcNearbyIntTest.TestSubnormalToPosInf
/home/jhuber/Documents/llvm/llvm-project/libc/test/src/math/smoke/NearbyIntTest.h:70: FAILURE
Failed to match ((FPBits::one().get_val())) against LIBC_NAMESPACE::testing::getMatcher< LIBC_NAMESPACE::testing::TestCond::EQ>(((func(min_denormal)))).
Expected floating point value: 0x0000000000000000 = (S: 0, E: 0x0000, M: 0x0000000000000000)
Actual floating point value: 0x3FF0000000000000 = (S: 0, E: 0x03FF, M: 0x0000000000000000)
[  FAILED  ] LlvmLibcNearbyIntTest.TestSubnormalToPosInf
[ RUN      ] LlvmLibcNearbyIntTest.TestSubnormalToNegInf
/home/jhuber/Documents/llvm/llvm-project/libc/test/src/math/smoke/NearbyIntTest.h:77: FAILURE
Failed to match ((negative_one)) against LIBC_NAMESPACE::testing::getMatcher< LIBC_NAMESPACE::testing::TestCond::EQ>(((func(-min_denormal)))).
Expected floating point value: 0x8000000000000000 = (S: 1, E: 0x0000, M: 0x0000000000000000)
Actual floating point value: 0xBFF0000000000000 = (S: 1, E: 0x03FF, M: 0x0000000000000000)
[  FAILED  ] LlvmLibcNearbyIntTest.TestSubnormalToNegInf
Ran 8 tests.  PASS: 6  FAIL: 2
ninja: build stopped: subcommand failed.

This is the current output. The nearby int tests are the only ones that fail so I might just disable them for now.

arsenm · 2024-05-07T20:44:18Z

https://github.com/llvm/llvm-project/blob/main/libc/test/src/math/smoke/NearbyIntTest.h

This test is invalid, I don't see it enabling fenv access. If it were, it would print the warning about not being handled and then also not work. If clang actually emitted the strictfp code, it would fail to codegen

jhuber6 requested review from arsenm, jayfoad, JonChesterfield, lntue and michaelrj-google February 29, 2024 23:06

llvmbot added backend:AMDGPU libc labels Feb 29, 2024

lntue reviewed Mar 1, 2024

View reviewed changes

jhuber6 force-pushed the FPEnv branch from 8006a6a to 3e51451 Compare March 1, 2024 03:07

lntue approved these changes Mar 1, 2024

View reviewed changes

arsenm reviewed Mar 1, 2024

View reviewed changes

jhuber6 force-pushed the FPEnv branch from 3e51451 to de4dc4a Compare April 2, 2024 16:57

lntue approved these changes Apr 5, 2024

View reviewed changes

arsenm reviewed Apr 6, 2024

View reviewed changes

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h Outdated Show resolved Hide resolved

jhuber6 force-pushed the FPEnv branch 2 times, most recently from ca43f49 to d3c5dc0 Compare May 3, 2024 19:56

arsenm reviewed May 3, 2024

View reviewed changes

libc/test/src/fenv/rounding_mode_test.cpp Outdated Show resolved Hide resolved

arsenm reviewed May 3, 2024

View reviewed changes

libc/test/src/fenv/rounding_mode_test.cpp Outdated Show resolved Hide resolved

jhuber6 force-pushed the FPEnv branch 2 times, most recently from 38484c3 to fdfddf9 Compare May 3, 2024 20:45

lntue reviewed May 3, 2024

View reviewed changes

jhuber6 force-pushed the FPEnv branch 2 times, most recently from 7585511 to 05992ce Compare May 3, 2024 21:09

lntue reviewed May 3, 2024

View reviewed changes

jhuber6 force-pushed the FPEnv branch from 05992ce to ce578b8 Compare May 3, 2024 22:21

arsenm reviewed May 7, 2024

View reviewed changes

jhuber6 force-pushed the FPEnv branch 2 times, most recently from 76bbd2d to aecc758 Compare May 7, 2024 12:41

arsenm reviewed May 7, 2024

View reviewed changes

libc/src/__support/FPUtil/amdgpu/FEnvImpl.h Outdated Show resolved Hide resolved

jhuber6 force-pushed the FPEnv branch from aecc758 to 9bf56d3 Compare May 7, 2024 14:28

jhuber6 force-pushed the FPEnv branch from 9bf56d3 to 4edcc7c Compare May 7, 2024 17:59

arsenm reviewed May 7, 2024

View reviewed changes

[libc] Implement basic 'fenv.h' utilities on the AMD GPU #83500

Are you sure you want to change the base?

[libc] Implement basic 'fenv.h' utilities on the AMD GPU #83500

Uh oh!

Conversation

jhuber6 commented Feb 29, 2024

Uh oh!

llvmbot commented Feb 29, 2024

Uh oh!

llvmbot commented Feb 29, 2024

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

lntue left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jhuber6 commented Mar 1, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented May 4, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented May 4, 2024

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jhuber6 commented May 7, 2024

Uh oh!

arsenm commented May 7, 2024

Uh oh!

jhuber6 commented May 7, 2024

Uh oh!

jhuber6 commented May 7, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

github-actions bot commented May 4, 2024 •

edited

Loading

jhuber6 commented May 7, 2024 •

edited

Loading