[libc][math][c23] Enable C23 _Float16 math functions on GPUs (#99248)

overmighty · web-flow · commit 81ce796095c8 · 2024-07-25T21:09:49.000+02:00
diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -17,6 +17,12 @@ set(
 # Making sure ALL_COMPILER_FEATURES is sorted.
 list(SORT ALL_COMPILER_FEATURES)
 
+# Compiler features that are unavailable on GPU targets with the in-tree Clang.
+set(
+  CPU_ONLY_COMPILER_FEATURES
+    "float128"
+)
+
 # Function to check whether the compiler supports the provided set of features.
 # Usage:
 # compiler_supports(
@@ -67,13 +73,26 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
     set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE)
   endif()
 
-  try_compile(
-    has_feature
-    ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
-    SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
-    COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
-    LINK_OPTIONS ${link_options}
-  )
+  if(LIBC_TARGET_OS_IS_GPU)
+    # CUDA shouldn't be required to build the libc, only to test it, so we can't
+    # try to build CUDA binaries here. Since GPU builds are always compiled with
+    # the in-tree Clang, we just hardcode which compiler features are available
+    # when targeting GPUs.
+    if(feature IN_LIST CPU_ONLY_COMPILER_FEATURES)
+      set(has_feature FALSE)
+    else()
+      set(has_feature TRUE)
+    endif()
+  else()
+    try_compile(
+      has_feature
+      ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
+      SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
+      COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
+      LINK_OPTIONS ${link_options}
+    )
+  endif()
+
   if(has_feature)
     list(APPEND AVAILABLE_COMPILER_FEATURES ${feature})
     if(${feature} STREQUAL "float16")
diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -279,8 +279,10 @@ if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE2")))
   set(SKIP_FLAG_EXPANSION_EXPLICIT_SIMD_OPT TRUE)
 endif()
 
-# Skip ROUND_OPT flag for targets that don't support SSE 4.2.
+# Skip ROUND_OPT flag for targets that don't support rounding instructions. On
+# x86, these are SSE4.1 instructions, but we already had code to check for
+# SSE4.2 support.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")) OR
-       LIBC_TARGET_ARCHITECTURE_IS_AARCH64))
+       LIBC_TARGET_ARCHITECTURE_IS_AARCH64 OR LIBC_TARGET_OS_IS_GPU))
   set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
 endif()
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
@@ -348,6 +348,74 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.truncf
 )
 
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C23 _Float16 entrypoints
+    libc.src.math.canonicalizef16
+    libc.src.math.ceilf16
+    libc.src.math.copysignf16
+    libc.src.math.f16add
+    libc.src.math.f16addf
+    libc.src.math.f16div
+    libc.src.math.f16divf
+    libc.src.math.f16fma
+    libc.src.math.f16fmaf
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
+    libc.src.math.f16sqrt
+    libc.src.math.f16sqrtf
+    libc.src.math.f16sub
+    libc.src.math.f16subf
+    libc.src.math.fabsf16
+    libc.src.math.fdimf16
+    libc.src.math.floorf16
+    libc.src.math.fmaxf16
+    libc.src.math.fmaximum_mag_numf16
+    libc.src.math.fmaximum_magf16
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fmaximumf16
+    libc.src.math.fminf16
+    libc.src.math.fminimum_mag_numf16
+    libc.src.math.fminimum_magf16
+    libc.src.math.fminimum_numf16
+    libc.src.math.fminimumf16
+    libc.src.math.fmodf16
+    libc.src.math.frexpf16
+    libc.src.math.fromfpf16
+    libc.src.math.fromfpxf16
+    libc.src.math.getpayloadf16
+    libc.src.math.ilogbf16
+    libc.src.math.ldexpf16
+    libc.src.math.llogbf16
+    libc.src.math.llrintf16
+    libc.src.math.llroundf16
+    libc.src.math.logbf16
+    libc.src.math.lrintf16
+    libc.src.math.lroundf16
+    libc.src.math.modff16
+    libc.src.math.nanf16
+    libc.src.math.nearbyintf16
+    libc.src.math.nextafterf16
+    libc.src.math.nextdownf16
+    libc.src.math.nexttowardf16
+    libc.src.math.nextupf16
+    libc.src.math.remainderf16
+    libc.src.math.remquof16
+    libc.src.math.rintf16
+    libc.src.math.roundevenf16
+    libc.src.math.roundf16
+    libc.src.math.scalblnf16
+    libc.src.math.scalbnf16
+    libc.src.math.setpayloadf16
+    libc.src.math.setpayloadsigf16
+    libc.src.math.totalorderf16
+    libc.src.math.totalordermagf16
+    libc.src.math.truncf16
+    libc.src.math.ufromfpf16
+    libc.src.math.ufromfpxf16
+  )
+endif()
+
 set(TARGET_LLVMLIBC_ENTRYPOINTS
   ${TARGET_LIBC_ENTRYPOINTS}
   ${TARGET_LIBM_ENTRYPOINTS}
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
@@ -53,4 +53,8 @@
 #define LIBC_TARGET_CPU_HAS_NEAREST_INT
 #endif
 
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#define LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
@@ -111,7 +111,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -548,7 +548,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -617,7 +617,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -686,7 +686,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -755,7 +755,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -948,7 +948,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_ceilf(x));
 #else
   return fputil::ceil(x);
diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_floorf(x));
 #else
   return fputil::floor(x);
diff --git a/libc/src/math/generic/rintf16.cpp b/libc/src/math/generic/rintf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, rintf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_rintf(x));
 #else
   return fputil::round_using_current_rounding_mode(x);
diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) &&                                   \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundevenf(x));
 #else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp
@@ -10,12 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) {
-#if defined(__LIBC_USE_BUILTIN_ROUND) && defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if defined(__LIBC_USE_BUILTIN_ROUND) &&                                       \
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundf(x));
 #else
   return fputil::round(x);
diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_truncf(x));
 #else
   return fputil::trunc(x);