[libc][math] Optimize maximum and minimum functions using builtins when available (#100002)

overmighty · web-flow · commit e7f8d4be5a5d · 2024-07-23T23:59:55.000+02:00
diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -5,6 +5,8 @@
 set(
   ALL_COMPILER_FEATURES
     "builtin_ceil_floor_rint_trunc"
+    "builtin_fmax_fmin"
+    "builtin_fmaxf16_fminf16"
     "builtin_round"
     "builtin_roundeven"
     "float16"
@@ -82,6 +84,10 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
       set(LIBC_COMPILER_HAS_FIXED_POINT TRUE)
     elseif(${feature} STREQUAL "builtin_ceil_floor_rint_trunc")
       set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_RINT_TRUNC TRUE)
+    elseif(${feature} STREQUAL "builtin_fmax_fmin")
+      set(LIBC_COMPILER_HAS_BUILTIN_FMAX_FMIN TRUE)
+    elseif(${feature} STREQUAL "builtin_fmaxf16_fminf16")
+      set(LIBC_COMPILER_HAS_BUILTIN_FMAXF16_FMINF16 TRUE)
     elseif(${feature} STREQUAL "builtin_round")
       set(LIBC_COMPILER_HAS_BUILTIN_ROUND TRUE)
     elseif(${feature} STREQUAL "builtin_roundeven")
diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -9,6 +9,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX AVX2 AVX512F AVX512BW FMA)
   set(LIBC_COMPILE_OPTIONS_NATIVE -march=native)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
+  set(ALL_CPU_FEATURES "FullFP16")
   set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native)
 endif()
 
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -40,6 +40,17 @@ function(_get_compile_options_from_flags output_var)
     endif()
     if(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG)
       list(APPEND compile_options "-D__LIBC_MISC_MATH_BASIC_OPS_OPT")
+      if(LIBC_COMPILER_HAS_BUILTIN_FMAX_FMIN)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_FMAX_FMIN")
+      endif()
+      if(LIBC_COMPILER_HAS_BUILTIN_FMAXF16_FMINF16)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_FMAXF16_FMINF16")
+      endif()
+      if("FullFP16" IN_LIST LIBC_CPU_FEATURES AND
+         CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        list(APPEND compile_options
+             "SHELL:-Xclang -target-feature -Xclang +fullfp16")
+      endif()
     endif()
   elseif(MSVC)
     if(ADD_FMA_FLAG)
diff --git a/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp b/libc/cmake/modules/compiler_features/check_builtin_fmax_fmin.cpp
@@ -0,0 +1,7 @@
+// Probe: compiles only if the float/double fmax/fmin builtins are available.
+float try_builtin_fmaxf(float x, float y) { return __builtin_fmaxf(x, y); }
+float try_builtin_fminf(float x, float y) { return __builtin_fminf(x, y); }
+double try_builtin_fmax(double x, double y) { return __builtin_fmax(x, y); }
+double try_builtin_fmin(double x, double y) { return __builtin_fmin(x, y); }
+
+extern "C" void _start() {}
diff --git a/libc/cmake/modules/compiler_features/check_builtin_fmaxf16_fminf16.cpp b/libc/cmake/modules/compiler_features/check_builtin_fmaxf16_fminf16.cpp
@@ -0,0 +1,9 @@
+// Probe: compiles only if the _Float16 fmax/fmin builtins are available.
+_Float16 try_builtin_fmaxf16(_Float16 lhs, _Float16 rhs) {
+  return __builtin_fmaxf16(lhs, rhs);
+}
+_Float16 try_builtin_fminf16(_Float16 lhs, _Float16 rhs) {
+  return __builtin_fminf16(lhs, rhs);
+}
+
+extern "C" void _start() {}
diff --git a/libc/cmake/modules/cpu_features/check_FullFP16.cpp b/libc/cmake/modules/cpu_features/check_FullFP16.cpp
@@ -0,0 +1,5 @@
+#include "src/__support/macros/properties/cpu_features.h"
+// Probe: fails to compile unless cpu_features.h reports FullFP16 support.
+#if !defined(LIBC_TARGET_CPU_HAS_FULLFP16)
+#error unsupported
+#endif // !defined(LIBC_TARGET_CPU_HAS_FULLFP16)
diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h
@@ -17,6 +17,8 @@
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/types.h"
 #include "src/__support/uint128.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -27,6 +29,86 @@ LIBC_INLINE T abs(T x) {
   return FPBits<T>(x).abs().get_val();
 }
 
+namespace internal {
+
+// Returns the larger of x and y; neither operand is expected to be NaN.
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> max(T x, T y) {
+  FPBits<T> lhs(x);
+  FPBits<T> rhs(y);
+  // Same sign: an ordinary comparison picks the larger operand.
+  if (lhs.sign() == rhs.sign())
+    return x > y ? x : y;
+  // Opposite signs: return the positive operand, which also guarantees
+  // fmax(+0, -0) == +0 == fmax(-0, +0).
+  return lhs.is_pos() ? x : y;
+}
+
+#if defined(LIBC_TYPES_HAS_FLOAT16) // only then is float16 declared
+#if defined(__LIBC_USE_BUILTIN_FMAXF16_FMINF16)
+template <> LIBC_INLINE float16 max(float16 x, float16 y) {
+  return __builtin_fmaxf16(x, y);
+}
+#elif !defined(LIBC_TARGET_ARCH_IS_AARCH64)
+template <> LIBC_INLINE float16 max(float16 x, float16 y) {
+  // Compare raw bits as signed ints; the order flips when both are negative.
+  int16_t xi = static_cast<int16_t>(FPBits<float16>(x).uintval());
+  int16_t yi = static_cast<int16_t>(FPBits<float16>(y).uintval());
+  return ((xi > yi) != (xi < 0 && yi < 0)) ? x : y;
+}
+#endif
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#if !defined(LIBC_TARGET_ARCH_IS_X86) && defined(__LIBC_USE_BUILTIN_FMAX_FMIN)
+// Off x86, float and double defer to the compiler's fmax builtins.
+template <> LIBC_INLINE double max(double x, double y) {
+  return __builtin_fmax(x, y);
+}
+template <> LIBC_INLINE float max(float x, float y) {
+  return __builtin_fmaxf(x, y);
+}
+#endif // !LIBC_TARGET_ARCH_IS_X86 && __LIBC_USE_BUILTIN_FMAX_FMIN
+
+// Returns the smaller of x and y; neither operand is expected to be NaN.
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> min(T x, T y) {
+  FPBits<T> lhs(x);
+  FPBits<T> rhs(y);
+  // Same sign: an ordinary comparison picks the smaller operand.
+  if (lhs.sign() == rhs.sign())
+    return x < y ? x : y;
+  // Opposite signs: return the negative operand, which also guarantees
+  // fmin(+0, -0) == -0 == fmin(-0, +0).
+  return lhs.is_neg() ? x : y;
+}
+
+#if defined(LIBC_TYPES_HAS_FLOAT16) // only then is float16 declared
+#if defined(__LIBC_USE_BUILTIN_FMAXF16_FMINF16)
+template <> LIBC_INLINE float16 min(float16 x, float16 y) {
+  return __builtin_fminf16(x, y);
+}
+#elif !defined(LIBC_TARGET_ARCH_IS_AARCH64)
+template <> LIBC_INLINE float16 min(float16 x, float16 y) {
+  // Compare raw bits as signed ints; the order flips when both are negative.
+  int16_t xi = static_cast<int16_t>(FPBits<float16>(x).uintval());
+  int16_t yi = static_cast<int16_t>(FPBits<float16>(y).uintval());
+  return ((xi < yi) != (xi < 0 && yi < 0)) ? x : y;
+}
+#endif
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#if !defined(LIBC_TARGET_ARCH_IS_X86) && defined(__LIBC_USE_BUILTIN_FMAX_FMIN)
+// Off x86, float and double defer to the compiler's fmin builtins.
+template <> LIBC_INLINE double min(double x, double y) {
+  return __builtin_fmin(x, y);
+}
+template <> LIBC_INLINE float min(float x, float y) {
+  return __builtin_fminf(x, y);
+}
+#endif // !LIBC_TARGET_ARCH_IS_X86 && __LIBC_USE_BUILTIN_FMAX_FMIN
+
+} // namespace internal
+
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
 LIBC_INLINE T fmin(T x, T y) {
   const FPBits<T> bitx(x), bity(y);
@@ -35,12 +117,7 @@ LIBC_INLINE T fmin(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    // To make sure that fmin(+0, -0) == -0 == fmin(-0, +0), whenever x and
-    // y has different signs and both are not NaNs, we return the number
-    // with negative sign.
-    return bitx.is_neg() ? x : y;
-  return x < y ? x : y;
+  return internal::min(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -51,12 +128,7 @@ LIBC_INLINE T fmax(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    // To make sure that fmax(+0, -0) == +0 == fmax(-0, +0), whenever x and
-    // y has different signs and both are not NaNs, we return the number
-    // with positive sign.
-    return bitx.is_neg() ? y : x;
-  return x > y ? x : y;
+  return internal::max(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -67,9 +139,7 @@ LIBC_INLINE T fmaximum(T x, T y) {
     return x;
   if (bity.is_nan())
     return y;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg() ? y : x);
-  return x > y ? x : y;
+  return internal::max(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -80,9 +150,7 @@ LIBC_INLINE T fminimum(T x, T y) {
     return x;
   if (bity.is_nan())
     return y;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg()) ? x : y;
-  return x < y ? x : y;
+  return internal::min(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -97,9 +165,7 @@ LIBC_INLINE T fmaximum_num(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg() ? y : x);
-  return x > y ? x : y;
+  return internal::max(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
@@ -114,9 +180,7 @@ LIBC_INLINE T fminimum_num(T x, T y) {
     return y;
   if (bity.is_nan())
     return x;
-  if (bitx.sign() != bity.sign())
-    return (bitx.is_neg() ? x : y);
-  return x < y ? x : y;
+  return internal::min(x, y);
 }
 
 template <typename T, cpp::enable_if_t<cpp::is_floating_point_v<T>, int> = 0>
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -192,6 +192,8 @@ add_header_library(
     libc.src.__support.uint128
     libc.src.__support.common
     libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.types
 )
 
 add_header_library(
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
@@ -14,6 +14,10 @@
 
 #include "architectures.h"
 
+#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+#define LIBC_TARGET_CPU_HAS_FULLFP16
+#endif
+
 #if defined(__SSE2__)
 #define LIBC_TARGET_CPU_HAS_SSE2
 #endif
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
diff --git a/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp b/libc/test/src/math/performance_testing/max_min_funcs_perf.cpp

Original file line number	Diff line number	Diff line change
`@@ -192,6 +192,8 @@ add_header_library(`
`192`	`192`	`libc.src.__support.uint128`
`193`	`193`	`libc.src.__support.common`
`194`	`194`	`libc.src.__support.macros.optimization`
	`195`	`+ libc.src.__support.macros.properties.architectures`
	`196`	`+ libc.src.__support.macros.properties.types`
`195`	`197`	`)`
`196`	`198`
`197`	`199`	`add_header_library(`