Skip to content

[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available #99037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ function(_get_compile_options_from_flags output_var)
endif()
check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN})
check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN})
check_flag(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG ${MISC_MATH_BASIC_OPS_OPT_FLAG} ${ARGN})

if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
if(ADD_FMA_FLAG)
Expand Down Expand Up @@ -37,6 +38,9 @@ function(_get_compile_options_from_flags output_var)
if(ADD_EXPLICIT_SIMD_OPT_FLAG)
list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT")
endif()
if(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG)
list(APPEND compile_options "-D__LIBC_MISC_MATH_BASIC_OPS_OPT")
endif()
elseif(MSVC)
if(ADD_FMA_FLAG)
list(APPEND compile_options "/arch:AVX2")
Expand Down
3 changes: 3 additions & 0 deletions libc/cmake/modules/LLVMLibCFlagRules.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ set(FMA_OPT_FLAG "FMA_OPT")
set(ROUND_OPT_FLAG "ROUND_OPT")
# This flag controls whether we use explicit SIMD instructions or not.
set(EXPLICIT_SIMD_OPT_FLAG "EXPLICIT_SIMD_OPT")
# This flag controls whether we use compiler builtin functions to implement
# various basic math operations or not.
set(MISC_MATH_BASIC_OPS_OPT_FLAG "MISC_MATH_BASIC_OPS_OPT")

# Skip FMA_OPT flag for targets that don't support fma.
if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")) OR
Expand Down
20 changes: 17 additions & 3 deletions libc/src/math/generic/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,9 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.basic_operations
COMPILE_OPTIONS
-O2
-O3
FLAGS
MISC_MATH_BASIC_OPS_OPT
)

add_entrypoint_object(
Expand All @@ -411,7 +413,9 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.basic_operations
COMPILE_OPTIONS
-O2
-O3
FLAGS
MISC_MATH_BASIC_OPS_OPT
)

add_entrypoint_object(
Expand All @@ -423,7 +427,7 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.FPUtil.basic_operations
COMPILE_OPTIONS
-O2
-O3
)

add_entrypoint_object(
Expand All @@ -435,8 +439,12 @@ add_entrypoint_object(
DEPENDS
libc.src.__support.macros.properties.types
libc.src.__support.FPUtil.basic_operations
libc.src.__support.macros.properties.architectures
libc.src.__support.macros.properties.compiler
COMPILE_OPTIONS
-O3
FLAGS
MISC_MATH_BASIC_OPS_OPT
)

add_entrypoint_object(
Expand Down Expand Up @@ -1406,6 +1414,8 @@ add_entrypoint_object(
libc.src.__support.FPUtil.manipulation_functions
COMPILE_OPTIONS
-O3
FLAGS
MISC_MATH_BASIC_OPS_OPT
)

add_entrypoint_object(
Expand All @@ -1418,6 +1428,8 @@ add_entrypoint_object(
libc.src.__support.FPUtil.manipulation_functions
COMPILE_OPTIONS
-O3
FLAGS
MISC_MATH_BASIC_OPS_OPT
)

add_entrypoint_object(
Expand All @@ -1443,6 +1455,8 @@ add_entrypoint_object(
libc.src.__support.FPUtil.manipulation_functions
COMPILE_OPTIONS
-O3
FLAGS
MISC_MATH_BASIC_OPS_OPT
)

add_entrypoint_object(
Expand Down
4 changes: 4 additions & 0 deletions libc/src/math/generic/copysign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
namespace LIBC_NAMESPACE_DECL {

// Public entry point for copysign on doubles. The implementation is selected
// at compile time by the __LIBC_MISC_MATH_BASIC_OPS_OPT macro.
LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) {
#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
  // Opted in: let the compiler expand its own builtin.
  return __builtin_copysign(x, y);
#else
  // Default: generic implementation from fputil.
  return fputil::copysign(x, y);
#endif // defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
}

} // namespace LIBC_NAMESPACE_DECL
4 changes: 4 additions & 0 deletions libc/src/math/generic/copysignf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
namespace LIBC_NAMESPACE_DECL {

// Public entry point for copysign on floats. The implementation is selected
// at compile time by the __LIBC_MISC_MATH_BASIC_OPS_OPT macro.
LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) {
#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
  // Opted in: let the compiler expand its own builtin.
  return __builtin_copysignf(x, y);
#else
  // Default: generic implementation from fputil.
  return fputil::copysign(x, y);
#endif // defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
}

} // namespace LIBC_NAMESPACE_DECL
4 changes: 4 additions & 0 deletions libc/src/math/generic/copysignf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
namespace LIBC_NAMESPACE_DECL {

// Public entry point for copysign on float16 values. The implementation is
// selected at compile time by the __LIBC_MISC_MATH_BASIC_OPS_OPT macro.
LLVM_LIBC_FUNCTION(float16, copysignf16, (float16 x, float16 y)) {
#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
  // Opted in: let the compiler expand its own builtin.
  return __builtin_copysignf16(x, y);
#else
  // Default: generic implementation from fputil.
  return fputil::copysign(x, y);
#endif // defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
}

} // namespace LIBC_NAMESPACE_DECL
8 changes: 7 additions & 1 deletion libc/src/math/generic/fabs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@

namespace LIBC_NAMESPACE_DECL {

LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return fputil::abs(x); }
// Public entry point for fabs on doubles. The implementation is selected at
// compile time by the __LIBC_MISC_MATH_BASIC_OPS_OPT macro.
LLVM_LIBC_FUNCTION(double, fabs, (double x)) {
#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
  // Opted in: let the compiler expand its own builtin.
  return __builtin_fabs(x);
#else
  // Default: generic implementation from fputil.
  return fputil::abs(x);
#endif // defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
}

} // namespace LIBC_NAMESPACE_DECL
8 changes: 7 additions & 1 deletion libc/src/math/generic/fabsf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@

namespace LIBC_NAMESPACE_DECL {

LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return fputil::abs(x); }
// Public entry point for fabs on floats. The implementation is selected at
// compile time by the __LIBC_MISC_MATH_BASIC_OPS_OPT macro.
LLVM_LIBC_FUNCTION(float, fabsf, (float x)) {
#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
  // Opted in: let the compiler expand its own builtin.
  return __builtin_fabsf(x);
#else
  // Default: generic implementation from fputil.
  return fputil::abs(x);
#endif // defined(__LIBC_MISC_MATH_BASIC_OPS_OPT)
}

} // namespace LIBC_NAMESPACE_DECL
13 changes: 12 additions & 1 deletion libc/src/math/generic/fabsf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,20 @@
#include "src/__support/FPUtil/BasicOperations.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/architectures.h"
#include "src/__support/macros/properties/compiler.h"

namespace LIBC_NAMESPACE_DECL {

LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) { return fputil::abs(x); }
// Public entry point for fabs on float16 values.
LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) {
  // The generic implementation is used when the builtin optimization is not
  // requested, and also for GCC on x86, where GCC generates better code from
  // the generic implementation.
  // https://godbolt.org/z/K9orM4hTa
#if !defined(__LIBC_MISC_MATH_BASIC_OPS_OPT) ||                                \
    (defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_GCC))
  return fputil::abs(x);
#else
  // Every other opted-in configuration uses the compiler builtin.
  return __builtin_fabsf16(x);
#endif
}

} // namespace LIBC_NAMESPACE_DECL
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "test/src/math/performance_testing/Timer.h"
Expand All @@ -28,11 +29,11 @@ template <typename T> class BinaryOpSingleOutputPerf {
static void run_perf_in_range(Func myFunc, Func otherFunc,
StorageType startingBit, StorageType endingBit,
size_t N, size_t rounds, std::ofstream &log) {
if (endingBit - startingBit < N)
N = endingBit - startingBit;
if (sizeof(StorageType) <= sizeof(size_t))
N = cpp::min(N, static_cast<size_t>(endingBit - startingBit));

auto runner = [=](Func func) {
volatile T result;
[[maybe_unused]] volatile T result;
if (endingBit < startingBit) {
return;
}
Expand Down
19 changes: 19 additions & 0 deletions libc/test/src/math/performance_testing/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,13 +95,17 @@ add_header_library(
single_input_single_output_diff
HDRS
SingleInputSingleOutputPerf.h
DEPENDS
libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
)

# Header library wrapping BinaryOpSingleOutputPerf.h, the perf harness for
# two-argument functions. The dependencies listed here correspond to the
# libc headers the harness includes (CPP/algorithm.h and FPUtil/FPBits.h).
add_header_library(
binary_op_single_output_diff
HDRS
BinaryOpSingleOutputPerf.h
DEPENDS
libc.src.__support.CPP.algorithm
libc.src.__support.FPUtil.fp_bits
)

Expand Down Expand Up @@ -402,3 +406,18 @@ add_perf_binary(
LINK_LIBRARIES
LibcFPTestHelpers
)

# Perf binary for the miscellaneous basic operations (fabsf, fabsf16,
# copysignf, copysignf16). It is built from misc_basic_ops_perf.cpp and uses
# both the single-input and the binary-op perf harness header libraries.
# NOTE(review): -fno-builtin presumably keeps the compiler from replacing the
# reference fabsf/copysignf calls in the benchmark with builtins, so that real
# function calls are timed - confirm.
add_perf_binary(
misc_basic_ops_perf
SRCS
misc_basic_ops_perf.cpp
DEPENDS
.binary_op_single_output_diff
.single_input_single_output_diff
libc.src.math.copysignf
libc.src.math.copysignf16
libc.src.math.fabsf
libc.src.math.fabsf16
COMPILE_OPTIONS
-fno-builtin
)
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/algorithm.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "test/src/math/performance_testing/Timer.h"
Expand All @@ -26,16 +27,21 @@ template <typename T> class SingleInputSingleOutputPerf {

static void runPerfInRange(Func myFunc, Func otherFunc,
StorageType startingBit, StorageType endingBit,
std::ofstream &log) {
size_t rounds, std::ofstream &log) {
size_t n = 10'010'001;
if (sizeof(StorageType) <= sizeof(size_t))
n = cpp::min(n, static_cast<size_t>(endingBit - startingBit));

auto runner = [=](Func func) {
constexpr StorageType N = 10'010'001;
StorageType step = (endingBit - startingBit) / N;
StorageType step = (endingBit - startingBit) / n;
if (step == 0)
step = 1;
volatile T result;
for (StorageType bits = startingBit; bits < endingBit; bits += step) {
T x = FPBits(bits).get_val();
result = func(x);
[[maybe_unused]] volatile T result;
for (size_t i = 0; i < rounds; i++) {
for (StorageType bits = startingBit; bits < endingBit; bits += step) {
T x = FPBits(bits).get_val();
result = func(x);
}
}
};

Expand All @@ -44,8 +50,7 @@ template <typename T> class SingleInputSingleOutputPerf {
runner(myFunc);
timer.stop();

StorageType numberOfRuns = endingBit - startingBit + 1;
double myAverage = static_cast<double>(timer.nanoseconds()) / numberOfRuns;
double myAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
log << "-- My function --\n";
log << " Total time : " << timer.nanoseconds() << " ns \n";
log << " Average runtime : " << myAverage << " ns/op \n";
Expand All @@ -56,8 +61,7 @@ template <typename T> class SingleInputSingleOutputPerf {
runner(otherFunc);
timer.stop();

double otherAverage =
static_cast<double>(timer.nanoseconds()) / numberOfRuns;
double otherAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
log << "-- Other function --\n";
log << " Total time : " << timer.nanoseconds() << " ns \n";
log << " Average runtime : " << otherAverage << " ns/op \n";
Expand All @@ -68,15 +72,18 @@ template <typename T> class SingleInputSingleOutputPerf {
log << " Mine / Other's : " << myAverage / otherAverage << " \n";
}

static void runPerf(Func myFunc, Func otherFunc, const char *logFile) {
static void runPerf(Func myFunc, Func otherFunc, size_t rounds,
const char *logFile) {
std::ofstream log(logFile);
log << " Performance tests with inputs in denormal range:\n";
runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0),
/* endingBit= */ FPBits::max_subnormal().uintval(), log);
/* endingBit= */ FPBits::max_subnormal().uintval(), rounds,
log);
log << "\n Performance tests with inputs in normal range:\n";
runPerfInRange(myFunc, otherFunc,
/* startingBit= */ FPBits::min_normal().uintval(),
/* endingBit= */ FPBits::max_normal().uintval(), log);
/* endingBit= */ FPBits::max_normal().uintval(), rounds,
log);
}
};

Expand All @@ -86,6 +93,13 @@ template <typename T> class SingleInputSingleOutputPerf {
#define SINGLE_INPUT_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename) \
int main() { \
LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf( \
&myFunc, &otherFunc, filename); \
&myFunc, &otherFunc, 1, filename); \
return 0; \
}

// Statement-style variant of SINGLE_INPUT_SINGLE_OUTPUT_PERF. Instead of
// defining main(), it expands to a braced block that calls
// SingleInputSingleOutputPerf<T>::runPerf, so several invocations can be placed
// inside one function body.
//   T         - floating point type under test.
//   myFunc    - function being measured (its address is taken).
//   otherFunc - reference function to compare against (its address is taken).
//   rounds    - forwarded to runPerf as the number of rounds.
//   filename  - path of the log file that runPerf writes.
// Because the expansion is a complete block, no trailing semicolon is needed.
#define SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds, \
filename) \
{ \
LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf( \
&myFunc, &otherFunc, rounds, filename); \
}
41 changes: 41 additions & 0 deletions libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//===-- Performance test for miscellaneous basic operations ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "BinaryOpSingleOutputPerf.h"
#include "SingleInputSingleOutputPerf.h"
#include "src/math/copysignf.h"
#include "src/math/copysignf16.h"
#include "src/math/fabsf.h"
#include "src/math/fabsf16.h"

#include <math.h>

// Round counts passed to the perf harness macros in main(): FLOAT16_ROUNDS for
// the float16 functions and FLOAT_ROUNDS for the float functions.
// NOTE(review): float16 presumably gets many more rounds because its input
// range holds far fewer bit patterns than float's - confirm.
static constexpr size_t FLOAT16_ROUNDS = 20'000;
static constexpr size_t FLOAT_ROUNDS = 40;

// LLVM libc might be the only libc implementation with support for float16 math
// functions currently. We can't compare our float16 functions against the
// system libc, so we compare them against these placeholder functions, which
// simply return their first argument.
float16 placeholder_unaryf16(float16 x) { return x; }
// The second operand is intentionally ignored; it is marked [[maybe_unused]] so
// that building with -Wunused-parameter does not warn.
float16 placeholder_binaryf16(float16 x, [[maybe_unused]] float16 y) {
  return x;
}

// Runs the four perf comparisons in order. Each comparison is given its own
// log file name. Each macro invocation is a complete statement, so no
// semicolons follow them.
int main() {
// float16 functions: compared against the placeholder functions above.
SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fabsf16,
placeholder_unaryf16, FLOAT16_ROUNDS,
"fabsf16_perf.log")
BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::copysignf16,
placeholder_binaryf16, FLOAT16_ROUNDS,
"copysignf16_perf.log")

// float functions: compared against fabsf/copysignf declared by <math.h>.
SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fabsf, fabsf,
FLOAT_ROUNDS, "fabsf_perf.log")
BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::copysignf, copysignf,
FLOAT_ROUNDS, "copysignf_perf.log")

return 0;
}
Loading