Skip to content

Commit 301eb20

Browse files
overmighty authored and yuxuanchen1997 committed
[libc][math] Optimize copysign{,f,f16} and fabs{,f,f16} with builtins when available (#99037)
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251098
1 parent 3cf28ad commit 301eb20

File tree

13 files changed

+155
-24
lines changed

13 files changed

+155
-24
lines changed

libc/cmake/modules/LLVMLibCCompileOptionRules.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ function(_get_compile_options_from_flags output_var)
66
endif()
77
check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN})
88
check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN})
9+
check_flag(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG ${MISC_MATH_BASIC_OPS_OPT_FLAG} ${ARGN})
910

1011
if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
1112
if(ADD_FMA_FLAG)
@@ -37,6 +38,9 @@ function(_get_compile_options_from_flags output_var)
3738
if(ADD_EXPLICIT_SIMD_OPT_FLAG)
3839
list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT")
3940
endif()
41+
if(ADD_MISC_MATH_BASIC_OPS_OPT_FLAG)
42+
list(APPEND compile_options "-D__LIBC_MISC_MATH_BASIC_OPS_OPT")
43+
endif()
4044
elseif(MSVC)
4145
if(ADD_FMA_FLAG)
4246
list(APPEND compile_options "/arch:AVX2")

libc/cmake/modules/LLVMLibCFlagRules.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,9 @@ set(FMA_OPT_FLAG "FMA_OPT")
263263
set(ROUND_OPT_FLAG "ROUND_OPT")
264264
# This flag controls whether we use explicit SIMD instructions or not.
265265
set(EXPLICIT_SIMD_OPT_FLAG "EXPLICIT_SIMD_OPT")
266+
# This flag controls whether we use compiler builtin functions to implement
267+
# various basic math operations or not.
268+
set(MISC_MATH_BASIC_OPS_OPT_FLAG "MISC_MATH_BASIC_OPS_OPT")
266269

267270
# Skip FMA_OPT flag for targets that don't support fma.
268271
if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "FMA")) OR

libc/src/math/generic/CMakeLists.txt

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,9 @@ add_entrypoint_object(
424424
DEPENDS
425425
libc.src.__support.FPUtil.basic_operations
426426
COMPILE_OPTIONS
427-
-O2
427+
-O3
428+
FLAGS
429+
MISC_MATH_BASIC_OPS_OPT
428430
)
429431

430432
add_entrypoint_object(
@@ -436,7 +438,9 @@ add_entrypoint_object(
436438
DEPENDS
437439
libc.src.__support.FPUtil.basic_operations
438440
COMPILE_OPTIONS
439-
-O2
441+
-O3
442+
FLAGS
443+
MISC_MATH_BASIC_OPS_OPT
440444
)
441445

442446
add_entrypoint_object(
@@ -448,7 +452,7 @@ add_entrypoint_object(
448452
DEPENDS
449453
libc.src.__support.FPUtil.basic_operations
450454
COMPILE_OPTIONS
451-
-O2
455+
-O3
452456
)
453457

454458
add_entrypoint_object(
@@ -460,8 +464,12 @@ add_entrypoint_object(
460464
DEPENDS
461465
libc.src.__support.macros.properties.types
462466
libc.src.__support.FPUtil.basic_operations
467+
libc.src.__support.macros.properties.architectures
468+
libc.src.__support.macros.properties.compiler
463469
COMPILE_OPTIONS
464470
-O3
471+
FLAGS
472+
MISC_MATH_BASIC_OPS_OPT
465473
)
466474

467475
add_entrypoint_object(
@@ -1443,6 +1451,8 @@ add_entrypoint_object(
14431451
libc.src.__support.FPUtil.manipulation_functions
14441452
COMPILE_OPTIONS
14451453
-O3
1454+
FLAGS
1455+
MISC_MATH_BASIC_OPS_OPT
14461456
)
14471457

14481458
add_entrypoint_object(
@@ -1455,6 +1465,8 @@ add_entrypoint_object(
14551465
libc.src.__support.FPUtil.manipulation_functions
14561466
COMPILE_OPTIONS
14571467
-O3
1468+
FLAGS
1469+
MISC_MATH_BASIC_OPS_OPT
14581470
)
14591471

14601472
add_entrypoint_object(
@@ -1480,6 +1492,8 @@ add_entrypoint_object(
14801492
libc.src.__support.FPUtil.manipulation_functions
14811493
COMPILE_OPTIONS
14821494
-O3
1495+
FLAGS
1496+
MISC_MATH_BASIC_OPS_OPT
14831497
)
14841498

14851499
add_entrypoint_object(

libc/src/math/generic/copysign.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414
namespace LIBC_NAMESPACE_DECL {
1515

1616
LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) {
17+
#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
18+
return __builtin_copysign(x, y);
19+
#else
1720
return fputil::copysign(x, y);
21+
#endif
1822
}
1923

2024
} // namespace LIBC_NAMESPACE_DECL

libc/src/math/generic/copysignf.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414
namespace LIBC_NAMESPACE_DECL {
1515

1616
LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) {
17+
#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
18+
return __builtin_copysignf(x, y);
19+
#else
1720
return fputil::copysign(x, y);
21+
#endif
1822
}
1923

2024
} // namespace LIBC_NAMESPACE_DECL

libc/src/math/generic/copysignf16.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414
namespace LIBC_NAMESPACE_DECL {
1515

1616
LLVM_LIBC_FUNCTION(float16, copysignf16, (float16 x, float16 y)) {
17+
#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
18+
return __builtin_copysignf16(x, y);
19+
#else
1720
return fputil::copysign(x, y);
21+
#endif
1822
}
1923

2024
} // namespace LIBC_NAMESPACE_DECL

libc/src/math/generic/fabs.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313

1414
namespace LIBC_NAMESPACE_DECL {
1515

16-
LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return fputil::abs(x); }
16+
LLVM_LIBC_FUNCTION(double, fabs, (double x)) {
17+
#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
18+
return __builtin_fabs(x);
19+
#else
20+
return fputil::abs(x);
21+
#endif
22+
}
1723

1824
} // namespace LIBC_NAMESPACE_DECL

libc/src/math/generic/fabsf.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313

1414
namespace LIBC_NAMESPACE_DECL {
1515

16-
LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return fputil::abs(x); }
16+
LLVM_LIBC_FUNCTION(float, fabsf, (float x)) {
17+
#ifdef __LIBC_MISC_MATH_BASIC_OPS_OPT
18+
return __builtin_fabsf(x);
19+
#else
20+
return fputil::abs(x);
21+
#endif
22+
}
1723

1824
} // namespace LIBC_NAMESPACE_DECL

libc/src/math/generic/fabsf16.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,20 @@
1010
#include "src/__support/FPUtil/BasicOperations.h"
1111
#include "src/__support/common.h"
1212
#include "src/__support/macros/config.h"
13+
#include "src/__support/macros/properties/architectures.h"
14+
#include "src/__support/macros/properties/compiler.h"
1315

1416
namespace LIBC_NAMESPACE_DECL {
1517

16-
LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) { return fputil::abs(x); }
18+
LLVM_LIBC_FUNCTION(float16, fabsf16, (float16 x)) {
19+
// For x86, GCC generates better code from the generic implementation.
20+
// https://godbolt.org/z/K9orM4hTa
21+
#if defined(__LIBC_MISC_MATH_BASIC_OPS_OPT) && \
22+
!(defined(LIBC_TARGET_ARCH_IS_X86) && defined(LIBC_COMPILER_IS_GCC))
23+
return __builtin_fabsf16(x);
24+
#else
25+
return fputil::abs(x);
26+
#endif
27+
}
1728

1829
} // namespace LIBC_NAMESPACE_DECL

libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include "src/__support/CPP/algorithm.h"
910
#include "src/__support/FPUtil/FPBits.h"
1011
#include "src/__support/macros/config.h"
1112
#include "test/src/math/performance_testing/Timer.h"
@@ -28,11 +29,11 @@ template <typename T> class BinaryOpSingleOutputPerf {
2829
static void run_perf_in_range(Func myFunc, Func otherFunc,
2930
StorageType startingBit, StorageType endingBit,
3031
size_t N, size_t rounds, std::ofstream &log) {
31-
if (endingBit - startingBit < N)
32-
N = endingBit - startingBit;
32+
if (sizeof(StorageType) <= sizeof(size_t))
33+
N = cpp::min(N, static_cast<size_t>(endingBit - startingBit));
3334

3435
auto runner = [=](Func func) {
35-
volatile T result;
36+
[[maybe_unused]] volatile T result;
3637
if (endingBit < startingBit) {
3738
return;
3839
}

libc/test/src/math/performance_testing/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,13 +95,17 @@ add_header_library(
9595
single_input_single_output_diff
9696
HDRS
9797
SingleInputSingleOutputPerf.h
98+
DEPENDS
99+
libc.src.__support.CPP.algorithm
100+
libc.src.__support.FPUtil.fp_bits
98101
)
99102

100103
add_header_library(
101104
binary_op_single_output_diff
102105
HDRS
103106
BinaryOpSingleOutputPerf.h
104107
DEPENDS
108+
libc.src.__support.CPP.algorithm
105109
libc.src.__support.FPUtil.fp_bits
106110
)
107111

@@ -402,3 +406,18 @@ add_perf_binary(
402406
LINK_LIBRARIES
403407
LibcFPTestHelpers
404408
)
409+
410+
add_perf_binary(
411+
misc_basic_ops_perf
412+
SRCS
413+
misc_basic_ops_perf.cpp
414+
DEPENDS
415+
.binary_op_single_output_diff
416+
.single_input_single_output_diff
417+
libc.src.math.copysignf
418+
libc.src.math.copysignf16
419+
libc.src.math.fabsf
420+
libc.src.math.fabsf16
421+
COMPILE_OPTIONS
422+
-fno-builtin
423+
)

libc/test/src/math/performance_testing/SingleInputSingleOutputPerf.h

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include "src/__support/CPP/algorithm.h"
910
#include "src/__support/FPUtil/FPBits.h"
1011
#include "src/__support/macros/config.h"
1112
#include "test/src/math/performance_testing/Timer.h"
@@ -26,16 +27,21 @@ template <typename T> class SingleInputSingleOutputPerf {
2627

2728
static void runPerfInRange(Func myFunc, Func otherFunc,
2829
StorageType startingBit, StorageType endingBit,
29-
std::ofstream &log) {
30+
size_t rounds, std::ofstream &log) {
31+
size_t n = 10'010'001;
32+
if (sizeof(StorageType) <= sizeof(size_t))
33+
n = cpp::min(n, static_cast<size_t>(endingBit - startingBit));
34+
3035
auto runner = [=](Func func) {
31-
constexpr StorageType N = 10'010'001;
32-
StorageType step = (endingBit - startingBit) / N;
36+
StorageType step = (endingBit - startingBit) / n;
3337
if (step == 0)
3438
step = 1;
35-
volatile T result;
36-
for (StorageType bits = startingBit; bits < endingBit; bits += step) {
37-
T x = FPBits(bits).get_val();
38-
result = func(x);
39+
[[maybe_unused]] volatile T result;
40+
for (size_t i = 0; i < rounds; i++) {
41+
for (StorageType bits = startingBit; bits < endingBit; bits += step) {
42+
T x = FPBits(bits).get_val();
43+
result = func(x);
44+
}
3945
}
4046
};
4147

@@ -44,8 +50,7 @@ template <typename T> class SingleInputSingleOutputPerf {
4450
runner(myFunc);
4551
timer.stop();
4652

47-
StorageType numberOfRuns = endingBit - startingBit + 1;
48-
double myAverage = static_cast<double>(timer.nanoseconds()) / numberOfRuns;
53+
double myAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
4954
log << "-- My function --\n";
5055
log << " Total time : " << timer.nanoseconds() << " ns \n";
5156
log << " Average runtime : " << myAverage << " ns/op \n";
@@ -56,8 +61,7 @@ template <typename T> class SingleInputSingleOutputPerf {
5661
runner(otherFunc);
5762
timer.stop();
5863

59-
double otherAverage =
60-
static_cast<double>(timer.nanoseconds()) / numberOfRuns;
64+
double otherAverage = static_cast<double>(timer.nanoseconds()) / n / rounds;
6165
log << "-- Other function --\n";
6266
log << " Total time : " << timer.nanoseconds() << " ns \n";
6367
log << " Average runtime : " << otherAverage << " ns/op \n";
@@ -68,15 +72,18 @@ template <typename T> class SingleInputSingleOutputPerf {
6872
log << " Mine / Other's : " << myAverage / otherAverage << " \n";
6973
}
7074

71-
static void runPerf(Func myFunc, Func otherFunc, const char *logFile) {
75+
static void runPerf(Func myFunc, Func otherFunc, size_t rounds,
76+
const char *logFile) {
7277
std::ofstream log(logFile);
7378
log << " Performance tests with inputs in denormal range:\n";
7479
runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0),
75-
/* endingBit= */ FPBits::max_subnormal().uintval(), log);
80+
/* endingBit= */ FPBits::max_subnormal().uintval(), rounds,
81+
log);
7682
log << "\n Performance tests with inputs in normal range:\n";
7783
runPerfInRange(myFunc, otherFunc,
7884
/* startingBit= */ FPBits::min_normal().uintval(),
79-
/* endingBit= */ FPBits::max_normal().uintval(), log);
85+
/* endingBit= */ FPBits::max_normal().uintval(), rounds,
86+
log);
8087
}
8188
};
8289

@@ -86,6 +93,13 @@ template <typename T> class SingleInputSingleOutputPerf {
8693
#define SINGLE_INPUT_SINGLE_OUTPUT_PERF(T, myFunc, otherFunc, filename) \
8794
int main() { \
8895
LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf( \
89-
&myFunc, &otherFunc, filename); \
96+
&myFunc, &otherFunc, 1, filename); \
9097
return 0; \
9198
}
99+
100+
#define SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(T, myFunc, otherFunc, rounds, \
101+
filename) \
102+
{ \
103+
LIBC_NAMESPACE::testing::SingleInputSingleOutputPerf<T>::runPerf( \
104+
&myFunc, &otherFunc, rounds, filename); \
105+
}

libc/test/src/math/performance_testing/misc_basic_ops_perf.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//===-- Performance test for miscellaneous basic operations ---------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "BinaryOpSingleOutputPerf.h"
10+
#include "SingleInputSingleOutputPerf.h"
11+
#include "src/math/copysignf.h"
12+
#include "src/math/copysignf16.h"
13+
#include "src/math/fabsf.h"
14+
#include "src/math/fabsf16.h"
15+
16+
#include <math.h>
17+
18+
static constexpr size_t FLOAT16_ROUNDS = 20'000;
19+
static constexpr size_t FLOAT_ROUNDS = 40;
20+
21+
// LLVM libc might be the only libc implementation with support for float16 math
22+
// functions currently. We can't compare our float16 functions against the
23+
// system libc, so we compare them against these placeholder functions.
24+
float16 placeholder_unaryf16(float16 x) { return x; }
25+
float16 placeholder_binaryf16(float16 x, float16 y) { return x; }
26+
27+
int main() {
28+
SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::fabsf16,
29+
placeholder_unaryf16, FLOAT16_ROUNDS,
30+
"fabsf16_perf.log")
31+
BINARY_OP_SINGLE_OUTPUT_PERF_EX(float16, LIBC_NAMESPACE::copysignf16,
32+
placeholder_binaryf16, FLOAT16_ROUNDS,
33+
"copysignf16_perf.log")
34+
35+
SINGLE_INPUT_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::fabsf, fabsf,
36+
FLOAT_ROUNDS, "fabsf_perf.log")
37+
BINARY_OP_SINGLE_OUTPUT_PERF_EX(float, LIBC_NAMESPACE::copysignf, copysignf,
38+
FLOAT_ROUNDS, "copysignf_perf.log")
39+
40+
return 0;
41+
}

0 commit comments

Comments
 (0)