[libc++] Implement std::gcd using the binary version #77747

serge-sans-paille · 2024-01-11T10:29:02Z

The binary version is four times faster than current implementation in my setup, and generally considered a better implementation.

Code inspired by https://en.algorithmica.org/hpc/algorithms/gcd/ which itself is inspired by https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor/

Fix #77648

llvmbot · 2024-01-11T10:29:30Z

@llvm/pr-subscribers-libcxx

Author: None (serge-sans-paille)

Changes

The binary version is four times faster than current implementation in my setup, and generally considered a better implementation.

Code inspired by https://en.algorithmica.org/hpc/algorithms/gcd/ which itself is inspired by https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor/

Fix #77648

Full diff: https://github.com/llvm/llvm-project/pull/77747.diff

4 Files Affected:

(modified) libcxx/include/__bit/countr.h (+8-3)
(modified) libcxx/include/__numeric/gcd_lcm.h (+21-3)
(modified) libcxx/test/libcxx/transitive_includes/cxx26.csv (+23)
(modified) libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp (+26)

diff --git a/libcxx/include/__bit/countr.h b/libcxx/include/__bit/countr.h
index 0cc679f87a99d9..e47b8245be3cea 100644
--- a/libcxx/include/__bit/countr.h
+++ b/libcxx/include/__bit/countr.h
@@ -35,10 +35,8 @@ _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_ct
   return __builtin_ctzll(__x);
 }
 
-#if _LIBCPP_STD_VER >= 20
-
 template <__libcpp_unsigned_integer _Tp>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) noexcept {
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int __countr_zero(_Tp __t) noexcept {
   if (__t == 0)
     return numeric_limits<_Tp>::digits;
 
@@ -59,6 +57,13 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) n
   }
 }
 
+#if _LIBCPP_STD_VER >= 20
+
+template <__libcpp_unsigned_integer _Tp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) noexcept {
+  return __countr_zero(__t);
+}
+
 template <__libcpp_unsigned_integer _Tp>
 _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr int countr_one(_Tp __t) noexcept {
   return __t != numeric_limits<_Tp>::max() ? std::countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits;
diff --git a/libcxx/include/__numeric/gcd_lcm.h b/libcxx/include/__numeric/gcd_lcm.h
index 3e9c244f25c285..55199547fa4db2 100644
--- a/libcxx/include/__numeric/gcd_lcm.h
+++ b/libcxx/include/__numeric/gcd_lcm.h
@@ -10,7 +10,9 @@
 #ifndef _LIBCPP___NUMERIC_GCD_LCM_H
 #define _LIBCPP___NUMERIC_GCD_LCM_H
 
+#include <__algorithm/min.h>
 #include <__assert>
+#include <__bit/countr.h>
 #include <__config>
 #include <__type_traits/common_type.h>
 #include <__type_traits/is_integral.h>
@@ -50,9 +52,25 @@ struct __ct_abs<_Result, _Source, false> {
 };
 
 template <class _Tp>
-_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __m, _Tp __n) {
-  static_assert((!is_signed<_Tp>::value), "");
-  return __n == 0 ? __m : std::__gcd<_Tp>(__n, __m % __n);
+_LIBCPP_CONSTEXPR _LIBCPP_HIDDEN _Tp __gcd(_Tp __a, _Tp __b) {
+    static_assert((!is_signed<_Tp>::value), "");
+    if (__a == 0)
+        return __b;
+    if (__b == 0)
+        return __a;
+
+    int __az    = std::__countr_zero(__a);
+    int __bz    = std::__countr_zero(__b);
+    int __shift = std::min(__az, __bz);
+    __b >>= __bz;
+    while (__a != 0) {
+        __a >>= __az;
+        _Tp __absdiff = __a > __b ? __a - __b : __b - __a;
+        __b           = std::min(__a, __b);
+        __a           = __absdiff;
+        __az          = std::__countr_zero(__absdiff);
+    }
+    return __b << __shift;
 }
 
 template <class _Tp, class _Up>
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 0d8c3fa21b2f37..e3d325169a3fa8 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -173,6 +173,29 @@ experimental/simd limits
 experimental/type_traits initializer_list
 experimental/type_traits type_traits
 experimental/utility utility
+experimental/vector experimental/memory_resource
+experimental/vector vector
+ext/hash_map algorithm
+ext/hash_map cmath
+ext/hash_map cstddef
+ext/hash_map cstdint
+ext/hash_map cstring
+ext/hash_map functional
+ext/hash_map initializer_list
+ext/hash_map limits
+ext/hash_map new
+ext/hash_map stdexcept
+ext/hash_map string
+ext/hash_set algorithm
+ext/hash_set cmath
+ext/hash_set cstddef
+ext/hash_set cstdint
+ext/hash_set cstring
+ext/hash_set functional
+ext/hash_set initializer_list
+ext/hash_set limits
+ext/hash_set new
+ext/hash_set string
 filesystem compare
 filesystem cstddef
 filesystem cstdint
diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
index 831c226f9c8ea1..0435575bd8759a 100644
--- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp
@@ -17,6 +17,7 @@
 #include <cassert>
 #include <climits>
 #include <cstdint>
+#include <random>
 #include <type_traits>
 
 #include "test_macros.h"
@@ -48,6 +49,26 @@ constexpr bool test0(int in1, int in2, int out)
     return true;
 }
 
+template <typename T>
+T basic_gcd(T m, T n) {
+  return n == 0 ? m : basic_gcd<T>(n, m % n);
+}
+
+template <typename Input>
+void do_fuzzy_tests() {
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<Input> distrib;
+
+  constexpr int nb_rounds = 10000;
+  for(int i = 0; i < nb_rounds; ++i) {
+    Input n = distrib(gen);
+    Input m = distrib(gen);
+    assert(std::gcd(n, m) == basic_gcd(n, m));
+  }
+}
+
 
 template <typename Input1, typename Input2 = Input1>
 constexpr bool do_test(int = 0)
@@ -143,5 +164,10 @@ int main(int argc, char**)
     assert(res == 2);
     }
 
+    do_fuzzy_tests<std::uint8_t>();
+    do_fuzzy_tests<std::uint16_t>();
+    do_fuzzy_tests<std::uint32_t>();
+    do_fuzzy_tests<std::uint64_t>();
+
   return 0;
 }

github-actions · 2024-01-11T10:31:16Z

✅ With the latest revision this PR passed the C/C++ code formatter.

AdvenamTacet

Looks good to me % testing.
To be sure that we don't change std::gcd behavior, let's test (compare with naive gcd) also:

minimal and maximal values for different types.
all pairs of numbers around 0.

    for(T i = -9; i < 10; ++i)
        for(T j = -9; j < 10; ++j)

I don't see a benchmark requested on Phabricator.

@philnik777 do changes in transitive_includes look good to you?

libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp

AdvenamTacet

gcd implementation and gcd test look good to me with addressed comments.

Please, add a benchmark in libcxx/benchmarks/.

libcxx/include/__numeric/gcd_lcm.h

libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp

libcxx/include/__numeric/gcd_lcm.h

AdvenamTacet · 2024-01-12T11:15:42Z

Also, while for random values performance gain is consistent, it's worth to point out that it's possible to create benchmarks in which naive GCD is clearly faster. https://quick-bench.com/q/1y8T00mwK3t9EL__IqTKLgmTHwI

We may want to consider running naive GCD if one value is very small.

AdvenamTacet · 2024-01-18T05:54:47Z

Until now, I was not really thinking how this algorithm works, but I finally looked at it closer and I don't think we should upstream it.

The idea is based on:

If power of two divides both numbers, we can push it out.
- gcd( 2^x * a, 2^x * b) = 2^x * gcd(a, b)

If and only if exactly one number is even, we can divide that number by  that power. 
- if a, b are odd, then gcd(2^x * a, b) = gcd(a, b)

And standard gcd algorithm where instead of modulo, minus is used.

Based on that knowledge, it's possible to create an example where the new GCD is hundred times slower. https://quick-bench.com/q/4_kHmlvAN7nwcDaMqk3_6BSGaRM (Edit: notice that there is a problem with that benchmark, look at comments below)

Moreover, we still can build a test like that, even if we exclude very small values. This algorithm works well for random values, but unfortunately not for all values. https://quick-bench.com/q/vUh_PAKAip6GLvnH3FvgnmzdK74

Potential performance hit is so big that I don't think we can upstream it.

philnik777 · 2024-01-18T06:30:44Z

@AdvenamTacet You always have to be careful when writing benchmarks. If you look at the generated code, you see that Clang was able to constant-fold the native version and you only benchmark moving 0x1 into memory. While we should also consider this, it's not exactly a common case. If you avoid that the numbers look much better: https://quick-bench.com/q/KxLGz_xNDsVCEd3UUgBD6R9oPlY.
Much more importantly: I can produce similarly bad cases for just about any optimziation in the library. The question is how likely it is that this is an actual problem in the wild. I'd guess that it's not very common to calculate the gcd of (2^n)-1 and 1.
Given that, there are a few questions to answer:

In which cases is it faster or slower?
How often do we expect these cases to occur?
Would it be possible to detect these cases and decide which algorithm to use based on that? (e.g. with a popcount)
How much of a difference does it make if we don't have perfect branch prediction?

AdvenamTacet · 2024-01-18T07:41:40Z

@philnik777 thx for looking at it and fixing my benchmark! It's still big performance hit, but you are right that it's probably unlikely to see it in the wild often.

In which cases is it faster or slower?

Would it be possible to detect these cases and decide which algorithm to use based on that? (e.g. with a popcount)

My fast tests show that for random numbers or interval of numbers, it's unlikely to have worse performance with this new implementation when abs(__a) > 250 && abs(__b) > 250. Probably we can do something similar with a number chosen more carefully.

How often do we expect these cases to occur?

On every big interval I tested, implementation from this PR is faster, I did not try to check how often it's slower and how often it's faster. I can try to look at it soon.
Probably we should look at discrete logarithm for a better estimation.

On average, it may have a positive effect, if we are ok with big slowdown in some rare cases.
https://grep.app/search?q=std%3A%3Agcd%28
https://grep.app/search?current=3&q=gcd%28

lemire · 2024-04-22T18:55:38Z

it's possible to create an example where the new GCD is hundred times slower (...) big slowdown in some rare cases.

The algorithmic complexity of the binary GCD is the number of bits in the largest of the two integers.

For the benchmark in question, it should go down as...

u: 1, v: 16777214
u: 1, v: 8388606
u: 1, v: 4194302
u: 1, v: 2097150
u: 1, v: 1048574
u: 1, v: 524286
u: 1, v: 262142
u: 1, v: 131070
u: 1, v: 65534
u: 1, v: 32766
u: 1, v: 16382
u: 1, v: 8190
u: 1, v: 4094
u: 1, v: 2046
u: 1, v: 1022
u: 1, v: 510
u: 1, v: 254
u: 1, v: 126
u: 1, v: 62
u: 1, v: 30
u: 1, v: 14
u: 1, v: 6
u: 1, v: 2

So it is difficult to imagine that that there could be gigantic differences.

lemire · 2024-04-22T19:10:38Z

Note that my statement is not meant to contradict @AdvenamTacet but to point out that you can actually bound the difference from first principles. Algorithmically speaking, the difference cannot be excessively large, especially because each step in the binary GCD is cheaper. So it is definitively true that the naive algorithm can be sometimes faster, but there is denial-of-service attack lurking. :-)

AdvenamTacet · 2024-04-24T21:38:21Z

That change looks good to me and resolves my performance concerns. Overall, LGTM with my previous in-code comments addressed.

serge-sans-paille · 2024-05-02T07:10:02Z

@AdvenamTacet does this version look good to you?

AdvenamTacet

I have no more performance concerns and LGTM % two hanging comments.

Please add to test values numbers around limits instead of only limits:

      std::numeric_limits<Input>::min(),
      std::numeric_limits<Input>::min() + 1,
      std::numeric_limits<Input>::min() + 2,
      std::numeric_limits<Input>::max(),
      std::numeric_limits<Input>::max() - 1,
      std::numeric_limits<Input>::max() - 2,

could you also comment on that change:

-      return -static_cast<_Result>(__t);
+      return static_cast<_Result>(-static_cast<std::make_unsigned_t<_Result>>(__t));

Why that change? When does it change the result?

The binary version is four times faster than current implementation in my setup, and generally considered a better implementation. Code inspired by https://en.algorithmica.org/hpc/algorithms/gcd/ which itself is inspired by https://lemire.me/blog/2013/12/26/fastest-way-to-compute-the-greatest-common-divisor/ Hybrid approach and benchmarks inspired by `ylchapuy <https://github.com/ylchapuy>`. Fix llvm#77648

serge-sans-paille · 2024-05-04T21:26:02Z

I have no more performance concerns and LGTM % two hanging comments.

Please add to test values numbers around limits instead of only limits:
      std::numeric_limits<Input>::min(),
      std::numeric_limits<Input>::min() + 1,
      std::numeric_limits<Input>::min() + 2,
      std::numeric_limits<Input>::max(),
      std::numeric_limits<Input>::max() - 1,
      std::numeric_limits<Input>::max() - 2,

done

could you also comment on that change:
-      return -static_cast<_Result>(__t);
+      return static_cast<_Result>(-static_cast<std::make_unsigned_t<_Result>>(__t));
Why that change? When does it change the result?

I wasn't aware that gcd is UB when one of its argument absolute value cannot be represented in the result type. I dismissed this change.

serge-sans-paille · 2024-05-06T04:58:43Z

@AdvenamTacet I'm going to merge this tomorrow, if that's fine with you?

AdvenamTacet · 2024-05-06T06:36:34Z

I think it's ready and I support that change. I would be happy, if someone else gives their positive opinion about the change in transitive includes, because I don't know rules of modifying them, but [except that] I see nothing to change and I support the PR. Don't consider me blocking the merge.

hiraditya · 2024-11-21T22:26:04Z

We should test this algorithm with ubsan

philnik777 · 2024-11-21T22:38:36Z

We should test this algorithm with ubsan

We do. Every test is run with UBSan in the CI.

serge-sans-paille requested a review from a team as a code owner January 11, 2024 10:29

llvmbot added the libc++ libc++ C++ Standard Library. Not GNU libstdc++. Not libc++abi. label Jan 11, 2024

serge-sans-paille requested a review from AdvenamTacet January 11, 2024 10:29

serge-sans-paille force-pushed the fast/gcd branch 8 times, most recently from 5d7517c to 335d34b Compare January 11, 2024 14:19

AdvenamTacet requested changes Jan 11, 2024

View reviewed changes

libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp Outdated Show resolved Hide resolved

libcxx/test/std/numerics/numeric.ops/numeric.ops.gcd/gcd.pass.cpp Show resolved Hide resolved

AdvenamTacet requested a review from philnik777 January 11, 2024 14:56

AdvenamTacet self-assigned this Jan 11, 2024

AdvenamTacet added the performance label Jan 11, 2024

serge-sans-paille force-pushed the fast/gcd branch 5 times, most recently from bc0e98d to 9cc8154 Compare January 12, 2024 09:05

AdvenamTacet requested changes Jan 12, 2024

View reviewed changes

AdvenamTacet mentioned this pull request Jan 18, 2024

[D145982] [libc++] Implement std::gcd using the binary version #77648

Closed

serge-sans-paille force-pushed the fast/gcd branch 8 times, most recently from baed224 to 59bbb53 Compare April 29, 2024 12:26

AdvenamTacet approved these changes May 2, 2024

View reviewed changes

serge-sans-paille force-pushed the fast/gcd branch 10 times, most recently from 26c142f to a8a1179 Compare May 4, 2024 18:16

serge-sans-paille force-pushed the fast/gcd branch from a8a1179 to 8ea797c Compare May 4, 2024 20:23

serge-sans-paille merged commit 27a062e into llvm:main May 8, 2024

hiraditya mentioned this pull request Nov 21, 2024

ubsan: sub-overflow in gcd after #77747 #117249

Closed

[libc++] Implement std::gcd using the binary version #77747

[libc++] Implement std::gcd using the binary version #77747

Uh oh!

Conversation

serge-sans-paille commented Jan 11, 2024

Uh oh!

llvmbot commented Jan 11, 2024

Uh oh!

github-actions bot commented Jan 11, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

AdvenamTacet left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

AdvenamTacet left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

AdvenamTacet commented Jan 12, 2024

Uh oh!

AdvenamTacet commented Jan 18, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

philnik777 commented Jan 18, 2024

Uh oh!

AdvenamTacet commented Jan 18, 2024

Uh oh!

lemire commented Apr 22, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

lemire commented Apr 22, 2024

Uh oh!

AdvenamTacet commented Apr 24, 2024

Uh oh!

serge-sans-paille commented May 2, 2024

Uh oh!

AdvenamTacet left a comment

Choose a reason for hiding this comment

Uh oh!

serge-sans-paille commented May 4, 2024

Uh oh!

serge-sans-paille commented May 6, 2024

Uh oh!

AdvenamTacet commented May 6, 2024

Uh oh!

hiraditya commented Nov 21, 2024

Uh oh!

philnik777 commented Nov 21, 2024

Uh oh!

Uh oh!

github-actions bot commented Jan 11, 2024 •

edited

Loading

AdvenamTacet commented Jan 18, 2024 •

edited

Loading

lemire commented Apr 22, 2024 •

edited

Loading