Skip to content

Commit f8afc53

Browse files
authored
[libc++] Speed up classic locale (#72112)
Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be very contended. But it's never destroyed, so the reference counting is also completely pointless on the classic locale. Currently ~70% of time in the parallel stringstream benchmarks is spent in locale ctor/dtor. And the execution radically slows down with more threads.

Avoid reference counting on the classic locale. With this change parallel benchmarks start to scale with threads.

Co-authored-by: Louis Dionne <[email protected]>

```
                              │   baseline    │               optimized                │
                              │    sec/op     │    sec/op      vs base                 │
Istream_numbers/0/threads:1       4.672µ ± 0%     4.419µ ± 0%    -5.42% (p=0.000 n=30+39)
Istream_numbers/0/threads:72    539.817µ ± 0%     9.842µ ± 1%   -98.18% (p=0.000 n=30+40)
Istream_numbers/1/threads:1       4.890µ ± 0%     4.750µ ± 0%    -2.85% (p=0.000 n=30+40)
Istream_numbers/1/threads:72      66.44µ ± 1%     10.14µ ± 1%   -84.74% (p=0.000 n=30+40)
Istream_numbers/2/threads:1       4.888µ ± 0%     4.746µ ± 0%    -2.92% (p=0.000 n=30+40)
Istream_numbers/2/threads:72      494.8µ ± 0%     410.2µ ± 1%   -17.11% (p=0.000 n=30+40)
Istream_numbers/3/threads:1       4.697µ ± 0%     4.695µ ± 5%         ~ (p=0.391 n=30+37)
Istream_numbers/3/threads:72      421.5µ ± 7%     421.9µ ± 9%         ~ (p=0.665 n=30)
Ostream_number/0/threads:1        183.0n ± 0%     141.0n ± 2%   -22.95% (p=0.000 n=30)
Ostream_number/0/threads:72     24196.5n ± 1%     343.5n ± 3%   -98.58% (p=0.000 n=30)
Ostream_number/1/threads:1        250.0n ± 0%     196.0n ± 2%   -21.60% (p=0.000 n=30)
Ostream_number/1/threads:72     16260.5n ± 0%     407.0n ± 2%   -97.50% (p=0.000 n=30)
Ostream_number/2/threads:1        254.0n ± 0%     196.0n ± 1%   -22.83% (p=0.000 n=30)
Ostream_number/2/threads:72       28.49µ ± 1%     18.89µ ± 5%   -33.72% (p=0.000 n=30)
Ostream_number/3/threads:1        185.0n ± 0%     185.0n ± 0%     0.00% (p=0.017 n=30)
Ostream_number/3/threads:72       19.38µ ± 4%     19.33µ ± 5%         ~ (p=0.425 n=30)
```
1 parent e893242 commit f8afc53

File tree

6 files changed

+164
-55
lines changed

6 files changed

+164
-55
lines changed
Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
#include "benchmark/benchmark.h"
22
#include "test_macros.h"
33

4+
#include <mutex>
45
#include <sstream>
56

67
// Forward declaration that attaches TEST_NOINLINE (from test_macros.h;
// presumably a no-inline attribute -- confirm there) to the benchmarked
// function. The parameter list has to match the definition that follows:
// with an empty parameter list this would declare a separate overload that is
// never defined, and the attribute would not apply to the function that the
// benchmark actually calls.
TEST_NOINLINE double istream_numbers(std::locale* loc);
78

8-
double istream_numbers() {
9+
double istream_numbers(std::locale* loc) {
910
const char* a[] = {"-6 69 -71 2.4882e-02 -100 101 -2.00005 5000000 -50000000",
1011
"-25 71 7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
1112
"-14 53 46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,73 @@ double istream_numbers() {
1415
double f1 = 0.0, f2 = 0.0, q = 0.0;
1516
for (int i = 0; i < 3; i++) {
1617
std::istringstream s(a[i]);
18+
if (loc)
19+
s.imbue(*loc);
1720
s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
1821
q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
1922
}
2023
return q;
2124
}
2225

26+
struct LocaleSelector {
27+
std::locale* imbue;
28+
std::locale old;
29+
static std::mutex mutex;
30+
31+
LocaleSelector(benchmark::State& state) {
32+
std::lock_guard guard(mutex);
33+
switch (state.range(0)) {
34+
case 0: {
35+
old = std::locale::global(std::locale::classic());
36+
imbue = nullptr;
37+
break;
38+
}
39+
case 1: {
40+
old = std::locale::global(std::locale::classic());
41+
thread_local std::locale loc("en_US.UTF-8");
42+
imbue = &loc;
43+
break;
44+
}
45+
case 2: {
46+
old = std::locale::global(std::locale::classic());
47+
static std::locale loc("en_US.UTF-8");
48+
imbue = &loc;
49+
break;
50+
}
51+
case 3: {
52+
old = std::locale::global(std::locale("en_US.UTF-8"));
53+
imbue = nullptr;
54+
break;
55+
}
56+
}
57+
}
58+
59+
~LocaleSelector() {
60+
std::lock_guard guard(mutex);
61+
std::locale::global(old);
62+
}
63+
};
64+
65+
std::mutex LocaleSelector::mutex;
66+
2367
static void BM_Istream_numbers(benchmark::State& state) {
68+
LocaleSelector sel(state);
2469
double i = 0;
2570
while (state.KeepRunning())
26-
benchmark::DoNotOptimize(i += istream_numbers());
71+
benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
72+
}
73+
BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
74+
75+
// Benchmark body: on every iteration build a fresh std::ostringstream,
// optionally imbue the locale chosen by LocaleSelector, format the integer 0
// and keep the resulting string observable so the work is not optimized away.
static void BM_Ostream_number(benchmark::State& state) {
  LocaleSelector selector(state);
  // Read the selected locale once; it does not change while the benchmark runs.
  std::locale* const custom_locale = selector.imbue;
  while (state.KeepRunning()) {
    std::ostringstream out;
    if (custom_locale != nullptr) {
      out.imbue(*custom_locale);
    }
    out << 0;
    benchmark::DoNotOptimize(out.str().c_str());
  }
}
85+
BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
2886

29-
BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
3087
BENCHMARK_MAIN();

libcxx/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,7 @@ set(files
849849
__utility/integer_sequence.h
850850
__utility/is_pointer_in_range.h
851851
__utility/move.h
852+
__utility/no_destroy.h
852853
__utility/pair.h
853854
__utility/piecewise_construct.h
854855
__utility/priority_tag.h

libcxx/include/__locale

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <__memory/shared_ptr.h> // __shared_count
1616
#include <__mutex/once_flag.h>
1717
#include <__type_traits/make_unsigned.h>
18+
#include <__utility/no_destroy.h>
1819
#include <cctype>
1920
#include <clocale>
2021
#include <cstdint>

libcxx/include/__utility/no_destroy.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef _LIBCPP___UTILITY_NO_DESTROY_H
10+
#define _LIBCPP___UTILITY_NO_DESTROY_H
11+
12+
#include <__config>
13+
#include <__utility/forward.h>
14+
15+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
16+
# pragma GCC system_header
17+
#endif
18+
19+
_LIBCPP_BEGIN_NAMESPACE_STD
20+
21+
struct __uninitialized_tag {};
22+
23+
// This class stores an object of type _Tp but never destroys it.
24+
//
25+
// This is akin to using __attribute__((no_destroy)), except that it is possible
26+
// to control the lifetime of the object with more flexibility by deciding e.g.
27+
// whether to initialize the object at construction or to defer to a later
28+
// initialization using __emplace.
29+
template <class _Tp>
30+
struct __no_destroy {
31+
_LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) {}
32+
_LIBCPP_HIDE_FROM_ABI ~__no_destroy() {
33+
// nothing
34+
}
35+
36+
template <class... _Args>
37+
_LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args) : __obj_(std::forward<_Args>(__args)...) {}
38+
39+
template <class... _Args>
40+
_LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
41+
new (&__obj_) _Tp(std::forward<_Args>(__args)...);
42+
return __obj_;
43+
}
44+
45+
_LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
46+
_LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
47+
48+
private:
49+
union {
50+
_Tp __obj_;
51+
};
52+
};
53+
54+
_LIBCPP_END_NAMESPACE_STD
55+
56+
#endif // _LIBCPP___UTILITY_NO_DESTROY_H

libcxx/include/module.modulemap.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2087,6 +2087,7 @@ module std_private_utility_move [system] {
20872087
export std_private_type_traits_is_nothrow_move_constructible
20882088
export std_private_type_traits_remove_reference
20892089
}
2090+
module std_private_utility_no_destroy [system] { header "__utility/no_destroy.h" }
20902091
module std_private_utility_pair [system] {
20912092
header "__utility/pair.h"
20922093
export std_private_ranges_subrange_fwd

libcxx/src/locale.cpp

Lines changed: 45 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <__utility/no_destroy.h>
910
#include <algorithm>
1011
#include <clocale>
1112
#include <codecvt>
@@ -81,9 +82,8 @@ locale_t __cloc() {
8182

8283
namespace {
8384

84-
struct release
85-
{
86-
void operator()(locale::facet* p) {p->__release_shared();}
85+
struct releaser {
86+
void operator()(locale::facet* p) { p->__release_shared(); }
8787
};
8888

8989
template <class T, class ...Args>
@@ -155,7 +155,11 @@ class _LIBCPP_HIDDEN locale::__imp
155155
{return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
156156
const locale::facet* use_facet(long id) const;
157157

158-
private:
158+
void acquire();
159+
void release();
160+
static __no_destroy<__imp> classic_locale_imp_;
161+
162+
private:
159163
void install(facet* f, long id);
160164
template <class F> void install(F* f) {install(f, f->id.__get());}
161165
template <class F> void install_from(const __imp& other);
@@ -500,7 +504,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
500504
name_("*")
501505
{
502506
f->__add_shared();
503-
unique_ptr<facet, release> hold(f);
507+
unique_ptr<facet, releaser> hold(f);
504508
facets_ = other.facets_;
505509
for (unsigned i = 0; i < other.facets_.size(); ++i)
506510
if (facets_[i])
@@ -519,7 +523,7 @@ void
519523
locale::__imp::install(facet* f, long id)
520524
{
521525
f->__add_shared();
522-
unique_ptr<facet, release> hold(f);
526+
unique_ptr<facet, releaser> hold(f);
523527
if (static_cast<size_t>(id) >= facets_.size())
524528
facets_.resize(static_cast<size_t>(id+1));
525529
if (facets_[static_cast<size_t>(id)])
@@ -537,89 +541,78 @@ locale::__imp::use_facet(long id) const
537541

538542
// locale
539543

540-
// This class basically implements __attribute__((no_destroy)), which isn't supported
541-
// by GCC as of writing this.
542-
template <class T>
543-
struct __no_destroy {
544-
template <class... Args>
545-
explicit __no_destroy(Args&&... args) {
546-
T* obj = reinterpret_cast<T*>(&buf);
547-
new (obj) T(std::forward<Args>(args)...);
548-
}
549-
550-
T& get() { return *reinterpret_cast<T*>(&buf); }
551-
T const& get() const { return *reinterpret_cast<T const*>(&buf); }
552-
553-
private:
554-
alignas(T) byte buf[sizeof(T)];
555-
};
544+
// We don't do reference counting on the classic locale.
545+
// It's never destroyed anyway, but atomic reference counting may be very
546+
// expensive in parallel applications. The classic locale is used by default
547+
// in all streams. Note: if a new global locale is installed, then we lose
548+
// the benefit of no reference counting.
549+
__no_destroy<locale::__imp> locale::__imp::classic_locale_imp_(__uninitialized_tag{}); // initialized below in classic()
556550

557551
const locale& locale::classic() {
558-
static const __no_destroy<locale> c(__private_tag{}, &make<__imp>(1u));
559-
return c.get();
552+
static const __no_destroy<locale> classic_locale(__private_tag{}, [] {
553+
// executed exactly once on first initialization of `classic_locale`
554+
locale::__imp::classic_locale_imp_.__emplace(1u);
555+
return &locale::__imp::classic_locale_imp_.__get();
556+
}());
557+
return classic_locale.__get();
560558
}
561559

562560
locale& locale::__global() {
563-
static __no_destroy<locale> g(locale::classic());
564-
return g.get();
561+
static __no_destroy<locale> g(locale::classic());
562+
return g.__get();
565563
}
566564

567-
locale::locale() noexcept
568-
: __locale_(__global().__locale_)
569-
{
570-
__locale_->__add_shared();
565+
void locale::__imp::acquire() {
566+
if (this != &locale::__imp::classic_locale_imp_.__get())
567+
__add_shared();
571568
}
572569

573-
locale::locale(const locale& l) noexcept
574-
: __locale_(l.__locale_)
575-
{
576-
__locale_->__add_shared();
570+
void locale::__imp::release() {
571+
if (this != &locale::__imp::classic_locale_imp_.__get())
572+
__release_shared();
577573
}
578574

579-
locale::~locale()
580-
{
581-
__locale_->__release_shared();
582-
}
575+
locale::locale() noexcept : __locale_(__global().__locale_) { __locale_->acquire(); }
576+
577+
locale::locale(const locale& l) noexcept : __locale_(l.__locale_) { __locale_->acquire(); }
578+
579+
locale::~locale() { __locale_->release(); }
583580

584581
const locale&
585582
locale::operator=(const locale& other) noexcept
586583
{
587-
other.__locale_->__add_shared();
588-
__locale_->__release_shared();
589-
__locale_ = other.__locale_;
590-
return *this;
584+
other.__locale_->acquire();
585+
__locale_->release();
586+
__locale_ = other.__locale_;
587+
return *this;
591588
}
592589

593590
locale::locale(const char* name)
594591
: __locale_(name ? new __imp(name)
595592
: (__throw_runtime_error("locale constructed with null"), nullptr))
596593
{
597-
__locale_->__add_shared();
594+
__locale_->acquire();
598595
}
599596

600-
locale::locale(const string& name)
601-
: __locale_(new __imp(name))
602-
{
603-
__locale_->__add_shared();
604-
}
597+
locale::locale(const string& name) : __locale_(new __imp(name)) { __locale_->acquire(); }
605598

606599
locale::locale(const locale& other, const char* name, category c)
607600
: __locale_(name ? new __imp(*other.__locale_, name, c)
608601
: (__throw_runtime_error("locale constructed with null"), nullptr))
609602
{
610-
__locale_->__add_shared();
603+
__locale_->acquire();
611604
}
612605

613606
locale::locale(const locale& other, const string& name, category c)
614607
: __locale_(new __imp(*other.__locale_, name, c))
615608
{
616-
__locale_->__add_shared();
609+
__locale_->acquire();
617610
}
618611

619612
locale::locale(const locale& other, const locale& one, category c)
620613
: __locale_(new __imp(*other.__locale_, *one.__locale_, c))
621614
{
622-
__locale_->__add_shared();
615+
__locale_->acquire();
623616
}
624617

625618
string
@@ -635,7 +628,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
635628
__locale_ = new __imp(*other.__locale_, f, id);
636629
else
637630
__locale_ = other.__locale_;
638-
__locale_->__add_shared();
631+
__locale_->acquire();
639632
}
640633

641634
locale

0 commit comments

Comments
 (0)