Skip to content

[libcxx] Provide locale conversions to tests through lit substitution #105651

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

// REQUIRES: locale.fr_FR.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP}

// <locale>

// class money_get<charT, InputIterator>
Expand Down Expand Up @@ -59,7 +61,8 @@ class my_facetw
};

// Test-local wrapper: hands `in` to LocaleHelpers::convert_thousands_sep together
// with the fr_FR monetary thousands separator taken from the FR_MON_THOU_SEP
// compile-time define.
// NOTE(review): as rendered here, the first `return` (the old
// convert_thousands_sep_fr_FR call) precedes the two newer statements and would
// make them unreachable; presumably that first line is the one being removed by
// this change -- confirm against the merged file.
static std::wstring convert_thousands_sep(std::wstring const& in) {
return LocaleHelpers::convert_thousands_sep_fr_FR(in);
const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP);
return LocaleHelpers::convert_thousands_sep(in, fr_sep);
}
#endif // TEST_HAS_NO_WIDE_CHARACTERS

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

// REQUIRES: locale.ru_RU.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP}

// XFAIL: glibc-old-ru_RU-decimal-point

// <locale>
Expand Down Expand Up @@ -52,7 +54,8 @@ class my_facetw
};

// Test-local wrapper: hands `in` to LocaleHelpers::convert_thousands_sep together
// with the ru_RU monetary thousands separator taken from the RU_MON_THOU_SEP
// compile-time define.
// NOTE(review): as rendered here, the first `return` (the old
// convert_thousands_sep_ru_RU call) precedes the two newer statements and would
// make them unreachable; presumably that first line is the one being removed by
// this change -- confirm against the merged file.
static std::wstring convert_thousands_sep(std::wstring const& in) {
return LocaleHelpers::convert_thousands_sep_ru_RU(in);
const wchar_t ru_sep = LocaleHelpers::mon_thousands_sep_or_default(RU_MON_THOU_SEP);
return LocaleHelpers::convert_thousands_sep(in, ru_sep);
}
#endif // TEST_HAS_NO_WIDE_CHARACTERS

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

// REQUIRES: locale.fr_FR.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP}

// <locale>

// class money_put<charT, OutputIterator>
Expand Down Expand Up @@ -59,7 +61,8 @@ class my_facetw
};

// Test-local wrapper: hands `in` to LocaleHelpers::convert_thousands_sep together
// with the fr_FR monetary thousands separator taken from the FR_MON_THOU_SEP
// compile-time define.
// NOTE(review): as rendered here, the first `return` (the old
// convert_thousands_sep_fr_FR call) precedes the two newer statements and would
// make them unreachable; presumably that first line is the one being removed by
// this change -- confirm against the merged file.
static std::wstring convert_thousands_sep(std::wstring const& in) {
return LocaleHelpers::convert_thousands_sep_fr_FR(in);
const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP);
return LocaleHelpers::convert_thousands_sep(in, fr_sep);
}
#endif // TEST_HAS_NO_WIDE_CHARACTERS

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

// REQUIRES: locale.ru_RU.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP}

// XFAIL: glibc-old-ru_RU-decimal-point

// <locale>
Expand Down Expand Up @@ -52,7 +54,8 @@ class my_facetw
};

// Test-local wrapper: hands `in` to LocaleHelpers::convert_thousands_sep together
// with the ru_RU monetary thousands separator taken from the RU_MON_THOU_SEP
// compile-time define.
// NOTE(review): as rendered here, the first `return` (the old
// convert_thousands_sep_ru_RU call) precedes the two newer statements and would
// make them unreachable; presumably that first line is the one being removed by
// this change -- confirm against the merged file.
static std::wstring convert_thousands_sep(std::wstring const& in) {
return LocaleHelpers::convert_thousands_sep_ru_RU(in);
const wchar_t ru_sep = LocaleHelpers::mon_thousands_sep_or_default(RU_MON_THOU_SEP);
return LocaleHelpers::convert_thousands_sep(in, ru_sep);
}
#endif // TEST_HAS_NO_WIDE_CHARACTERS

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
// NetBSD does not support LC_MONETARY at the moment
// XFAIL: netbsd

// XFAIL: LIBCXX-FREEBSD-FIXME

// REQUIRES: locale.en_US.UTF-8
// REQUIRES: locale.fr_FR.UTF-8
// REQUIRES: locale.ru_RU.UTF-8
// REQUIRES: locale.zh_CN.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DFR_MON_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_MON_THOUSANDS_SEP}
// ADDITIONAL_COMPILE_FLAGS: -DRU_MON_THOU_SEP=%{LOCALE_CONV_RU_RU_UTF_8_MON_THOUSANDS_SEP}

// <locale>

// class moneypunct_byname<charT, International>
Expand All @@ -27,6 +28,7 @@
#include <cassert>

#include "test_macros.h"
#include "locale_helpers.h"
#include "platform_support.h" // locale name macros

class Fnf
Expand Down Expand Up @@ -110,17 +112,10 @@ int main(int, char**)
Fnt f(LOCALE_fr_FR_UTF_8, 1);
assert(f.thousands_sep() == ' ');
}
// The below tests work around GLIBC's use of U202F as mon_thousands_sep.

#ifndef TEST_HAS_NO_WIDE_CHARACTERS
#if defined(_CS_GNU_LIBC_VERSION)
const wchar_t fr_sep = glibc_version_less_than("2.27") ? L' ' : L'\u202F';
#elif defined(_WIN32)
const wchar_t fr_sep = L'\u00A0';
#elif defined(_AIX)
const wchar_t fr_sep = L'\u202F';
#else
const wchar_t fr_sep = L' ';
#endif
const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP);

{
Fwf f(LOCALE_fr_FR_UTF_8, 1);
assert(f.thousands_sep() == fr_sep);
Expand All @@ -140,19 +135,8 @@ int main(int, char**)
assert(f.thousands_sep() == sep);
}
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
// The below tests work around GLIBC's use of U00A0 as mon_thousands_sep
// and U002E as mon_decimal_point.
// TODO: Fix thousands_sep for 'char'.
// related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=16006
# if defined(_CS_GNU_LIBC_VERSION)
// FIXME libc++ specifically works around \u00A0 by translating it into
// a regular space.
const wchar_t wsep = glibc_version_less_than("2.27") ? L'\u00A0' : L'\u202F';
# elif defined(_WIN32) || defined(_AIX)
const wchar_t wsep = L'\u00A0';
# else
const wchar_t wsep = L' ';
# endif
const wchar_t wsep = LocaleHelpers::mon_thousands_sep_or_default(RU_MON_THOU_SEP);

{
Fwf f(LOCALE_ru_RU_UTF_8, 1);
assert(f.thousands_sep() == wsep);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
// REQUIRES: locale.en_US.UTF-8
// REQUIRES: locale.fr_FR.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DFR_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP}

// <locale>

// template <class charT> class numpunct_byname;
Expand All @@ -25,6 +27,7 @@
#include <cassert>

#include "test_macros.h"
#include "locale_helpers.h"
#include "platform_support.h" // locale name macros

int main(int, char**)
Expand Down Expand Up @@ -74,18 +77,11 @@ int main(int, char**)
}
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
{
#if defined(_CS_GNU_LIBC_VERSION)
const wchar_t wsep = glibc_version_less_than("2.27") ? L' ' : L'\u202f';
# elif defined(_AIX)
const wchar_t wsep = L'\u202F';
# elif defined(_WIN32)
const wchar_t wsep = L'\u00A0';
# else
const wchar_t wsep = L',';
# endif
typedef wchar_t C;
const std::numpunct<C>& np = std::use_facet<std::numpunct<C> >(l);
assert(np.thousands_sep() == wsep);
const wchar_t wsep = LocaleHelpers::thousands_sep_or_default(FR_THOU_SEP);

typedef wchar_t C;
const std::numpunct<C>& np = std::use_facet<std::numpunct<C> >(l);
assert(np.thousands_sep() == wsep);
}
#endif // TEST_HAS_NO_WIDE_CHARACTERS
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
// REQUIRES: locale.fr_FR.UTF-8
// REQUIRES: locale.ja_JP.UTF-8

// ADDITIONAL_COMPILE_FLAGS: -DFR_THOU_SEP=%{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP}
// ADDITIONAL_COMPILE_FLAGS: -DFR_DEC_POINT=%{LOCALE_CONV_FR_FR_UTF_8_DECIMAL_POINT}

// <chrono>

// template<class Rep, class Period = ratio<1>> class duration;
Expand All @@ -33,6 +36,7 @@
#include <sstream>

#include "make_string.h"
#include "locale_helpers.h"
#include "platform_support.h" // locale name macros
#include "test_macros.h"

Expand Down Expand Up @@ -88,21 +92,11 @@ static void test_values() {
assert(stream_fr_FR_locale<CharT>(1'000.123456s) == SV("1 000,1235s"));
#endif
} else {
#ifdef _WIN32
assert(stream_fr_FR_locale<CharT>(-1'000'000s) == SV("-1\u00A0000\u00A0000s"));
assert(stream_fr_FR_locale<CharT>(1'000'000s) == SV("1\u00A0000\u00A0000s"));
assert(stream_fr_FR_locale<CharT>(-1'000.123456s) == SV("-1\u00A0000,1235s"));
assert(stream_fr_FR_locale<CharT>(1'000.123456s) == SV("1\u00A0000,1235s"));
#elif defined(__APPLE__)
assert(stream_fr_FR_locale<CharT>(-1'000'000s) == SV("-1000000s"));
assert(stream_fr_FR_locale<CharT>(1'000'000s) == SV("1000000s"));
assert(stream_fr_FR_locale<CharT>(-1'000.123456s) == SV("-1000,1235s"));
assert(stream_fr_FR_locale<CharT>(1'000.123456s) == SV("1000,1235s"));
#else
assert(stream_fr_FR_locale<CharT>(-1'000'000s) == SV("-1\u202f000\u202f000s"));
assert(stream_fr_FR_locale<CharT>(1'000'000s) == SV("1\u202f000\u202f000s"));
assert(stream_fr_FR_locale<CharT>(-1'000.123456s) == SV("-1\u202f000,1235s"));
assert(stream_fr_FR_locale<CharT>(1'000.123456s) == SV("1\u202f000,1235s"));
#ifndef TEST_HAS_NO_WIDE_CHARACTERS
assert(stream_fr_FR_locale<CharT>(-1'000'000s) == L"-1" FR_THOU_SEP "000" FR_THOU_SEP "000s");
assert(stream_fr_FR_locale<CharT>(1'000'000s) == L"1" FR_THOU_SEP "000" FR_THOU_SEP "000s");
assert(stream_fr_FR_locale<CharT>(-1'000.123456s) == L"-1" FR_THOU_SEP "000" FR_DEC_POINT "1235s");
assert(stream_fr_FR_locale<CharT>(1'000.123456s) == L"1" FR_THOU_SEP "000" FR_DEC_POINT "1235s");
#endif
}

Expand Down
37 changes: 6 additions & 31 deletions libcxx/test/support/locale_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,37 +41,6 @@ std::wstring convert_thousands_sep(std::wstring const& in, wchar_t sep) {
return out;
}

// GLIBC 2.27 and newer use U+202F NARROW NO-BREAK SPACE as a thousands separator.
// This function converts the spaces in string inputs to U+202F if need
// be. FreeBSD's locale data also uses U+202F, since 2018.
// Windows uses U+00A0 NO-BREAK SPACE.
std::wstring convert_thousands_sep_fr_FR(std::wstring const& in) {
#if defined(_CS_GNU_LIBC_VERSION)
if (glibc_version_less_than("2.27"))
return in;
else
return convert_thousands_sep(in, L'\u202F');
#elif defined(__FreeBSD__)
return convert_thousands_sep(in, L'\u202F');
#elif defined(_WIN32)
return convert_thousands_sep(in, L'\u00A0');
#else
return in;
#endif
}

// GLIBC 2.27 uses U+202F NARROW NO-BREAK SPACE as a thousands separator.
// FreeBSD, AIX and Windows use U+00A0 NO-BREAK SPACE.
std::wstring convert_thousands_sep_ru_RU(std::wstring const& in) {
#if defined(TEST_HAS_GLIBC)
return convert_thousands_sep(in, L'\u202F');
# elif defined(__FreeBSD__) || defined(_WIN32) || defined(_AIX)
return convert_thousands_sep(in, L'\u00A0');
# else
return in;
# endif
}

std::wstring negate_en_US(std::wstring s) {
#if defined(_WIN32)
return L"(" + s + L")";
Expand All @@ -80,6 +49,12 @@ std::wstring negate_en_US(std::wstring s) {
#endif
}

// Reduces a thousands-separator string (e.g. the wide string literal supplied
// through a define such as FR_THOU_SEP) to a single wchar_t.
// Returns the first character of `s`; characters after the first are ignored.
// Returns L',' when `s` is empty.
// `s` is taken by const reference so that no copy of the string is made;
// string-literal arguments still bind through a temporary std::wstring.
wchar_t thousands_sep_or_default(const std::wstring& s) { return s.empty() ? L',' : s[0]; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why the test suite doesn't always provide the thousands separator? If we're going through the trouble of figuring those out, wouldn't it make sense to always provide the define so the code doesn't have to use thousands_sep_or_default?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean, that rather than the individual tests doing const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP);, we'd just go const wchar_t fr_sep = FR_MON_THOU_SEP;?

I'm not entirely sure of the original author's intentions here, but I would guess that this felt like a smaller step - in case we didn't manage to dig up the right separators in the test framework. But as long as that setup does work (and afaik it does), I guess we could get rid of the defaults in locale_helpers.h entirely.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean, that rather than the individual tests doing const wchar_t fr_sep = LocaleHelpers::mon_thousands_sep_or_default(FR_MON_THOU_SEP);, we'd just go const wchar_t fr_sep = FR_MON_THOU_SEP;?

Yes, exactly!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That doesn't quite work right off the bat, at least.

The problem is that FR_MON_THOU_SEP is a wchar_t string like L"\u00a0", while we want a single wchar_t. The mon_thousands_sep_or_default helper took a std::wstring and returns s[0] if s is nonempty in the current version of the patch.

This is because what localeconv() returns is a struct with strings, where each separator string usually is one single char, but they could in theory be multiple. In the current patch we treat this as strings all the way up to the foo_sep_or_default() helpers which convert from a string to a single wchar.

If we wanted to go that way, I guess we could make the localeconv() helper executable (the one invoked from Python) return only the first character, and treat it as a single numeric wchar_t throughout instead. I guess that'd work too. It would be a bit tricky for one of the tests, though, where we currently try to do this:

    assert(stream_fr_FR_locale<CharT>(-1'000'000s) == L"-1" FR_THOU_SEP "000" FR_THOU_SEP "000s");

I'm not sure how we'd easily synthesize a wchar string literal out of that, when we'd have e.g. -DFR_THOU_SEP=0x00a0.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @mstorsjo for taking over the PR and making progress on it!

Right, localeconv's struct has currency_symbol, which could in practice be more than one character, so to keep the provide_locale_conversions substitution code in features.py flexible enough, it just sets the substitutions as strings.

The defaults ("," and ".") in the thousands_sep_or_default helper function won't actually be used as we are only calling these from tests where the locale is set to fr_FR or ru_RU and we've set the defines.

I think it's reasonable for these helpers to assert that the input is not empty and always take the first char. The functions should probably be renamed to something more appropriate though if they're essentially just converting to a wchar_t.


// Monetary counterpart of thousands_sep_or_default: forwards `s` unchanged to
// thousands_sep_or_default and returns its result.
// `s` is taken by const reference so the string is not copied just to be
// forwarded.
wchar_t mon_thousands_sep_or_default(const std::wstring& s) { return thousands_sep_or_default(s); }

// Reduces a decimal-point string (e.g. the wide string literal supplied through
// a define such as FR_DEC_POINT) to a single wchar_t.
// Returns the first character of `s`; characters after the first are ignored.
// Returns L'.' when `s` is empty.
// `s` is taken by const reference so that no copy of the string is made;
// string-literal arguments still bind through a temporary std::wstring.
wchar_t decimal_point_or_default(const std::wstring& s) { return s.empty() ? L'.' : s[0]; }

#endif // TEST_HAS_NO_WIDE_CHARACTERS

std::string negate_en_US(std::string s) {
Expand Down
91 changes: 90 additions & 1 deletion libcxx/utils/libcxx/test/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,17 +425,106 @@ def _mingwSupportsModules(cfg):
"fr_CA.ISO8859-1": ["fr_CA.ISO8859-1", "French_Canada.1252"],
"cs_CZ.ISO8859-2": ["cs_CZ.ISO8859-2", "Czech_Czech Republic.1250"],
}
# Maps a locale name to the localeconv() members whose values are exposed to
# tests as %{LOCALE_CONV_<LOCALE>_<MEMBER>} substitutions.
provide_locale_conversions = {
"fr_FR.UTF-8": ["decimal_point", "mon_thousands_sep", "thousands_sep"],
"ru_RU.UTF-8": ["mon_thousands_sep"],
}
for locale, alts in locales.items():
# Note: Using alts directly in the lambda body here will bind it to the value at the
# end of the loop. Assigning it to a default argument works around this issue.
DEFAULT_FEATURES.append(
Feature(
name="locale.{}".format(locale),
when=lambda cfg, alts=alts: hasAnyLocale(cfg, alts),
)
# Substitutions are only generated for locales listed in
# provide_locale_conversions, and only when the compiler does not define
# _LIBCPP_HAS_NO_WIDE_CHARACTERS; otherwise the feature carries no actions.
actions=lambda cfg, locale=locale, alts=alts: _getLocaleFlagsAction(
cfg, locale, alts, provide_locale_conversions[locale]
)
if locale in provide_locale_conversions
and "_LIBCPP_HAS_NO_WIDE_CHARACTERS" not in compilerMacros(cfg)
else [],
),
)


# Provide environment locale conversions through substitutions to avoid platform specific
# maintenance.
#
# Runs (via programOutput) a small program that selects the first name in `alts`
# accepted by setlocale(), reads the lconv fields named in `members`, and prints one
# line per field: characters above 0x7F as \uXXXX escapes, everything else verbatim.
# Returns one AddSubstitution per member, named %{LOCALE_CONV_<LOCALE>_<MEMBER>}, whose
# value is the printed text wrapped as a wide string literal (L"...") inside single
# quotes (presumably so the double quotes survive the shell -- TODO confirm).
def _getLocaleFlagsAction(cfg, locale, alts, members):
# C initializer lists spliced into the program below: the candidate locale names
# as quoted strings, and the requested fields as `lc-><member>` expressions.
alts_list = ",".join([f'"{l}"' for l in alts])
get_member_list = ",".join([f"lc->{m}" for m in members])

# The program text is %-formatted, so literal percent signs in it are doubled.
localeconv_info = programOutput(
cfg,
r"""
#if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
#include <wchar.h>
// Print each requested locale conversion member on separate lines.
int main() {
const char* locales[] = { %s };
for (int loc_i = 0; loc_i < %d; ++loc_i) {
if (!setlocale(LC_ALL, locales[loc_i])) {
continue; // Choose first locale name that is recognized.
}
lconv* lc = localeconv();
const char* members[] = { %s };
for (size_t m_i = 0; m_i < %d; ++m_i) {
if (!members[m_i]) {
printf("\n"); // member value is an empty string
continue;
}
size_t len = mbstowcs(nullptr, members[m_i], 0);
if (len == static_cast<size_t>(-1)) {
fprintf(stderr, "mbstowcs failed unexpectedly\n");
return 1;
}
// Include room for null terminator. Use malloc as these features
// are also used by lit configs that don't use -lc++ (libunwind tests).
wchar_t* dst = (wchar_t*)malloc((len + 1) * sizeof(wchar_t));
size_t ret = mbstowcs(dst, members[m_i], len + 1);
if (ret == static_cast<size_t>(-1)) {
fprintf(stderr, "mbstowcs failed unexpectedly\n");
free(dst);
return 1;
}
for (size_t i = 0; i < len; ++i) {
if (dst[i] > 0x7F) {
printf("\\u%%04x", dst[i]);
} else {
// c++03 does not allow basic ascii-range characters in UCNs
printf("%%c", (char)dst[i]);
}
}
printf("\n");
free(dst);
}
return 0;
}
return 1;
}
"""
% (alts_list, len(alts), get_member_list, len(members)),
)
# "." and "-" in the locale name are replaced with "_" and the result is
# upper-cased, e.g. fr_FR.UTF-8 -> FR_FR_UTF_8.
valid_define_name = re.sub(r"[.-]", "_", locale).upper()
return [
# Provide locale conversion through a substitution.
# Example: %{LOCALE_CONV_FR_FR_UTF_8_THOUSANDS_SEP} = L"\u202f"
AddSubstitution(
f"%{{LOCALE_CONV_{valid_define_name}_{member.upper()}}}",
lambda cfg, value=value: f"'L\"{value}\"'",
)
# Output lines are paired with the requested members by position.
for member, value in zip(members, localeconv_info.split("\n"))
]


# Add features representing the target platform name: darwin, linux, windows, etc...
DEFAULT_FEATURES += [
Feature(name="darwin", when=lambda cfg: "__APPLE__" in compilerMacros(cfg)),
Expand Down