Skip to content

Commit 18800a8

Browse files
tmatheson-armldionne
authored andcommitted
[locale][num_get] Improve Stage 2 of string to float conversion
https://timsong-cpp.github.io/cppwp/n4140/facet.num.get.virtuals#3.2 "Stage 2" of num_get::do_get() depends on "a check ... to determine if c is allowed as the next character of an input field of the conversion specifier returned by Stage 1". Previously this was a very simple check whether the next character was in a set of allowed characters. This could lead to Stage 2 accumulating character sequences such as "1.2f" and passing them to strtold (Stage 3). https://timsong-cpp.github.io/cppwp/n4140/facet.num.get.virtuals#3.3.3 Stage 3 can fail, however, if the entire character sequence from Stage 2 is not used in the conversion. For example, the "f" in "1.2f" is not used. https://timsong-cpp.github.io/cppwp/n4140/facet.num.get.virtuals#3.3.4 As a result, parsing a sequence like "1.2f" would return value 0.0 with failbit set. This change improves the checks made in Stage 2, determining what is passed to Stage 3. * Hex digits are only considered valid if "0x" has been seen * INFINITY value is recognised * Characters in INFINITY and NAN are only valid in sequence. This is done by checking one character backwards, which has obvious limitations. * New tests are added. The old ones are preserved but refactored. Differential Revision: https://reviews.llvm.org/D99091
1 parent 698b45a commit 18800a8

File tree

6 files changed

+286
-0
lines changed

6 files changed

+286
-0
lines changed

libcxx/include/__config

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@
146146
// The incorrect implementation of CityHash has the problem that it drops some
147147
// bits on the floor.
148148
# define _LIBCPP_ABI_FIX_CITYHASH_IMPLEMENTATION
149+
// Enable breaking changes to stage 2 of string to float conversion in locale
150+
# define _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
149151
// Remove the base 10 implementation of std::to_chars from the dylib.
150152
// The implementation moved to the header, but we still export the symbols from
151153
// the dylib for backwards compatibility.

libcxx/include/locale

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,11 +396,21 @@ struct __num_get
396396
static string __stage2_float_prep(ios_base& __iob, _CharT* __atoms, _CharT& __decimal_point,
397397
_CharT& __thousands_sep);
398398

399+
#ifndef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
399400
static int __stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp,
400401
char* __a, char*& __a_end,
401402
_CharT __decimal_point, _CharT __thousands_sep,
402403
const string& __grouping, unsigned* __g,
403404
unsigned*& __g_end, unsigned& __dc, _CharT* __atoms);
405+
#else // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
406+
static int __stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp,
407+
char* __a, char*& __a_end,
408+
_CharT __decimal_point, _CharT __thousands_sep,
409+
const string& __grouping, unsigned* __g,
410+
unsigned*& __g_end, unsigned& __dc, _CharT* __atoms,
411+
bool& __hex);
412+
#endif // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
413+
404414
#ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
405415
static string __stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep);
406416
static int __stage2_int_loop(_CharT __ct, int __base, char* __a, char*& __a_end,
@@ -523,6 +533,7 @@ __num_get<_CharT>::__stage2_int_loop(_CharT __ct, int __base, char* __a, char*&
523533
return 0;
524534
}
525535

536+
#ifndef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
526537
template <class _CharT>
527538
int
528539
__num_get<_CharT>::__stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp, char* __a, char*& __a_end,
@@ -579,8 +590,141 @@ __num_get<_CharT>::__stage2_float_loop(_CharT __ct, bool& __in_units, char& __ex
579590
if (__f >= 22)
580591
return 0;
581592
++__dc;
593+
594+
return 0;
595+
}
596+
#else // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
597+
template <class _CharT>
598+
_LIBCPP_HIDDEN
599+
int
600+
__num_get<_CharT>::__stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp, char* __a, char*& __a_end,
601+
_CharT __decimal_point, _CharT __thousands_sep, const string& __grouping,
602+
unsigned* __g, unsigned*& __g_end, unsigned& __dc, _CharT* __atoms, bool& __hex)
603+
{
604+
#define __UPPERCASE(__x) ((__x)&0x5F)
605+
606+
if (__ct == __decimal_point)
607+
{
608+
if (!__in_units)
609+
return -1;
610+
__in_units = false;
611+
*__a_end++ = '.';
612+
if (__grouping.size() != 0 && __g_end-__g < __num_get_buf_sz)
613+
*__g_end++ = __dc;
614+
return 0;
615+
}
616+
if (__ct == __thousands_sep && __grouping.size() != 0)
617+
{
618+
if (!__in_units)
619+
return -1;
620+
if (__g_end-__g < __num_get_buf_sz)
621+
{
622+
*__g_end++ = __dc;
623+
__dc = 0;
624+
}
625+
return 0;
626+
}
627+
ptrdiff_t __f = find(__atoms, __atoms + 32, __ct) - __atoms;
628+
const bool __is_digit = __hex ? __f < 22 : __f < 10;
629+
const bool __first = __a_end == __a;
630+
if (__f >= 32)
631+
return -1;
632+
char __x = __src[__f];
633+
char __X = __UPPERCASE(__x);
634+
635+
// Return early -1 for any character that is not valid at this point
636+
if (__x == '-' || __x == '+')
637+
{
638+
// Previous character must be __exp, which was marked as seen setting bit 0x80
639+
if (!__first && __UPPERCASE(__a_end[-1]) != (__exp & 0x7F))
640+
return -1;
641+
}
642+
else if (__x == 'x' || __x == 'X')
643+
{
644+
// Can't have 'x' or 'X' as the first character
645+
if (__first)
646+
return -1;
647+
// Must be preceeded by a '0'
648+
if (__a_end[-1] != __atoms[0])
649+
return -1;
650+
// Can't have multiple occurrences of 'x'
651+
if (__hex)
652+
return -1;
653+
__hex = true;
654+
__exp = 'P';
655+
}
656+
else if (__X == __exp)
657+
{
658+
// Can't have e/E/p/P as first character
659+
if (__first)
660+
return -1;
661+
// Mark exponent as seen
662+
__exp |= (char) 0x80;
663+
if (__in_units)
664+
{
665+
__in_units = false;
666+
if (__grouping.size() != 0 && __g_end-__g < __num_get_buf_sz)
667+
*__g_end++ = __dc;
668+
}
669+
}
670+
else if (!__is_digit)
671+
{
672+
// Not '.' or __thousands_sep or '+' or '-' or 'x' or __exp or digit.
673+
// Special handling for the characters in INF/NAN.
674+
// These must appear at the start of the sequence, possibly preceeded by + or -.
675+
// Look back one character to check that these are part of a valid sequence.
676+
677+
if (__first)
678+
{
679+
// + and - as first character are handled in a separate branch.
680+
if (__X != 'I' && __X != 'N')
681+
return -1;
682+
}
683+
else
684+
{
685+
char __prev =
686+
__src[find(__atoms, __atoms + 32, __a_end[-1]) - __atoms];
687+
char __PREV = __UPPERCASE(__prev);
688+
689+
// Rule out special characters out of sequence INF or NAN.
690+
if (__X == 'I')
691+
{
692+
if (__prev != '+' && __prev != '-' && __PREV != 'N')
693+
return -1;
694+
}
695+
else if (__X == 'N')
696+
{
697+
if (__prev != '+' && __prev != '-' && __PREV != 'I' &&
698+
__PREV != 'A')
699+
return -1;
700+
}
701+
else if (__X == 'F')
702+
{
703+
if (__PREV != 'N')
704+
return -1;
705+
}
706+
else if (__X == 'A')
707+
{
708+
if (__PREV != 'N')
709+
return -1;
710+
}
711+
else if (!__is_digit)
712+
{
713+
return -1;
714+
}
715+
}
716+
}
717+
718+
// "...c is allowed as the next character of an input field of the conversion specifier returned by Stage 1."
719+
*__a_end++ = __x;
720+
721+
if (__is_digit)
722+
++__dc;
723+
582724
return 0;
725+
#undef __UPPERCASE
583726
}
727+
#endif // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
584728

585729
extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_get<char>;
586730
#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
@@ -1046,6 +1190,10 @@ num_get<_CharT, _InputIterator>::__do_get_floating_point(iter_type __b, iter_typ
10461190
unsigned __dc = 0;
10471191
bool __in_units = true;
10481192
char __exp = 'E';
1193+
#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
1194+
bool __hex = false; //< set to true when we see 0x
1195+
#endif
1196+
10491197
for (; __b != __e; ++__b)
10501198
{
10511199
if (__a_end == __a + __buf.size())
@@ -1056,10 +1204,17 @@ num_get<_CharT, _InputIterator>::__do_get_floating_point(iter_type __b, iter_typ
10561204
__a = &__buf[0];
10571205
__a_end = __a + __tmp;
10581206
}
1207+
#ifndef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
10591208
if (this->__stage2_float_loop(*__b, __in_units, __exp, __a, __a_end,
10601209
__decimal_point, __thousands_sep,
10611210
__grouping, __g, __g_end,
10621211
__dc, __atoms))
1212+
#else // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
1213+
if (this->__stage2_float_loop(*__b, __in_units, __exp, __a, __a_end,
1214+
__decimal_point, __thousands_sep,
1215+
__grouping, __g, __g_end,
1216+
__dc, __atoms, __hex))
1217+
#endif // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
10631218
break;
10641219
}
10651220
if (__grouping.size() != 0 && __in_units && __g_end-__g < __num_get_base::__num_get_buf_sz)

libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "test_macros.h"
2222
#include "test_iterators.h"
2323
#include "hexfloat.h"
24+
#include "get_float_common.h"
2425

2526
typedef std::num_get<char, cpp17_input_iterator<const char*> > F;
2627

@@ -49,6 +50,41 @@ int main(int, char**)
4950
const my_facet f(1);
5051
std::ios ios(0);
5152
double v = -1;
53+
54+
#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
55+
// Valid floating point formats where whole string is consumed
56+
TEST("0x123.4f", 8, (double)hexfloat<double>(0x123, 0x4f, 0), ios.eofbit);
57+
TEST("inf", 3, INFINITY, ios.goodbit | ios.eofbit);
58+
TEST("Inf", 3, INFINITY, ios.goodbit | ios.eofbit);
59+
TEST("+iNf", 4, INFINITY, ios.goodbit | ios.eofbit);
60+
TEST("-inF", 4, -INFINITY, ios.goodbit | ios.eofbit);
61+
TEST("INFxyz", 3, INFINITY, ios.goodbit);
62+
63+
// Valid floating point formats with unparsed trailing characters
64+
TEST("123.4f", 5, 123.4, ios.goodbit);
65+
TEST("123xyz", 3, 123.0, ios.goodbit);
66+
TEST("0x123.4+", 7, (double)hexfloat<double>(0x123, 0x4, 0), ios.goodbit);
67+
68+
// Shouldn't recognise e, p or x more than once
69+
TEST("123.4e-5e-4", 8, 123.4e-5, ios.goodbit);
70+
TEST("0x123.4p-5p-4", 10, (double)hexfloat<double>(0x123, 0x4, -5), ios.goodbit);
71+
TEST("0x123x5", 5, (double)hexfloat<double>(0x123, 0x0, 0), ios.goodbit);
72+
73+
// Invalid (non-float) inputs
74+
TEST("a", 0, 0.0, ios.failbit);
75+
TEST("e", 0, 0.0, ios.failbit);
76+
TEST("f", 0, 0.0, ios.failbit);
77+
TEST("p", 0, 0.0, ios.failbit);
78+
TEST("M", 0, 0.0, ios.failbit);
79+
TEST("{}", 0, 0.0, ios.failbit);
80+
TEST("x123", 0, 0.0, ios.failbit);
81+
82+
// Incomplete inputs, i.e. eof before finished parsing
83+
TEST("-", 1, 0.0, ios.eofbit | ios.failbit);
84+
TEST("+", 1, 0.0, ios.eofbit | ios.failbit);
85+
TEST("0x123.4p", 8, 0.0, ios.eofbit | ios.failbit);
86+
#endif
87+
5288
{
5389
const char str[] = "123";
5490
assert((ios.flags() & ios.basefield) == ios.dec);

libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "test_macros.h"
2222
#include "test_iterators.h"
2323
#include "hexfloat.h"
24+
#include "get_float_common.h"
2425

2526
typedef std::num_get<char, cpp17_input_iterator<const char*> > F;
2627

@@ -38,6 +39,41 @@ int main(int, char**)
3839
const my_facet f(1);
3940
std::ios ios(0);
4041
float v = -1;
42+
43+
#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
44+
// Valid floating point formats where whole string is consumed
45+
TEST("0x123.4f", 8, (float)hexfloat<float>(0x123, 0x4f, 0), ios.eofbit);
46+
TEST("inf", 3, INFINITY, ios.goodbit | ios.eofbit);
47+
TEST("Inf", 3, INFINITY, ios.goodbit | ios.eofbit);
48+
TEST("+iNf", 4, INFINITY, ios.goodbit | ios.eofbit);
49+
TEST("-inF", 4, -INFINITY, ios.goodbit | ios.eofbit);
50+
TEST("INFxyz", 3, INFINITY, ios.goodbit);
51+
52+
// Valid floating point formats with unparsed trailing characters
53+
TEST("123.4f", 5, 123.4f, ios.goodbit);
54+
TEST("123xyz", 3, 123.0f, ios.goodbit);
55+
TEST("0x123.4+", 7, (float)hexfloat<float>(0x123, 0x4, 0), ios.goodbit);
56+
57+
// Shouldn't recognise e, p or x more than once
58+
TEST("123.4e-5e-4", 8, 123.4e-5f, ios.goodbit);
59+
TEST("0x123.4p-5p-4", 10, (float)hexfloat<float>(0x123, 0x4, -5), ios.goodbit);
60+
TEST("0x123x5", 5, (float)hexfloat<float>(0x123, 0x0, 0), ios.goodbit);
61+
62+
// Invalid (non-float) inputs
63+
TEST("a", 0, 0.0f, ios.failbit);
64+
TEST("e", 0, 0.0f, ios.failbit);
65+
TEST("f", 0, 0.0f, ios.failbit);
66+
TEST("p", 0, 0.0f, ios.failbit);
67+
TEST("M", 0, 0.0f, ios.failbit);
68+
TEST("{}", 0, 0.0f, ios.failbit);
69+
TEST("x123", 0, 0.0f, ios.failbit);
70+
71+
// Incomplete inputs, i.e. eof before finished parsing
72+
TEST("-", 1, 0.0f, ios.eofbit | ios.failbit);
73+
TEST("+", 1, 0.0f, ios.eofbit | ios.failbit);
74+
TEST("0x123.4p", 8, 0.0f, ios.eofbit | ios.failbit);
75+
#endif
76+
4177
{
4278
const char str[] = "123";
4379
assert((ios.flags() & ios.basefield) == ios.dec);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#ifndef GET_FLOAT_COMMON_H
2+
#define GET_FLOAT_COMMON_H
3+
4+
/// Read a double from the input string, check that the expected number of
5+
/// characters are read, the expected value is returned, and the expected
6+
/// error is set.
7+
#define TEST(STR, EXPECTED_LEN, EXPECTED_VAL, EXPECTED_ERR) \
8+
{ \
9+
std::ios_base::iostate err = ios.goodbit; \
10+
cpp17_input_iterator<const char*> iter = \
11+
f.get(cpp17_input_iterator<const char*>((STR)), cpp17_input_iterator<const char*>((STR) + strlen((STR))), ios, \
12+
err, v); \
13+
assert(iter.base() == (STR) + (EXPECTED_LEN) && "read wrong number of characters"); \
14+
assert(err == (EXPECTED_ERR)); \
15+
if (std::isnan(EXPECTED_VAL)) \
16+
assert(std::isnan(v) && "expected NaN value"); \
17+
else \
18+
assert(v == (EXPECTED_VAL) && "wrong value"); \
19+
}
20+
21+
#endif

libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "test_macros.h"
2222
#include "test_iterators.h"
2323
#include "hexfloat.h"
24+
#include "get_float_common.h"
2425

2526
typedef std::num_get<char, cpp17_input_iterator<const char*> > F;
2627

@@ -38,6 +39,41 @@ int main(int, char**)
3839
const my_facet f(1);
3940
std::ios ios(0);
4041
long double v = -1;
42+
43+
#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
44+
// Valid floating point formats where whole string is consumed
45+
TEST("0x123.4f", 8, (long double)hexfloat<long double>(0x123, 0x4f, 0), ios.eofbit);
46+
TEST("inf", 3, INFINITY, ios.goodbit | ios.eofbit);
47+
TEST("Inf", 3, INFINITY, ios.goodbit | ios.eofbit);
48+
TEST("+iNf", 4, INFINITY, ios.goodbit | ios.eofbit);
49+
TEST("-inF", 4, -INFINITY, ios.goodbit | ios.eofbit);
50+
TEST("INFxyz", 3, INFINITY, ios.goodbit);
51+
52+
// Valid floating point formats with unparsed trailing characters
53+
TEST("123.4f", 5, 123.4l, ios.goodbit);
54+
TEST("123xyz", 3, 123.0l, ios.goodbit);
55+
TEST("0x123.4+", 7, (long double)hexfloat<long double>(0x123, 0x4, 0), ios.goodbit);
56+
57+
// Shouldn't recognise e, p or x more than once
58+
TEST("123.4e-5e-4", 8, 123.4e-5l, ios.goodbit);
59+
TEST("0x123.4p-5p-4", 10, (long double)hexfloat<long double>(0x123, 0x4, -5), ios.goodbit);
60+
TEST("0x123x5", 5, (long double)hexfloat<long double>(0x123, 0x0, 0), ios.goodbit);
61+
62+
// Invalid (non-float) inputs
63+
TEST("a", 0, 0.0l, ios.failbit);
64+
TEST("e", 0, 0.0l, ios.failbit);
65+
TEST("f", 0, 0.0l, ios.failbit);
66+
TEST("p", 0, 0.0l, ios.failbit);
67+
TEST("M", 0, 0.0l, ios.failbit);
68+
TEST("{}", 0, 0.0l, ios.failbit);
69+
TEST("x123", 0, 0.0l, ios.failbit);
70+
71+
// Incomplete inputs, i.e. eof before finished parsing
72+
TEST("-", 1, 0.0l, ios.eofbit | ios.failbit);
73+
TEST("+", 1, 0.0l, ios.eofbit | ios.failbit);
74+
TEST("0x123.4p", 8, 0.0l, ios.eofbit | ios.failbit);
75+
#endif
76+
4177
{
4278
const char str[] = "123";
4379
assert((ios.flags() & ios.basefield) == ios.dec);

0 commit comments

Comments
 (0)