[locale][num_get] Improve Stage 2 of string to float conversion

tmatheson-arm · ldionne · commit 18800a8fa159 · 2023-09-01T09:23:53.000-04:00
https://timsong-cpp.github.io/cppwp/n4140/facet.num.get.virtuals#3.2 "Stage 2" of num_get::do_get() depends on "a check ... to determine if c is allowed as the next character of an input field of the conversion specifier returned by Stage 1". Previously this was a very simple check whether the next character was in a set of allowed characters. This could lead to Stage 2 accumulating character sequences such as "1.2f" and passing them to strtold (Stage 3). https://timsong-cpp.github.io/cppwp/n4140/facet.num.get.virtuals#3.3.3 Stage 3 can fail, however, if the entire character sequence from Stage 2 is not used in the conversion. For example, the "f" in "1.2f" is not used. https://timsong-cpp.github.io/cppwp/n4140/facet.num.get.virtuals#3.3.4 As a result, parsing a sequence like "1.2f" would return value 0.0 with failbit set. This change improves the checks made in Stage 2, determining what is passed to Stage 3. * Hex digits are only considered valid if "0x" has been seen * INFINITY value is recognised * Characters in INFINITY and NAN are only valid in sequence. This is done by checking one character backwards, which has obvious limitations. * New tests are added. The old ones are preserved but refactored. Differential Revision: https://reviews.llvm.org/D99091
diff --git a/libcxx/include/__config b/libcxx/include/__config
@@ -146,6 +146,8 @@
 // The incorrect implementation of CityHash has the problem that it drops some
 // bits on the floor.
 #    define _LIBCPP_ABI_FIX_CITYHASH_IMPLEMENTATION
+// Enable breaking changes to stage 2 of string to float conversion in locale
+#    define _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
 // Remove the base 10 implementation of std::to_chars from the dylib.
 // The implementation moved to the header, but we still export the symbols from
 // the dylib for backwards compatibility.
diff --git a/libcxx/include/locale b/libcxx/include/locale
@@ -396,11 +396,21 @@ struct __num_get
     static string __stage2_float_prep(ios_base& __iob, _CharT* __atoms, _CharT& __decimal_point,
                                       _CharT& __thousands_sep);
 
+#ifndef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
     static int __stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp,
                                    char* __a, char*& __a_end,
                                    _CharT __decimal_point, _CharT __thousands_sep,
                                    const string& __grouping, unsigned* __g,
                                    unsigned*& __g_end, unsigned& __dc, _CharT* __atoms);
+#else // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+    static int __stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp,
+                                   char* __a, char*& __a_end,
+                                   _CharT __decimal_point, _CharT __thousands_sep,
+                                   const string& __grouping, unsigned* __g,
+                                   unsigned*& __g_end, unsigned& __dc, _CharT* __atoms,
+                                   bool& __hex);
+#endif // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+
 #ifndef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
     static string __stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep);
     static int __stage2_int_loop(_CharT __ct, int __base, char* __a, char*& __a_end,
@@ -523,6 +533,7 @@ __num_get<_CharT>::__stage2_int_loop(_CharT __ct, int __base, char* __a, char*&
     return 0;
 }
 
+#ifndef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
 template <class _CharT>
 int
 __num_get<_CharT>::__stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp, char* __a, char*& __a_end,
@@ -579,8 +590,141 @@ __num_get<_CharT>::__stage2_float_loop(_CharT __ct, bool& __in_units, char& __ex
     if (__f >= 22)
         return 0;
     ++__dc;
+
+    return 0;
+}
+#else // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+template <class _CharT>
+_LIBCPP_HIDDEN
+int
+__num_get<_CharT>::__stage2_float_loop(_CharT __ct, bool& __in_units, char& __exp, char* __a, char*& __a_end,
+                    _CharT __decimal_point, _CharT __thousands_sep, const string& __grouping,
+                    unsigned* __g, unsigned*& __g_end, unsigned& __dc, _CharT* __atoms, bool& __hex)
+{
+#define __UPPERCASE(__x) ((__x)&0x5F)
+
+    if (__ct == __decimal_point)
+    {
+        if (!__in_units)
+            return -1;
+        __in_units = false;
+        *__a_end++ = '.';
+        if (__grouping.size() != 0 && __g_end-__g < __num_get_buf_sz)
+            *__g_end++ = __dc;
+        return 0;
+    }
+    if (__ct == __thousands_sep && __grouping.size() != 0)
+    {
+        if (!__in_units)
+            return -1;
+        if (__g_end-__g < __num_get_buf_sz)
+        {
+            *__g_end++ = __dc;
+            __dc = 0;
+        }
+        return 0;
+    }
+    ptrdiff_t __f = find(__atoms, __atoms + 32, __ct) - __atoms;
+    const bool __is_digit = __hex ? __f < 22 : __f < 10;
+    const bool __first = __a_end == __a;
+    if (__f >= 32)
+        return -1;
+    char __x = __src[__f];
+    char __X = __UPPERCASE(__x);
+
+    // Return early -1 for any character that is not valid at this point
+    if (__x == '-' || __x == '+')
+    {
+        // Previous character must be __exp, which was marked as seen setting bit 0x80
+        if (!__first && __UPPERCASE(__a_end[-1]) != (__exp & 0x7F))
+            return -1;
+    }
+    else if (__x == 'x' || __x == 'X')
+    {
+        // Can't have 'x' or 'X' as the first character
+        if (__first)
+            return -1;
+        // Must be preceeded by a '0'
+        if (__a_end[-1] != __atoms[0])
+            return -1;
+        // Can't have multiple occurrences of 'x'
+        if (__hex)
+            return -1;
+        __hex = true;
+        __exp = 'P';
+    }
+    else if (__X == __exp)
+    {
+        // Can't have e/E/p/P as first character
+        if (__first)
+            return -1;
+        // Mark exponent as seen
+        __exp |= (char) 0x80;
+        if (__in_units)
+        {
+            __in_units = false;
+            if (__grouping.size() != 0 && __g_end-__g < __num_get_buf_sz)
+                *__g_end++ = __dc;
+        }
+    }
+    else if (!__is_digit)
+    {
+        // Not '.' or __thousands_sep or '+' or '-' or 'x' or __exp or digit.
+        // Special handling for the characters in INF/NAN.
+        // These must appear at the start of the sequence, possibly preceeded by + or -.
+        // Look back one character to check that these are part of a valid sequence.
+
+        if (__first)
+        {
+            // + and - as first character are handled in a separate branch.
+            if (__X != 'I' && __X != 'N')
+                return -1;
+        }
+        else
+        {
+            char __prev =
+                __src[find(__atoms, __atoms + 32, __a_end[-1]) - __atoms];
+            char __PREV = __UPPERCASE(__prev);
+
+            // Rule out special characters out of sequence INF or NAN.
+            if (__X == 'I')
+            {
+                if (__prev != '+' && __prev != '-' && __PREV != 'N')
+                    return -1;
+            }
+            else if (__X == 'N')
+            {
+                if (__prev != '+' && __prev != '-' && __PREV != 'I' &&
+                    __PREV != 'A')
+                    return -1;
+            }
+            else if (__X == 'F')
+            {
+                if (__PREV != 'N')
+                    return -1;
+            }
+            else if (__X == 'A')
+            {
+                if (__PREV != 'N')
+                    return -1;
+            }
+            else if (!__is_digit)
+            {
+                return -1;
+            }
+        }
+    }
+
+    // "...c is allowed as the next character of an input field of the conversion specifier returned by Stage 1."
+    *__a_end++ = __x;
+
+    if (__is_digit)
+        ++__dc;
+
     return 0;
+#undef __UPPERCASE
 }
+#endif // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
 
 extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_get<char>;
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
@@ -1046,6 +1190,10 @@ num_get<_CharT, _InputIterator>::__do_get_floating_point(iter_type __b, iter_typ
     unsigned __dc = 0;
     bool __in_units = true;
     char __exp = 'E';
+#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+    bool __hex = false; //< set to true when we see 0x
+#endif
+
     for (; __b != __e; ++__b)
     {
         if (__a_end == __a + __buf.size())
@@ -1056,10 +1204,17 @@ num_get<_CharT, _InputIterator>::__do_get_floating_point(iter_type __b, iter_typ
             __a = &__buf[0];
             __a_end = __a + __tmp;
         }
+#ifndef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
         if (this->__stage2_float_loop(*__b, __in_units, __exp, __a, __a_end,
                                       __decimal_point, __thousands_sep,
                                       __grouping, __g, __g_end,
                                       __dc, __atoms))
+#else // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+        if (this->__stage2_float_loop(*__b, __in_units, __exp, __a, __a_end,
+                                      __decimal_point, __thousands_sep,
+                                      __grouping, __g, __g_end,
+                                      __dc, __atoms, __hex))
+#endif // _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
             break;
     }
     if (__grouping.size() != 0 && __in_units && __g_end-__g < __num_get_base::__num_get_buf_sz)
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
@@ -21,6 +21,7 @@
 #include "test_macros.h"
 #include "test_iterators.h"
 #include "hexfloat.h"
+#include "get_float_common.h"
 
 typedef std::num_get<char, cpp17_input_iterator<const char*> > F;
 
@@ -49,6 +50,41 @@ int main(int, char**)
     const my_facet f(1);
     std::ios ios(0);
     double v = -1;
+
+#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+    // Valid floating point formats where whole string is consumed
+    TEST("0x123.4f", 8, (double)hexfloat<double>(0x123, 0x4f, 0), ios.eofbit);
+    TEST("inf", 3, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("Inf", 3, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("+iNf", 4, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("-inF", 4, -INFINITY, ios.goodbit | ios.eofbit);
+    TEST("INFxyz", 3, INFINITY, ios.goodbit);
+
+    // Valid floating point formats with unparsed trailing characters
+    TEST("123.4f", 5, 123.4, ios.goodbit);
+    TEST("123xyz", 3, 123.0, ios.goodbit);
+    TEST("0x123.4+", 7, (double)hexfloat<double>(0x123, 0x4, 0), ios.goodbit);
+
+    // Shouldn't recognise e, p or x more than once
+    TEST("123.4e-5e-4", 8, 123.4e-5, ios.goodbit);
+    TEST("0x123.4p-5p-4", 10, (double)hexfloat<double>(0x123, 0x4, -5), ios.goodbit);
+    TEST("0x123x5", 5, (double)hexfloat<double>(0x123, 0x0, 0), ios.goodbit);
+
+    // Invalid (non-float) inputs
+    TEST("a", 0, 0.0, ios.failbit);
+    TEST("e", 0, 0.0, ios.failbit);
+    TEST("f", 0, 0.0, ios.failbit);
+    TEST("p", 0, 0.0, ios.failbit);
+    TEST("M", 0, 0.0, ios.failbit);
+    TEST("{}", 0, 0.0, ios.failbit);
+    TEST("x123", 0, 0.0, ios.failbit);
+
+    // Incomplete inputs, i.e. eof before finished parsing
+    TEST("-", 1, 0.0, ios.eofbit | ios.failbit);
+    TEST("+", 1, 0.0, ios.eofbit | ios.failbit);
+    TEST("0x123.4p", 8, 0.0, ios.eofbit | ios.failbit);
+#endif
+
     {
         const char str[] = "123";
         assert((ios.flags() & ios.basefield) == ios.dec);
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
@@ -21,6 +21,7 @@
 #include "test_macros.h"
 #include "test_iterators.h"
 #include "hexfloat.h"
+#include "get_float_common.h"
 
 typedef std::num_get<char, cpp17_input_iterator<const char*> > F;
 
@@ -38,6 +39,41 @@ int main(int, char**)
     const my_facet f(1);
     std::ios ios(0);
     float v = -1;
+
+#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+    // Valid floating point formats where whole string is consumed
+    TEST("0x123.4f", 8, (float)hexfloat<float>(0x123, 0x4f, 0), ios.eofbit);
+    TEST("inf", 3, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("Inf", 3, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("+iNf", 4, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("-inF", 4, -INFINITY, ios.goodbit | ios.eofbit);
+    TEST("INFxyz", 3, INFINITY, ios.goodbit);
+
+    // Valid floating point formats with unparsed trailing characters
+    TEST("123.4f", 5, 123.4f, ios.goodbit);
+    TEST("123xyz", 3, 123.0f, ios.goodbit);
+    TEST("0x123.4+", 7, (float)hexfloat<float>(0x123, 0x4, 0), ios.goodbit);
+
+    // Shouldn't recognise e, p or x more than once
+    TEST("123.4e-5e-4", 8, 123.4e-5f, ios.goodbit);
+    TEST("0x123.4p-5p-4", 10, (float)hexfloat<float>(0x123, 0x4, -5), ios.goodbit);
+    TEST("0x123x5", 5, (float)hexfloat<float>(0x123, 0x0, 0), ios.goodbit);
+
+    // Invalid (non-float) inputs
+    TEST("a", 0, 0.0f, ios.failbit);
+    TEST("e", 0, 0.0f, ios.failbit);
+    TEST("f", 0, 0.0f, ios.failbit);
+    TEST("p", 0, 0.0f, ios.failbit);
+    TEST("M", 0, 0.0f, ios.failbit);
+    TEST("{}", 0, 0.0f, ios.failbit);
+    TEST("x123", 0, 0.0f, ios.failbit);
+
+    // Incomplete inputs, i.e. eof before finished parsing
+    TEST("-", 1, 0.0f, ios.eofbit | ios.failbit);
+    TEST("+", 1, 0.0f, ios.eofbit | ios.failbit);
+    TEST("0x123.4p", 8, 0.0f, ios.eofbit | ios.failbit);
+#endif
+
     {
         const char str[] = "123";
         assert((ios.flags() & ios.basefield) == ios.dec);
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float_common.h b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float_common.h
@@ -0,0 +1,21 @@
+#ifndef GET_FLOAT_COMMON_H
+#define GET_FLOAT_COMMON_H
+
+/// Read a double from the input string, check that the expected number of
+/// characters are read, the expected value is returned, and the expected
+/// error is set.
+#define TEST(STR, EXPECTED_LEN, EXPECTED_VAL, EXPECTED_ERR)                                                            \
+  {                                                                                                                    \
+    std::ios_base::iostate err = ios.goodbit;                                                                          \
+    cpp17_input_iterator<const char*> iter =                                                                           \
+        f.get(cpp17_input_iterator<const char*>((STR)), cpp17_input_iterator<const char*>((STR) + strlen((STR))), ios, \
+              err, v);                                                                                                 \
+    assert(iter.base() == (STR) + (EXPECTED_LEN) && "read wrong number of characters");                                \
+    assert(err == (EXPECTED_ERR));                                                                                     \
+    if (std::isnan(EXPECTED_VAL))                                                                                      \
+      assert(std::isnan(v) && "expected NaN value");                                                                   \
+    else                                                                                                               \
+      assert(v == (EXPECTED_VAL) && "wrong value");                                                                    \
+  }
+
+#endif
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
@@ -21,6 +21,7 @@
 #include "test_macros.h"
 #include "test_iterators.h"
 #include "hexfloat.h"
+#include "get_float_common.h"
 
 typedef std::num_get<char, cpp17_input_iterator<const char*> > F;
 
@@ -38,6 +39,41 @@ int main(int, char**)
     const my_facet f(1);
     std::ios ios(0);
     long double v = -1;
+
+#ifdef _LIBCPP_ABI_LOCAL_NUM_GET_NEW_STAGE2_FLOAT_LOOP
+    // Valid floating point formats where whole string is consumed
+    TEST("0x123.4f", 8, (long double)hexfloat<long double>(0x123, 0x4f, 0), ios.eofbit);
+    TEST("inf", 3, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("Inf", 3, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("+iNf", 4, INFINITY, ios.goodbit | ios.eofbit);
+    TEST("-inF", 4, -INFINITY, ios.goodbit | ios.eofbit);
+    TEST("INFxyz", 3, INFINITY, ios.goodbit);
+
+    // Valid floating point formats with unparsed trailing characters
+    TEST("123.4f", 5, 123.4l, ios.goodbit);
+    TEST("123xyz", 3, 123.0l, ios.goodbit);
+    TEST("0x123.4+", 7, (long double)hexfloat<long double>(0x123, 0x4, 0), ios.goodbit);
+
+    // Shouldn't recognise e, p or x more than once
+    TEST("123.4e-5e-4", 8, 123.4e-5l, ios.goodbit);
+    TEST("0x123.4p-5p-4", 10, (long double)hexfloat<long double>(0x123, 0x4, -5), ios.goodbit);
+    TEST("0x123x5", 5, (long double)hexfloat<long double>(0x123, 0x0, 0), ios.goodbit);
+
+    // Invalid (non-float) inputs
+    TEST("a", 0, 0.0l, ios.failbit);
+    TEST("e", 0, 0.0l, ios.failbit);
+    TEST("f", 0, 0.0l, ios.failbit);
+    TEST("p", 0, 0.0l, ios.failbit);
+    TEST("M", 0, 0.0l, ios.failbit);
+    TEST("{}", 0, 0.0l, ios.failbit);
+    TEST("x123", 0, 0.0l, ios.failbit);
+
+    // Incomplete inputs, i.e. eof before finished parsing
+    TEST("-", 1, 0.0l, ios.eofbit | ios.failbit);
+    TEST("+", 1, 0.0l, ios.eofbit | ios.failbit);
+    TEST("0x123.4p", 8, 0.0l, ios.eofbit | ios.failbit);
+#endif
+
     {
         const char str[] = "123";
         assert((ios.flags() & ios.basefield) == ios.dec);