-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[libc] mbrtowc implementation #144760
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] mbrtowc implementation #144760
Conversation
implemented the internal and public mbrtowc as well as tests for the public function.
@llvm/pr-subscribers-libc Author: None (sribee8) Changes: Implemented the internal and public mbrtowc as well as tests for the public function. Full diff: https://github.com/llvm/llvm-project/pull/144760.diff 13 Files Affected:
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index aa2079faed409..10509a0c25835 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -365,6 +365,7 @@ set(TARGET_LIBC_ENTRYPOINTS
# wchar.h entrypoints
libc.src.wchar.btowc
+ libc.src.wchar.mbrtowc
libc.src.wchar.wcslen
libc.src.wchar.wctob
libc.src.wchar.wmemmove
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index c88c357009072..e4b3cb0faa820 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -20,6 +20,14 @@ add_proxy_header_library(
libc.include.uchar
)
+add_proxy_header_library(
+ mbstate_t
+ HDRS
+ mbstate_t.h
+ DEPENDS
+ libc.include.llvm-libc-types.mbstate_t
+)
+
add_proxy_header_library(
div_t
HDRS
diff --git a/libc/hdr/types/mbstate_t.h b/libc/hdr/types/mbstate_t.h
new file mode 100644
index 0000000000000..15b2614341d7d
--- /dev/null
+++ b/libc/hdr/types/mbstate_t.h
@@ -0,0 +1,22 @@
+//===-- Proxy for mbstate_t -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
+#define LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/mbstate_t.h"
+
+#else // Overlay mode
+
+#include "hdr/wchar_overlay.h"
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_MBSTATE_T_H
diff --git a/libc/include/llvm-libc-types/mbstate_t.h b/libc/include/llvm-libc-types/mbstate_t.h
index 540d50975a264..009fe57da50e2 100644
--- a/libc/include/llvm-libc-types/mbstate_t.h
+++ b/libc/include/llvm-libc-types/mbstate_t.h
@@ -9,8 +9,12 @@
#ifndef LLVM_LIBC_TYPES_MBSTATE_T_H
#define LLVM_LIBC_TYPES_MBSTATE_T_H
-// TODO: Complete this once we implement functions that operate on this type.
+#include "../llvm-libc-macros/stdint-macros.h"
+
+// Conversion state passed to the restartable multibyte functions.
+// NOTE(review): the public mbrtowc entrypoint casts a pointer to this struct
+// to the internal mbstate type, so the field layout here presumably has to
+// stay in sync with that type -- confirm against the internal definition.
typedef struct {
+ uint32_t __field1;
+ uint8_t __field2;
+ uint8_t __field3;
} mbstate_t;
#endif // LLVM_LIBC_TYPES_MBSTATE_T_H
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 84db73d8f01ea..06c621f59b462 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -29,6 +29,15 @@ functions:
return_type: wint_t
arguments:
- type: int
+ - name: mbrtowc
+ standards:
+ - stdc
+ return_type: size_t
+ arguments:
+ - type: wchar_t * __restrict
+ - type: const char * __restrict
+ - type: size_t
+ - type: mbstate_t * __restrict
- name: wmemset
standards:
- stdc
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 6715e354e23e5..479c1dff2c6e0 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -19,3 +19,19 @@ add_object_library(
libc.src.__support.math_extras
.mbstate
)
+
+add_object_library(
+ mbrtowc
+ HDRS
+ mbrtowc.h
+ SRCS
+ mbrtowc.cpp
+ DEPENDS
+ libc.hdr.types.wchar_t
+ libc.hdr.types.size_t
+ libc.src.__support.common
+ libc.src.__support.error_or
+ libc.src.__support.macros.config
+ .character_converter
+ .mbstate
+)
diff --git a/libc/src/__support/wchar/mbrtowc.cpp b/libc/src/__support/wchar/mbrtowc.cpp
new file mode 100644
index 0000000000000..969448ee60e81
--- /dev/null
+++ b/libc/src/__support/wchar/mbrtowc.cpp
@@ -0,0 +1,50 @@
+//===-- Implementation for mbrtowc function ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/mbrtowc.h"
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Decodes at most one wide character from the first `n` bytes of `s`, using
+// `ps` to carry partially decoded input from one call to the next.
+//
+// Returns:
+//   0          if `s` is null, or if the decoded character is the null wide
+//              character.
+//   i          the number of bytes consumed by this call when a non-null
+//              character was completed.
+//   Error(-1)  if the converter rejects a byte (encoding error).
+//   Error(-2)  if the available bytes were consumed without completing a
+//              character.
+//
+// `pwc` may be null; the decoded character is then discarded, but the return
+// value and the state update are the same as for a non-null `pwc`.
+ErrorOr<size_t> mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
+                        size_t n, mbstate *__restrict ps) {
+  CharacterConverter char_conv(ps);
+  if (s == nullptr)
+    return 0;
+  size_t i = 0;
+  // A character may already be available before any new byte is read.
+  auto wc = char_conv.pop_utf32();
+  // Reading in bytes until we have a complete wc or error
+  for (; i < n && !wc.has_value(); ++i) {
+    int err = char_conv.push(static_cast<char8_t>(s[i]));
+    // Encoding error
+    if (err == -1)
+      return Error(-1);
+    wc = char_conv.pop_utf32();
+  }
+  if (wc.has_value()) {
+    // The C standard allows callers to pass a null pwc when they only need
+    // the byte count, so never store through it unconditionally.
+    if (pwc != nullptr)
+      *pwc = wc.value();
+    // null terminator -> return 0
+    if (wc.value() == L'\0')
+      return 0;
+    return i;
+  }
+  // Incomplete but potentially valid
+  return Error(-2);
+}
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/mbrtowc.h b/libc/src/__support/wchar/mbrtowc.h
new file mode 100644
index 0000000000000..37329ee61beac
--- /dev/null
+++ b/libc/src/__support/wchar/mbrtowc.h
@@ -0,0 +1,29 @@
+//===-- Implementation header for mbrtowc function --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC
+#define LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// Decodes at most one wide character from the first `n` bytes of `s` into
+// `*pwc`, keeping partially decoded input in `ps` between calls.
+// Returns 0 for a null `s` or a decoded null character, otherwise the number
+// of bytes consumed; yields Error(-1) on an encoding error and Error(-2) when
+// the input ends before a character is complete.
+ErrorOr<size_t> mbrtowc(wchar_t *__restrict pwc, const char *__restrict s,
+ size_t n, mbstate *__restrict ps);
+
+} // namespace internal
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_WCHAR_MBRTOWC
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 491dd5b34340a..163c29847e6a2 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -34,6 +34,23 @@ add_entrypoint_object(
libc.src.__support.wctype_utils
)
+add_entrypoint_object(
+ mbrtowc
+ SRCS
+ mbrtowc.cpp
+ HDRS
+ mbrtowc.h
+ DEPENDS
+ libc.hdr.types.size_t
+ libc.hdr.types.mbstate_t
+ libc.hdr.types.wchar_t
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.wchar.mbrtowc
+ libc.src.__support.libc_errno
+ libc.src.__support.wchar.mbstate
+)
+
add_entrypoint_object(
wmemset
SRCS
diff --git a/libc/src/wchar/mbrtowc.cpp b/libc/src/wchar/mbrtowc.cpp
new file mode 100644
index 0000000000000..c29c5ee161e32
--- /dev/null
+++ b/libc/src/wchar/mbrtowc.cpp
@@ -0,0 +1,40 @@
+//===-- Implementation of mbrtowc -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/mbrtowc.h"
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbrtowc.h"
+#include "src/__support/wchar/mbstate.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Public mbrtowc entrypoint. Forwards to internal::mbrtowc and maps its
+// result onto the C return convention: the value on success, (size_t)-1 with
+// errno set to EILSEQ on an encoding error, and (size_t)-2 for any other
+// error (an incomplete sequence).
+LLVM_LIBC_FUNCTION(size_t, mbrtowc,
+                   (wchar_t *__restrict pwc, const char *__restrict s, size_t n,
+                    mbstate_t *__restrict ps)) {
+  // Fallback conversion state used by every call that passes a null `ps`.
+  static mbstate_t internal_mbstate{0, 0, 0};
+  mbstate_t *state = ps;
+  if (state == nullptr)
+    state = &internal_mbstate;
+
+  auto result = internal::mbrtowc(
+      pwc, s, n, reinterpret_cast<internal::mbstate *>(state));
+  if (result.has_value())
+    return result.value();
+
+  // Encoding failure
+  if (result.error() == -1) {
+    libc_errno = EILSEQ;
+    return static_cast<size_t>(-1);
+  }
+  // Could potentially read a valid wide character.
+  return static_cast<size_t>(-2);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/mbrtowc.h b/libc/src/wchar/mbrtowc.h
new file mode 100644
index 0000000000000..e2e3d3ebd2853
--- /dev/null
+++ b/libc/src/wchar/mbrtowc.h
@@ -0,0 +1,24 @@
+//===-- Implementation header for mbrtowc ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
+#define LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// Converts the next multibyte character in `s` (at most `n` bytes) to a wide
+// character. Returns (size_t)-1 and sets errno to EILSEQ on an encoding error,
+// and (size_t)-2 when the bytes seen so far do not yet form a complete
+// character. A null `ps` selects a state object internal to the function.
+size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, size_t n,
+ mbstate_t *__restrict ps);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_MBRTOWC_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 4990b6953348b..d4cae1f6228bd 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -23,6 +23,20 @@ add_libc_test(
libc.src.wchar.btowc
)
+add_libc_test(
+ mbrtowc_test
+ SUITE
+ libc_wchar_unittests
+ SRCS
+ mbrtowc_test.cpp
+ DEPENDS
+ libc.src.__support.libc_errno
+ libc.src.string.memset
+ libc.src.wchar.mbrtowc
+ libc.hdr.types.mbstate_t
+ libc.hdr.types.wchar_t
+)
+
add_libc_test(
wctob_test
SUITE
diff --git a/libc/test/src/wchar/mbrtowc_test.cpp b/libc/test/src/wchar/mbrtowc_test.cpp
new file mode 100644
index 0000000000000..6e96e7ac31f49
--- /dev/null
+++ b/libc/test/src/wchar/mbrtowc_test.cpp
@@ -0,0 +1,170 @@
+//===-- Unittests for mbrtowc ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/mbstate_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/libc_errno.h"
+#include "src/string/memset.h"
+#include "src/wchar/mbrtowc.h"
+#include "test/UnitTest/Test.h"
+
+// A one-byte character is decoded with a null mbstate_t pointer, and a call
+// with n == 0 reports an incomplete sequence.
+TEST(LlvmLibcMBRToWC, OneByte) {
+  wchar_t dest[2];
+  const char *ch = "A";
+  // Testing if it works with nullptr mbstate_t
+  mbstate_t *null_state = nullptr;
+  size_t consumed = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, null_state);
+  ASSERT_EQ(static_cast<char>(dest[0]), 'A');
+  ASSERT_EQ(static_cast<int>(consumed), 1);
+
+  // Should fail since we have not read enough
+  consumed = LIBC_NAMESPACE::mbrtowc(dest, ch, 0, null_state);
+  ASSERT_EQ(static_cast<int>(consumed), -2);
+}
+
+// A two-byte sequence decodes in one call, and also when split across two
+// calls that share the same conversion state.
+TEST(LlvmLibcMBRToWC, TwoByte) {
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; // U+008E, decodes to 142
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  ASSERT_EQ(static_cast<int>(n), 2);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading one more byte
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), 1);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+// A three-byte sequence decodes in one call, and also when split 1 + 2 across
+// two calls that share the same conversion state.
+TEST(LlvmLibcMBRToWC, ThreeByte) {
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, &mb);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+  ASSERT_EQ(static_cast<int>(n), 3);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading two more bytes
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 8721);
+}
+
+// A four-byte sequence decodes in one call, and also when split 2 + 2 across
+// two calls that share the same conversion state.
+TEST(LlvmLibcMBRToWC, FourByte) {
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+  ASSERT_EQ(static_cast<int>(n), 4);
+
+  // Should fail since we have not read enough
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), -2);
+  // Should pass after reading two more bytes
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 2, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 129313);
+}
+
+// A lone continuation byte is rejected with -1 and errno set to EILSEQ.
+TEST(LlvmLibcMBRToWC, InvalidByte) {
+  const char ch[1] = {static_cast<char>(0x80)};
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}
+
+// An invalid leading byte is rejected regardless of how many bytes are
+// offered, and decoding can continue from the following null byte.
+TEST(LlvmLibcMBRToWC, InvalidMultiByte) {
+  const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+                      static_cast<char>(0x80),
+                      static_cast<char>(0x00)}; // invalid sequence of bytes
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+  // Trying to push just the first one should error
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+  // Trying to push the second and third should correspond to null wc
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch + 1, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+}
+
+// A four-byte sequence whose final byte is not a continuation byte is
+// rejected with -1 and errno set to EILSEQ.
+TEST(LlvmLibcMBRToWC, InvalidLastByte) {
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 4 should error
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 4, &mb);
+  ASSERT_EQ(static_cast<int>(n), -1);
+  ASSERT_EQ(static_cast<int>(libc_errno), EILSEQ);
+}
+
+// Decoding stops once one character is complete, even when more bytes are
+// offered than that character needs.
+TEST(LlvmLibcMBRToWC, ValidTwoByteWithExtraRead) {
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // Trying to push all 3 should return valid 2 byte
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 3, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+}
+
+// Two consecutive two-byte characters decode correctly through the same
+// conversion state.
+TEST(LlvmLibcMBRToWC, TwoValidTwoBytes) {
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // mbstate should reset after reading first one
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, ch, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*dest), 142);
+  n = LIBC_NAMESPACE::mbrtowc(dest + 1, ch + 2, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 2);
+  ASSERT_EQ(static_cast<int>(*(dest + 1)), 460);
+}
+
+// Both a null input pointer and an input holding only the null terminator
+// produce a return value of 0.
+TEST(LlvmLibcMBRToWC, NullString) {
+  wchar_t dest[2];
+  // Use a real, zero-initialized state object: zeroing a pointer variable
+  // would hand the function a null state (or overrun the pointer's storage).
+  mbstate_t mb;
+  LIBC_NAMESPACE::memset(&mb, 0, sizeof(mbstate_t));
+  // reading on nullptr should return 0
+  size_t n = LIBC_NAMESPACE::mbrtowc(dest, nullptr, 2, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+  // reading a null terminator should return 0
+  const char *ch = "\0";
+  n = LIBC_NAMESPACE::mbrtowc(dest, ch, 1, &mb);
+  ASSERT_EQ(static_cast<int>(n), 0);
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mostly good, just a few small suggestions
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
overall looks good
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Overall looks good, while you're doing the last fix I'll run the tests on my machine
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tests passed. I found one small thing but once that's fixed this is good to merge.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/131/builds/24439 Here is the relevant piece of the build log for the reference
|
implemented the internal and public mbrtowc as well as tests for the public function. --------- Co-authored-by: Sriya Pratipati <[email protected]>
implemented the internal and public mbrtowc as well as tests for the public function.