Skip to content

Commit 98eee4b

Browse files
sribee8 and Sriya Pratipati authored
[libc] utf8 to 32 CharacterConverter (#143973)
Implemented push and pop for UTF-8 to UTF-32 conversion, along with unit tests.

Co-authored-by: Sriya Pratipati <[email protected]>
1 parent 2488f26 commit 98eee4b

File tree

4 files changed

+263
-3
lines changed

4 files changed

+263
-3
lines changed

libc/src/__support/wchar/character_converter.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "hdr/types/char32_t.h"
1010
#include "hdr/types/char8_t.h"
11+
#include "src/__support/CPP/bit.h"
1112
#include "src/__support/common.h"
1213
#include "src/__support/error_or.h"
1314
#include "src/__support/math_extras.h"
@@ -30,6 +31,49 @@ bool CharacterConverter::isComplete() {
3031
return state->bytes_processed == state->total_bytes;
3132
}
3233

34+
// Feeds one UTF-8 code unit into the converter, accumulating the decoded
// payload bits in state->partial.
//
// Returns 0 if the byte was accepted and -1 if it is not valid at this point
// in the sequence. A rejected first byte only zeroes the accumulator; any
// later rejection resets the whole state via clear().
//
// Besides checking the byte prefixes, the completed code point is validated
// so that ill-formed UTF-8 is never handed out: overlong encodings, encoded
// UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF are rejected
// when the final byte of the sequence is pushed.
int CharacterConverter::push(char8_t utf8_byte) {
  const uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));

  // First byte of a character: its prefix determines the sequence length.
  if (state->bytes_processed == 0) {
    if (num_ones == 0) {
      // 0xxxxxxx: single-byte character, every remaining bit is payload.
      state->total_bytes = 1;
    } else if (num_ones >= 2 && num_ones <= 4) {
      // 110xxxxx, 1110xxxx and 11110xxx start 2, 3 and 4 byte sequences.
      // Begin with a mask of 7 ones and shift it right by the number of
      // leading ones to strip the length prefix and keep only the payload.
      constexpr size_t SIGNIFICANT_BITS = 7;
      uint32_t base_mask = mask_trailing_ones<uint32_t, SIGNIFICANT_BITS>();
      state->total_bytes = num_ones;
      utf8_byte &= (base_mask >> num_ones);
    } else {
      // A continuation byte (10xxxxxx) or a prefix of five or more ones can
      // never start a character. bytes_processed is 0 on this path, so only
      // the accumulator is zeroed.
      state->partial = static_cast<char32_t>(0);
      return -1;
    }
    state->partial = static_cast<char32_t>(utf8_byte);
    state->bytes_processed++;
    return 0;
  }

  // Any subsequent push must be a continuation byte (10xxxxxx) for a sequence
  // that is still missing bytes. Each one contributes 6 payload bits, so the
  // accumulator is shifted left to make room before merging them in.
  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
  if (num_ones == 1 && !isComplete()) {
    char32_t byte =
        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
    state->partial |= byte;
    state->bytes_processed++;

    // The sequence just became complete: make sure it is well-formed before
    // pop_utf32() is allowed to return it.
    if (isComplete()) {
      // Smallest code point that legitimately needs this many bytes; anything
      // below it is an overlong encoding.
      char32_t min_code_point = 0x10000;
      if (state->total_bytes == 2)
        min_code_point = 0x80;
      else if (state->total_bytes == 3)
        min_code_point = 0x800;

      constexpr char32_t MAX_CODE_POINT = 0x10FFFF;
      constexpr char32_t SURROGATE_FIRST = 0xD800;
      constexpr char32_t SURROGATE_LAST = 0xDFFF;

      const char32_t decoded = state->partial;
      const bool is_overlong = decoded < min_code_point;
      const bool is_surrogate =
          decoded >= SURROGATE_FIRST && decoded <= SURROGATE_LAST;
      const bool is_out_of_range = decoded > MAX_CODE_POINT;
      if (is_overlong || is_surrogate || is_out_of_range) {
        clear();
        return -1;
      }
    }
    return 0;
  }

  // Invalid byte -> reset the state
  clear();
  return -1;
}
76+
3377
int CharacterConverter::push(char32_t utf32) {
3478
// we can't be partially through a conversion when pushing a utf32 value
3579
if (!isComplete())
@@ -54,6 +98,17 @@ int CharacterConverter::push(char32_t utf32) {
5498
return -1;
5599
}
56100

101+
// Returns the UTF-32 code point accumulated by the preceding push() calls.
//
// Yields Error(-1) when no complete character is available, i.e. nothing has
// been pushed yet or the current sequence is still missing bytes. In that
// case the state is left untouched so the caller can keep pushing. After a
// successful pop the state is reset via clear().
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
  const bool has_full_character =
      isComplete() && state->bytes_processed != 0;
  if (!has_full_character)
    return Error(-1);

  // Grab the value before clear() resets the state.
  char32_t result = state->partial;
  clear();
  return result;
}
111+
57112
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
58113
if (isComplete())
59114
return Error(-1);

libc/test/src/__support/CMakeLists.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
275275
add_subdirectory(HashTable)
276276
add_subdirectory(time)
277277
add_subdirectory(threads)
278-
279-
# Requires access to uchar header which is not on macos
280-
# Therefore, cannot currently build this on macos in overlay mode
278+
# Requires access to the uchar header, which is not available on macOS.
279+
# Therefore this cannot currently be built on macOS in overlay mode.
281280
# Only descend into the wchar support tests when not targeting Darwin.
if(NOT LIBC_TARGET_OS_IS_DARWIN)
  add_subdirectory(wchar)
endif()

libc/test/src/__support/wchar/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
add_custom_target(libc-support-wchar-tests)
22

3+
# Unit tests for the UTF-8 -> UTF-32 direction of CharacterConverter.
add_libc_test(
  utf8_to_32_test
  SUITE libc-support-tests
  SRCS utf8_to_32_test.cpp
  DEPENDS libc.src.__support.wchar.character_converter
)
12+
313
add_libc_test(
414
utf32_to_8_test
515
SUITE
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
//===-- Unittests for character_converter utf8->utf32 ---------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/error_or.h"
10+
#include "src/__support/wchar/character_converter.h"
11+
#include "src/__support/wchar/mbstate.h"
12+
#include "test/UnitTest/Test.h"
13+
14+
// A single ASCII byte is a complete character and decodes to itself.
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;
  const char ascii = 'A';

  Converter converter(&mb);
  const int push_result = converter.push(static_cast<char8_t>(ascii));
  auto decoded = converter.pop_utf32();

  ASSERT_EQ(push_result, 0);
  ASSERT_TRUE(decoded.has_value());
  ASSERT_EQ(static_cast<int>(decoded.value()), 65);
}
28+
29+
// A well-formed two-byte sequence decodes to the expected code point, and
// every push along the way reports success.
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0xC2 0x8E encodes U+008E (decimal 142), a C1 control character.
  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // Check each push so a rejected byte is reported where it happens instead
  // of surfacing later as a confusing pop failure.
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  auto wch = char_conv.pop_utf32();

  ASSERT_TRUE(wch.has_value());
  ASSERT_EQ(static_cast<int>(wch.value()), 142);
}
44+
45+
// A well-formed three-byte sequence decodes to the expected code point, and
// every push along the way reports success.
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0xE2 0x88 0x91 encodes U+2211 N-ARY SUMMATION (decimal 8721).
  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
                      static_cast<char>(0x91)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // Check each push so a rejected byte is reported where it happens instead
  // of surfacing later as a confusing pop failure.
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[2]));
  ASSERT_EQ(err, 0);
  auto wch = char_conv.pop_utf32();

  ASSERT_TRUE(wch.has_value());
  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
}
61+
62+
// A well-formed four-byte sequence decodes to the expected code point, and
// every push along the way reports success.
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
  LIBC_NAMESPACE::internal::mbstate state;
  state.bytes_processed = 0;
  state.total_bytes = 0;
  // 0xF0 0x9F 0xA4 0xA1 encodes U+1F921 CLOWN FACE (decimal 129313).
  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
                      static_cast<char>(0xA4), static_cast<char>(0xA1)};

  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
  // Check each push so a rejected byte is reported where it happens instead
  // of surfacing later as a confusing pop failure.
  int err = char_conv.push(static_cast<char8_t>(ch[0]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[1]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[2]));
  ASSERT_EQ(err, 0);
  err = char_conv.push(static_cast<char8_t>(ch[3]));
  ASSERT_EQ(err, 0);
  auto wch = char_conv.pop_utf32();

  ASSERT_TRUE(wch.has_value());
  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
}
80+
81+
// A continuation byte (10xxxxxx) cannot be the first byte of a character.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;
  const unsigned char stray_continuation = 0x80;

  Converter converter(&mb);
  const int push_result =
      converter.push(static_cast<char8_t>(stray_continuation));

  ASSERT_EQ(push_result, -1);
}
92+
93+
// Alternates rejected and accepted bytes to check that an error never leaves
// the converter unable to start a fresh character.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;

  struct Step {
    unsigned char byte;
    int expected;
  };
  const Step steps[] = {
      {0x80, -1}, // continuation byte cannot start a character
      {0x00, 0},  // single-byte character is accepted
      {0x80, -1}, // previous character was already complete
      {0x00, 0},  // accepted again after the error
  };

  Converter converter(&mb);
  for (const Step &step : steps) {
    int push_result = converter.push(static_cast<char8_t>(step.byte));
    ASSERT_EQ(push_result, step.expected);
  }
}
112+
113+
// The first three bytes of a four-byte sequence are accepted, but the last
// one is rejected because 0xC0 (11000000) lacks the 10xxxxxx continuation
// prefix.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;
  const unsigned char accepted_prefix[3] = {0xF1, 0x80, 0x80};
  const unsigned char bad_last_byte = 0xC0;

  Converter converter(&mb);
  for (unsigned char byte : accepted_prefix) {
    int push_result = converter.push(static_cast<char8_t>(byte));
    ASSERT_EQ(push_result, 0);
  }

  int last_result = converter.push(static_cast<char8_t>(bad_last_byte));
  ASSERT_EQ(last_result, -1);
}
132+
133+
// Pushing a continuation byte after a complete two-byte character is an
// error, and that error discards the character that had been decoded.
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;
  const unsigned char complete_char[2] = {0xC2, 0x8E};
  const unsigned char extra_byte = 0x80;

  Converter converter(&mb);
  for (unsigned char byte : complete_char) {
    int push_result = converter.push(static_cast<char8_t>(byte));
    ASSERT_EQ(push_result, 0);
  }

  // The third byte has nowhere to go, so it must be rejected.
  int extra_result = converter.push(static_cast<char8_t>(extra_byte));
  ASSERT_EQ(extra_result, -1);

  // The rejection reset the mbstate, so nothing is left to pop.
  auto decoded = converter.pop_utf32();
  ASSERT_FALSE(decoded.has_value());
}
153+
154+
// Two two-byte characters decoded back to back through the same converter:
// a successful pop must leave it ready for the next character.
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;
  const unsigned char first_char[2] = {0xC2, 0x8E};
  const unsigned char second_char[2] = {0xC7, 0x8C};

  Converter converter(&mb);

  // First character.
  for (unsigned char byte : first_char) {
    int push_result = converter.push(static_cast<char8_t>(byte));
    ASSERT_EQ(push_result, 0);
  }
  auto decoded = converter.pop_utf32();
  ASSERT_TRUE(decoded.has_value());
  ASSERT_EQ(static_cast<int>(decoded.value()), 142);

  // Second character, reusing the same converter and state.
  for (unsigned char byte : second_char) {
    int push_result = converter.push(static_cast<char8_t>(byte));
    ASSERT_EQ(push_result, 0);
  }
  decoded = converter.pop_utf32();
  ASSERT_TRUE(decoded.has_value());
  ASSERT_EQ(static_cast<int>(decoded.value()), 460);
}
179+
180+
// Popping in the middle of a sequence fails without disturbing the state, so
// the remaining byte can still be pushed and the character decoded.
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
  using Converter = LIBC_NAMESPACE::internal::CharacterConverter;

  LIBC_NAMESPACE::internal::mbstate mb;
  mb.bytes_processed = 0;
  mb.total_bytes = 0;
  Converter converter(&mb);
  const unsigned char lead_byte = 0xC2;
  const unsigned char continuation_byte = 0x8E;

  int push_result = converter.push(static_cast<char8_t>(lead_byte));
  ASSERT_EQ(push_result, 0);

  // Only one of the two bytes has been pushed, so there is nothing to pop.
  auto decoded = converter.pop_utf32();
  ASSERT_FALSE(decoded.has_value());

  push_result = converter.push(static_cast<char8_t>(continuation_byte));
  ASSERT_EQ(push_result, 0);

  decoded = converter.pop_utf32();
  ASSERT_TRUE(decoded.has_value());
  ASSERT_EQ(static_cast<int>(decoded.value()), 142);
}

0 commit comments

Comments
 (0)