Skip to content

Commit 8adccae

Browse files
[libc] Implemented CharacterConverter push/pop for utf32->utf8 conversions (#143971)
Implemented the CharacterConverter methods for converting UTF-32 to UTF-8, and added tests. --------- Co-authored-by: Michael Jones <[email protected]>
1 parent a637584 commit 8adccae

File tree

8 files changed

+278
-39
lines changed

8 files changed

+278
-39
lines changed

libc/src/__support/wchar/CMakeLists.txt

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,7 @@ add_object_library(
1515
DEPENDS
1616
libc.hdr.types.char8_t
1717
libc.hdr.types.char32_t
18+
libc.src.__support.error_or
19+
libc.src.__support.math_extras
1820
.mbstate
19-
.utf_ret
20-
)
21-
22-
add_header_library(
23-
utf_ret
24-
HDRS
25-
utf_ret.h
2621
)

libc/src/__support/wchar/character_converter.cpp

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88

99
#include "hdr/types/char32_t.h"
1010
#include "hdr/types/char8_t.h"
11+
#include "src/__support/common.h"
12+
#include "src/__support/error_or.h"
13+
#include "src/__support/math_extras.h"
1114
#include "src/__support/wchar/mbstate.h"
12-
#include "src/__support/wchar/utf_ret.h"
1315

1416
#include "character_converter.h"
1517

@@ -18,17 +20,75 @@ namespace internal {
1820

1921
// Stores the given conversion-state pointer for later use by the other
// methods. Nothing in the pointed-to state is read or written here.
CharacterConverter::CharacterConverter(mbstate *mbstate) : state(mbstate) {}
2022

23+
// Resets the conversion state: zeroes both byte counters and the stored
// partial code point. Afterwards the counters are equal (0 == 0), so
// isComplete() reports true.
void CharacterConverter::clear() {
  state->total_bytes = 0;
  state->bytes_processed = 0;
  state->partial = 0;
}
28+
2129
bool CharacterConverter::isComplete() {
2230
return state->bytes_processed == state->total_bytes;
2331
}
2432

25-
int CharacterConverter::push(char8_t utf8_byte) {}
33+
int CharacterConverter::push(char32_t utf32) {
34+
// we can't be partially through a conversion when pushing a utf32 value
35+
if (!isComplete())
36+
return -1;
37+
38+
state->partial = utf32;
39+
state->bytes_processed = 0;
40+
41+
// determine number of utf-8 bytes needed to represent this utf32 value
42+
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
43+
constexpr int NUM_RANGES = 4;
44+
for (uint8_t i = 0; i < NUM_RANGES; i++) {
45+
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
46+
state->total_bytes = i + 1;
47+
return 0;
48+
}
49+
}
50+
51+
// `utf32` contains a value that is too large to actually represent a valid
52+
// unicode character
53+
clear();
54+
return -1;
55+
}
56+
57+
// Produces the next UTF-8 byte of the code point held in the state, most
// significant bits first, and advances the processed-byte counter.
// Returns Error(-1) when isComplete() is already true, i.e. there is no byte
// left to hand out.
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
  if (isComplete())
    return Error(-1);

  // the number of bits per utf-8 byte that carry character data rather than
  // metadata (# of bits excluding the byte headers)
  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
  constexpr int MASK_ENCODED_BITS =
      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();

  // Header bits of the leading byte, indexed by (sequence length - 1).
  constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
  // Header bits of every following byte: 10xxxxxx
  constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;

  const bool is_leading_byte = state->bytes_processed == 0;

  // The bits wanted now sit above the 6-bit payloads of all bytes that are
  // still to be popped after this one.
  const char32_t shift_amount =
      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
  const char32_t shifted = state->partial >> shift_amount;

  // Leading byte: length header plus the most significant bits of the value.
  // Any other byte: continuation header plus the next 6 bits.
  const char32_t output =
      is_leading_byte
          ? (FIRST_BYTE_HEADERS[state->total_bytes - 1] | shifted)
          : (CONTINUING_BYTE_HEADER | (shifted & MASK_ENCODED_BITS));

  state->bytes_processed++;
  return static_cast<char8_t>(output);
}
3292

3393
} // namespace internal
3494
} // namespace LIBC_NAMESPACE_DECL

libc/src/__support/wchar/character_converter.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111

1212
#include "hdr/types/char32_t.h"
1313
#include "hdr/types/char8_t.h"
14+
#include "src/__support/common.h"
15+
#include "src/__support/error_or.h"
1416
#include "src/__support/wchar/mbstate.h"
15-
#include "src/__support/wchar/utf_ret.h"
1617

1718
namespace LIBC_NAMESPACE_DECL {
1819
namespace internal {
@@ -24,13 +25,14 @@ class CharacterConverter {
2425
public:
2526
CharacterConverter(mbstate *mbstate);
2627

28+
void clear();
2729
bool isComplete();
2830

2931
int push(char8_t utf8_byte);
3032
int push(char32_t utf32);
3133

32-
utf_ret<char8_t> pop_utf8();
33-
utf_ret<char32_t> pop_utf32();
34+
ErrorOr<char8_t> pop_utf8();
35+
ErrorOr<char32_t> pop_utf32();
3436
};
3537

3638
} // namespace internal

libc/src/__support/wchar/mbstate.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
1717
namespace internal {
1818

1919
struct mbstate {
  // Code point of the character currently being converted, held as UTF-32.
  char32_t partial;

  // How far the current conversion has progressed, in bytes:
  //  - utf8 -> utf32: goes up with each CharacterConverter::push(utf8_byte)
  //  - utf32 -> utf8: goes up with each CharacterConverter::pop_utf8()
  uint8_t bytes_processed;

  // How many bytes are needed in total to represent this character.
  uint8_t total_bytes;
};
2433

libc/src/__support/wchar/utf_ret.h

Lines changed: 0 additions & 24 deletions
This file was deleted.

libc/test/src/__support/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
275275
add_subdirectory(HashTable)
276276
add_subdirectory(time)
277277
add_subdirectory(threads)
278+
279+
# The wchar tests require the uchar header, which is not available on macOS.
280+
# Therefore they cannot currently be built on macOS in overlay mode.
281+
if(NOT(LIBC_TARGET_OS_IS_DARWIN))
282+
add_subdirectory(wchar)
283+
endif()
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Declares the libc-support-wchar-tests target.
# NOTE(review): the test below is registered under SUITE libc-support-tests,
# not under this target -- confirm that is intended.
add_custom_target(libc-support-wchar-tests)
2+
3+
add_libc_test(
4+
utf32_to_8_test
5+
SUITE
6+
libc-support-tests
7+
SRCS
8+
utf32_to_8_test.cpp
9+
DEPENDS
10+
libc.src.__support.wchar.character_converter
11+
)
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "src/__support/common.h"
10+
#include "src/__support/wchar/character_converter.h"
11+
#include "src/__support/wchar/mbstate.h"
12+
13+
#include "test/UnitTest/Test.h"
14+
15+
// Code points in the 1-byte range must come back out as exactly one UTF-8
// byte whose value equals the UTF-32 value.
TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` starts out uninitialized; clear() zeroes it before first use
  cr.clear();

  // utf8 1-byte encodings are identical to their utf32 representations
  char32_t utf32_A = 0x41; // 'A'
  // check the push itself so a rejected value fails here, not at the pop
  ASSERT_EQ(cr.push(utf32_A), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<char>(popped.value()), 'A');
  ASSERT_TRUE(cr.isComplete());

  char32_t utf32_B = 0x42; // 'B'
  ASSERT_EQ(cr.push(utf32_B), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<char>(popped.value()), 'B');
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
39+
40+
// Code points in the 2-byte range must be popped as a leading byte followed
// by one continuation byte; the conversion is complete only after the second.
TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` starts out uninitialized; clear() zeroes it before first use
  cr.clear();

  // testing utf32: 0xff -> utf8: 0xc3 0xbf
  char32_t utf32 = 0xff;
  // check the push itself so a rejected value fails here, not at the pop
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
  ASSERT_TRUE(cr.isComplete());

  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
  utf32 = 0x58e;
  ASSERT_EQ(cr.push(utf32), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
73+
74+
// Code points in the 3-byte range must be popped as a leading byte followed
// by two continuation bytes; the conversion is complete only after the third.
TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` starts out uninitialized; clear() zeroes it before first use
  cr.clear();

  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
  char32_t utf32 = 0xac15;
  // check the push itself so a rejected value fails here, not at the pop
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
  ASSERT_TRUE(cr.isComplete());

  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
  utf32 = 0x267b;
  ASSERT_EQ(cr.push(utf32), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
115+
116+
// Code points in the 4-byte range must be popped as a leading byte followed
// by three continuation bytes; the conversion is complete only after the
// fourth.
TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  // `state` starts out uninitialized; clear() zeroes it before first use
  cr.clear();

  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
  char32_t utf32 = 0x1f921;
  // check the push itself so a rejected value fails here, not at the pop
  ASSERT_EQ(cr.push(utf32), 0);
  auto popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
  ASSERT_TRUE(cr.isComplete());

  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
  utf32 = 0x12121;
  ASSERT_EQ(cr.push(utf32), 0);
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
  ASSERT_FALSE(cr.isComplete());
  popped = cr.pop_utf8();
  ASSERT_TRUE(popped.has_value());
  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
  ASSERT_TRUE(cr.isComplete());

  // should error if we try to pop another utf8 byte out
  popped = cr.pop_utf8();
  ASSERT_FALSE(popped.has_value());
}
165+
166+
// A new code point must be refused while utf8 bytes of the previous one are
// still waiting to be popped.
TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
  LIBC_NAMESPACE::internal::mbstate state;
  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
  cr.clear();

  // 0x12121 takes four utf8 bytes: 0xf0 0x92 0x84 0xa1
  const char32_t four_byte_char = 0x12121;
  ASSERT_EQ(cr.push(four_byte_char), 0);

  // pop only the first byte, which leaves the conversion unfinished
  auto first_byte = cr.pop_utf8();
  ASSERT_TRUE(first_byte.has_value());

  // pushing again before the remaining bytes are popped must fail
  ASSERT_EQ(cr.push(four_byte_char), -1);
}

0 commit comments

Comments
 (0)