Skip to content

Commit b9dab1f

Browse files
committed
address latest comments
1 parent b32b472 commit b9dab1f

File tree

3 files changed

+109
-38
lines changed

3 files changed

+109
-38
lines changed

llvm/include/llvm/Support/TextEncoding.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- TextEncodingConverter.h - Encoding conversion class -------*- C++ -*-=//
1+
//===-- TextEncoding.h - Text encoding conversion class -----------*- C++ -*-=//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,8 +12,8 @@
1212
///
1313
//===----------------------------------------------------------------------===//
1414

15-
#ifndef LLVM_SUPPORT_ENCODING_CONVERTER_H
16-
#define LLVM_SUPPORT_ENCODING_CONVERTER_H
15+
#ifndef LLVM_SUPPORT_TEXT_ENCODING_H
16+
#define LLVM_SUPPORT_TEXT_ENCODING_H
1717

1818
#include "llvm/ADT/SmallString.h"
1919
#include "llvm/ADT/StringRef.h"

llvm/lib/Support/TextEncoding.cpp

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- TextEncoding.cpp - Encoding conversion class --------------*- C++ -*-=//
1+
//===-- TextEncoding.cpp - Text encoding conversion class ---------*- C++ -*-=//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -82,7 +82,7 @@ enum ConversionType {
8282
// aforementioned encodings. The use of tables for conversion is only
8383
// possible because EBCDIC 1047 is a single-byte, stateless encoding; other
8484
// encodings are not supported.
85-
class TextEncodingConverterTable
85+
class TextEncodingConverterTable final
8686
: public details::TextEncodingConverterImplBase {
8787
const ConversionType ConvType;
8888

@@ -118,7 +118,8 @@ struct UConverterDeleter {
118118
};
119119
using UConverterUniquePtr = std::unique_ptr<UConverter, UConverterDeleter>;
120120

121-
class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
121+
class TextEncodingConverterICU final
122+
: public details::TextEncodingConverterImplBase {
122123
UConverterUniquePtr FromConvDesc;
123124
UConverterUniquePtr ToConvDesc;
124125

@@ -137,7 +138,8 @@ class TextEncodingConverterICU : public details::TextEncodingConverterImplBase {
137138
// TODO: The current implementation discards the partial result and restarts the
138139
// conversion from the beginning if there is a conversion error due to
139140
// insufficient buffer size. In the future, it would better to save the partial
140-
// result and redo the conversion for the remaining string.
141+
// result and resume the conversion for the remaining string.
142+
// TODO: Improve translation of ICU errors to error_code
141143
std::error_code
142144
TextEncodingConverterICU::convertString(StringRef Source,
143145
SmallVectorImpl<char> &Result) {
@@ -169,9 +171,12 @@ TextEncodingConverterICU::convertString(StringRef Source,
169171
/*pivotLimit=*/NULL, /*reset=*/true,
170172
/*flush=*/true, &EC);
171173
if (U_FAILURE(EC)) {
172-
if (EC == U_BUFFER_OVERFLOW_ERROR && Capacity < Result.max_size()) {
173-
HandleOverflow(Capacity, Output, OutputLength, Result);
174-
continue;
174+
if (EC == U_BUFFER_OVERFLOW_ERROR) {
175+
if (Capacity < Result.max_size()) {
176+
HandleOverflow(Capacity, Output, OutputLength, Result);
177+
continue;
178+
} else
179+
return std::error_code(E2BIG, std::generic_category());
175180
}
176181
// Some other error occured.
177182
Result.resize(Output - Result.data());
@@ -190,7 +195,7 @@ void TextEncodingConverterICU::reset() {
190195
}
191196

192197
#elif HAVE_ICONV
193-
class TextEncodingConverterIconv
198+
class TextEncodingConverterIconv final
194199
: public details::TextEncodingConverterImplBase {
195200
class UniqueIconvT {
196201
iconv_t ConvDesc;
@@ -230,7 +235,7 @@ class TextEncodingConverterIconv
230235
// TODO: The current implementation discards the partial result and restarts the
231236
// conversion from the beginning if there is a conversion error due to
232237
// insufficient buffer size. In the future, it would better to save the partial
233-
// result and redo the conversion for the remaining string.
238+
// result and resume the conversion for the remaining string.
234239
std::error_code
235240
TextEncodingConverterIconv::convertString(StringRef Source,
236241
SmallVectorImpl<char> &Result) {
@@ -249,7 +254,7 @@ TextEncodingConverterIconv::convertString(StringRef Source,
249254
if (errno == E2BIG && Capacity < Result.max_size()) {
250255
HandleOverflow(Capacity, Output, OutputLength, Result);
251256
// Reset converter
252-
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
257+
reset();
253258
return std::error_code();
254259
} else {
255260
// Some other error occured.
@@ -269,7 +274,7 @@ TextEncodingConverterIconv::convertString(StringRef Source,
269274
// Setup the input. Use nullptr to reset iconv state if input length is
270275
// zero.
271276
size_t InputLength = Source.size();
272-
char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
277+
char *Input = InputLength ? const_cast<char *>(Source.data()) : "";
273278
Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength);
274279
if (Ret != 0) {
275280
if (auto EC = HandleError(Ret))
@@ -291,7 +296,7 @@ TextEncodingConverterIconv::convertString(StringRef Source,
291296
return std::error_code();
292297
}
293298

294-
void TextEncodingConverterIconv::reset() {
299+
inline void TextEncodingConverterIconv::reset() {
295300
iconv(ConvDesc, nullptr, nullptr, nullptr, nullptr);
296301
}
297302

@@ -311,7 +316,7 @@ TextEncodingConverter::create(TextEncoding CPFrom, TextEncoding CPTo) {
311316
else if (CPFrom == TextEncoding::IBM1047 && CPTo == TextEncoding::UTF8)
312317
Conversion = IBM1047ToUTF8;
313318
else
314-
return std::error_code(errno, std::generic_category());
319+
return std::make_error_code(std::errc::invalid_argument);
315320

316321
return TextEncodingConverter(
317322
std::make_unique<TextEncodingConverterTable>(Conversion));
@@ -330,21 +335,20 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From,
330335
#if HAVE_ICU
331336
UErrorCode EC = U_ZERO_ERROR;
332337
UConverterUniquePtr FromConvDesc(ucnv_open(From.str().c_str(), &EC));
333-
if (U_FAILURE(EC)) {
334-
return std::error_code(errno, std::generic_category());
335-
}
338+
if (U_FAILURE(EC))
339+
return std::make_error_code(std::errc::invalid_argument);
340+
336341
UConverterUniquePtr ToConvDesc(ucnv_open(To.str().c_str(), &EC));
337-
if (U_FAILURE(EC)) {
338-
return std::error_code(errno, std::generic_category());
339-
}
340-
std::unique_ptr<details::TextEncodingConverterImplBase> Converter =
341-
std::make_unique<TextEncodingConverterICU>(std::move(FromConvDesc),
342-
std::move(ToConvDesc));
342+
if (U_FAILURE(EC))
343+
return std::make_error_code(std::errc::invalid_argument);
344+
345+
auto Converter = std::make_unique<TextEncodingConverterICU>(
346+
std::move(FromConvDesc), std::move(ToConvDesc));
343347
return TextEncodingConverter(std::move(Converter));
344348
#elif HAVE_ICONV
345349
iconv_t ConvDesc = iconv_open(To.str().c_str(), From.str().c_str());
346350
if (ConvDesc == (iconv_t)-1)
347-
return std::error_code(errno, std::generic_category());
351+
return std::make_error_code(std::errc::invalid_argument);
348352
return TextEncodingConverter(
349353
std::make_unique<TextEncodingConverterIconv>(ConvDesc));
350354
#else

llvm/unittests/Support/TextEncodingTest.cpp

Lines changed: 79 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88

99
#include "llvm/Support/TextEncoding.h"
1010
#include "llvm/ADT/SmallString.h"
11+
#include "llvm/Config/config.h"
1112
#include "gtest/gtest.h"
13+
1214
using namespace llvm;
1315

1416
namespace {
@@ -61,12 +63,8 @@ TEST(Encoding, FromUTF8) {
6163
ErrorOr<TextEncodingConverter> Conv =
6264
TextEncodingConverter::create(TextEncoding::UTF8, TextEncoding::IBM1047);
6365

64-
// Stop test if conversion is not supported.
65-
if (!Conv) {
66-
ASSERT_EQ(Conv.getError(),
67-
std::make_error_code(std::errc::invalid_argument));
68-
return;
69-
}
66+
// Converter should always exist between UTF-8 and IBM-1047
67+
EXPECT_TRUE(Conv);
7068

7169
std::error_code EC = Conv->convert(Src, Dst);
7270
EXPECT_TRUE(!EC);
@@ -101,12 +99,8 @@ TEST(Encoding, ToUTF8) {
10199
ErrorOr<TextEncodingConverter> Conv =
102100
TextEncodingConverter::create(TextEncoding::IBM1047, TextEncoding::UTF8);
103101

104-
// Stop test if conversion is not supported.
105-
if (!Conv) {
106-
ASSERT_EQ(Conv.getError(),
107-
std::make_error_code(std::errc::invalid_argument));
108-
return;
109-
}
102+
// Converter should always exist between UTF-8 and IBM-1047
103+
EXPECT_TRUE(Conv);
110104

111105
std::error_code EC = Conv->convert(Src, Dst);
112106

@@ -131,28 +125,45 @@ TEST(Encoding, ToUTF8) {
131125
TEST(Encoding, RoundTrip) {
132126
ErrorOr<TextEncodingConverter> ConvToUTF16 =
133127
TextEncodingConverter::create("IBM-1047", "UTF-16");
128+
129+
#if HAVE_ICU
130+
EXPECT_TRUE(ConvToUTF16);
131+
#else
134132
// Stop test if conversion is not supported (no underlying iconv support).
135133
if (!ConvToUTF16) {
136134
ASSERT_EQ(ConvToUTF16.getError(),
137135
std::make_error_code(std::errc::invalid_argument));
138136
return;
139137
}
138+
#endif
139+
140140
ErrorOr<TextEncodingConverter> ConvToUTF32 =
141141
TextEncodingConverter::create("UTF-16", "UTF-32");
142+
143+
#if HAVE_ICU
144+
EXPECT_TRUE(ConvToUTF32);
145+
#else
142146
// Stop test if conversion is not supported (no underlying iconv support).
143147
if (!ConvToUTF32) {
144148
ASSERT_EQ(ConvToUTF32.getError(),
145149
std::make_error_code(std::errc::invalid_argument));
146150
return;
147151
}
152+
#endif
153+
148154
ErrorOr<TextEncodingConverter> ConvToEBCDIC =
149155
TextEncodingConverter::create("UTF-32", "IBM-1047");
156+
157+
#if HAVE_ICU
158+
EXPECT_TRUE(ConvToEBCDIC);
159+
#else
150160
// Stop test if conversion is not supported (no underlying iconv support).
151161
if (!ConvToEBCDIC) {
152162
ASSERT_EQ(ConvToEBCDIC.getError(),
153163
std::make_error_code(std::errc::invalid_argument));
154164
return;
155165
}
166+
#endif
156167

157168
// Setup source string.
158169
char SrcStr[256];
@@ -177,12 +188,17 @@ TEST(Encoding, ShiftState2022) {
177188

178189
ErrorOr<TextEncodingConverter> ConvTo2022 =
179190
TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
191+
192+
#if HAVE_ICU
193+
EXPECT_TRUE(ConvTo2022);
194+
#else
180195
// Stop test if conversion is not supported (no underlying iconv support).
181196
if (!ConvTo2022) {
182197
ASSERT_EQ(ConvTo2022.getError(),
183198
std::make_error_code(std::errc::invalid_argument));
184199
return;
185200
}
201+
#endif
186202

187203
// Check that the string is properly converted.
188204
std::error_code EC = ConvTo2022->convert(Src, Dst);
@@ -197,31 +213,82 @@ TEST(Encoding, InvalidInput) {
197213

198214
ErrorOr<TextEncodingConverter> ConvTo2022 =
199215
TextEncodingConverter::create("UTF-8", "ISO-2022-JP");
216+
217+
#if HAVE_ICU
218+
EXPECT_TRUE(ConvTo2022);
219+
#else
200220
// Stop test if conversion is not supported (no underlying iconv support).
201221
if (!ConvTo2022) {
202222
ASSERT_EQ(ConvTo2022.getError(),
203223
std::make_error_code(std::errc::invalid_argument));
204224
return;
205225
}
226+
#endif
206227

207228
// Check that the string failed to convert.
208229
std::error_code EC = ConvTo2022->convert(Src, Dst);
209230
EXPECT_TRUE(EC);
210231
}
211232

233+
TEST(Encoding, InvalidOutput) {
234+
// Cyrillic in UTF-16
235+
ErrorOr<TextEncodingConverter> ConvToUTF16 =
236+
TextEncodingConverter::create("UTF-8", "UTF-16");
237+
238+
#if HAVE_ICU
239+
EXPECT_TRUE(ConvToUTF16);
240+
#else
241+
// Stop test if conversion is not supported (no underlying iconv support).
242+
if (!ConvToUTF16) {
243+
ASSERT_EQ(ConvToUTF16.getError(),
244+
std::make_error_code(std::errc::invalid_argument));
245+
return;
246+
}
247+
#endif
248+
249+
ErrorOr<TextEncodingConverter> ConvToEBCDIC =
250+
TextEncodingConverter::create("UTF-16", "IBM-1047");
251+
252+
#if HAVE_ICU
253+
EXPECT_TRUE(ConvToEBCDIC);
254+
#else
255+
// Stop test if conversion is not supported (no underlying iconv support).
256+
if (!ConvToEBCDIC) {
257+
ASSERT_EQ(ConvToEBCDIC.getError(),
258+
std::make_error_code(std::errc::invalid_argument));
259+
return;
260+
}
261+
#endif
262+
263+
// Cyrillic string. Convert to UTF-16 and check if properly converted
264+
StringRef Src(CyrillicUTF);
265+
SmallString<8> Dst, Dst1;
266+
std::error_code EC = ConvToUTF16->convert(Src, Dst);
267+
EXPECT_TRUE(!EC);
268+
269+
// Cyrillic string. Results in error because not representable in 1047.
270+
EC = ConvToEBCDIC->convert(Dst, Dst1);
271+
EXPECT_TRUE(EC);
272+
}
273+
212274
TEST(Encoding, ShiftStateIBM939) {
213275
// Earth string.
214276
StringRef Src(EarthUTF);
215277
SmallString<64> Dst;
216278

217279
ErrorOr<TextEncodingConverter> ConvToIBM939 =
218280
TextEncodingConverter::create("UTF-8", "IBM-939");
281+
282+
#if HAVE_ICU
283+
EXPECT_TRUE(ConvToIBM939);
284+
#else
219285
// Stop test if conversion is not supported (no underlying iconv support).
220286
if (!ConvToIBM939) {
221287
ASSERT_EQ(ConvToIBM939.getError(),
222288
std::make_error_code(std::errc::invalid_argument));
223289
return;
224290
}
291+
#endif
225292

226293
// Check that the string is properly converted.
227294
std::error_code EC = ConvToIBM939->convert(Src, Dst);

0 commit comments

Comments
 (0)