Skip to content

Commit 6e0b77b

Browse files
committed
address review comments
1 parent da7728a commit 6e0b77b

File tree

2 files changed

+49
-105
lines changed

2 files changed

+49
-105
lines changed

llvm/include/llvm/Support/CharSet.h

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- CharSet.h - Utility class to convert between char sets ----*- C++ -*-=//
1+
//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -35,9 +35,9 @@ class CharSetConverterImplBase {
3535

3636
/// Converts a string.
3737
/// \param[in] Source source string
38-
/// \param[in,out] Result container for converted string
38+
/// \param[out] Result container for converted string
3939
/// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
40-
/// for multi-byte encodings iff true.
40+
/// for stateful encodings if true.
4141
/// \return error code in case something went wrong
4242
///
4343
/// The following error codes can occur, among others:
@@ -59,9 +59,9 @@ class CharSetConverterImplBase {
5959
/// Restore the conversion to the original state.
6060
/// \return error code in case something went wrong
6161
///
62-
/// If the original character set or the destination character set
63-
/// are multi-byte character sets, set the shift state to the initial
64-
/// state. Otherwise this is a no-op.
62+
/// If the destination character set is a stateful character set,
63+
/// set the shift state to the initial state.
64+
/// Otherwise this is a no-op.
6565
virtual std::error_code flush() const = 0;
6666

6767
virtual std::error_code flush(SmallVectorImpl<char> &Result) const = 0;
@@ -80,7 +80,6 @@ enum class id {
8080
} // end namespace text_encoding
8181

8282
/// Utility class to convert between different character set encodings.
83-
/// The class always supports converting between EBCDIC 1047 and Latin-1/UTF-8.
8483
class CharSetConverter {
8584
// details::CharSetConverterImplBase *Converter;
8685
std::unique_ptr<details::CharSetConverterImplBase> Converter;
@@ -121,33 +120,30 @@ class CharSetConverter {
121120

122121
/// Converts a string.
123122
/// \param[in] Source source string
124-
/// \param[in,out] Result container for converted string
123+
/// \param[out] Result container for converted string
125124
/// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
126-
/// for multi-byte encodings.
125+
/// for stateful encodings.
127126
/// \return error code in case something went wrong
128127
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result,
129128
bool ShouldAutoFlush = true) const {
130129
return Converter->convert(Source, Result, ShouldAutoFlush);
131130
}
132131

132+
ErrorOr<std::string> convert(StringRef Source,
133+
bool ShouldAutoFlush = true) const {
134+
SmallString<1> Result;
135+
auto EC = Converter->convert(Source, Result, ShouldAutoFlush);
136+
if (!EC)
137+
return std::string(Result);
138+
return EC;
139+
}
140+
133141
char convert(char SingleChar) const {
134142
SmallString<1> Result;
135143
Converter->convert(StringRef(&SingleChar, 1), Result, false);
136144
return Result[0];
137145
}
138146

139-
/// Converts a string.
140-
/// \param[in] Source source string
141-
/// \param[in,out] Result container for converted string
142-
/// \param[in] ShouldAutoFlush Append shift-back sequence after conversion
143-
/// for multi-byte encodings iff true.
144-
/// \return error code in case something went wrong
145-
std::error_code convert(const std::string &Source,
146-
SmallVectorImpl<char> &Result,
147-
bool ShouldAutoFlush = true) const {
148-
return convert(StringRef(Source), Result, ShouldAutoFlush);
149-
}
150-
151147
std::error_code flush() const { return Converter->flush(); }
152148

153149
std::error_code flush(SmallVectorImpl<char> &Result) const {

llvm/lib/Support/CharSet.cpp

Lines changed: 32 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===-- CharSet.cpp - Utility class to convert between char sets --*- C++ -*-=//
1+
//===-- CharSet.cpp - Characters sets conversion class ------------*- C++ -*-=//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -32,7 +32,8 @@ using namespace llvm;
3232

3333
// Normalize the charset name with the charset alias matching algorithm proposed
3434
// in https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching.
35-
void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
35+
static void normalizeCharSetName(StringRef CSName,
36+
SmallVectorImpl<char> &Normalized) {
3637
bool PrevDigit = false;
3738
for (auto Ch : CSName) {
3839
if (isAlnum(Ch)) {
@@ -49,15 +50,26 @@ void normalizeCharSetName(StringRef CSName, SmallVectorImpl<char> &Normalized) {
4950
std::optional<text_encoding::id> getKnownCharSet(StringRef CSName) {
5051
SmallString<16> Normalized;
5152
normalizeCharSetName(CSName, Normalized);
52-
#define CSNAME(CS, STR) \
53-
if (Normalized.equals(STR)) \
54-
return CS
55-
CSNAME(text_encoding::id::UTF8, "utf8");
56-
CSNAME(text_encoding::id::IBM1047, "ibm1047");
57-
#undef CSNAME
53+
if (Normalized.equals("utf8"))
54+
return text_encoding::id::UTF8;
55+
if (Normalized.equals("ibm1047"))
56+
return text_encoding::id::IBM1047;
5857
return std::nullopt;
5958
}
6059

60+
void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength,
61+
SmallVectorImpl<char> &Result) {
62+
// No space left in output buffer. Double the size of the underlying
63+
// memory in the SmallVectorImpl, adjust pointer and length and continue
64+
// the conversion.
65+
Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
66+
? 2 * Capacity
67+
: std::numeric_limits<size_t>::max();
68+
Result.resize_for_overwrite(Capacity);
69+
Output = static_cast<char *>(Result.data());
70+
OutputLength = Capacity;
71+
}
72+
6173
namespace {
6274
enum ConversionType {
6375
UTFToIBM1047,
@@ -138,31 +150,12 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
138150
SmallVectorImpl<char> &Result,
139151
bool ShouldAutoFlush) const {
140152
// Setup the output. We directly write into the SmallVector.
153+
Result.resize_for_overwrite(Source.size());
141154
size_t OutputLength, Capacity = Result.capacity();
142155
char *Output, *Out;
143156

144157
UErrorCode EC = U_ZERO_ERROR;
145158

146-
auto HandleError = [&Capacity, &Output, &OutputLength,
147-
&Result](UErrorCode UEC) {
148-
if (UEC == U_BUFFER_OVERFLOW_ERROR &&
149-
Capacity < std::numeric_limits<size_t>::max()) {
150-
// No space left in output buffer. Double the size of the underlying
151-
// memory in the SmallVectorImpl, adjust pointer and length and continue
152-
// the conversion.
153-
Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
154-
? 2 * Capacity
155-
: std::numeric_limits<size_t>::max();
156-
Result.resize_for_overwrite(Capacity);
157-
Output = static_cast<char *>(Result.data());
158-
OutputLength = Capacity;
159-
return std::error_code();
160-
} else {
161-
// Some other error occured.
162-
return std::error_code(errno, std::generic_category());
163-
}
164-
};
165-
166159
do {
167160
EC = U_ZERO_ERROR;
168161
size_t InputLength = Source.size();
@@ -176,10 +169,15 @@ std::error_code CharSetConverterICU::convert(StringRef Source,
176169
ucnv_convertEx(ToConvDesc, FromConvDesc, &Output, Out + OutputLength,
177170
&Input, In + InputLength, /*pivotStart=*/NULL,
178171
/*pivotSource=*/NULL, /*pivotTarget=*/NULL,
179-
/*pivotLimit=*/NULL, /*reset=*/true, /*flush=*/true, &EC);
172+
/*pivotLimit=*/NULL, /*reset=*/true,
173+
/*flush=*/ShouldAutoFlush, &EC);
180174
if (U_FAILURE(EC)) {
181-
if (auto error = HandleError(EC))
182-
return error;
175+
if (EC == U_BUFFER_OVERFLOW_ERROR &&
176+
Capacity < std::numeric_limits<size_t>::max())
177+
HandleOverflow(Capacity, Output, OutputLength, Result);
178+
else
179+
// Some other error occured.
180+
return std::error_code(errno, std::generic_category());
183181
} else if (U_SUCCESS(EC))
184182
break;
185183
} while (U_FAILURE(EC));
@@ -215,8 +213,8 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
215213
size_t InputLength = Source.size();
216214
char *Input = InputLength ? const_cast<char *>(Source.data()) : nullptr;
217215
// Setup the output. We directly write into the SmallVector.
216+
Result.resize_for_overwrite(Source.size());
218217
size_t Capacity = Result.capacity();
219-
Result.resize_for_overwrite(Capacity);
220218
char *Output = InputLength ? static_cast<char *>(Result.data()) : nullptr;
221219
size_t OutputLength = Capacity;
222220

@@ -227,16 +225,7 @@ std::error_code CharSetConverterIconv::convert(StringRef Source,
227225
if (Ret == static_cast<size_t>(-1)) {
228226
// An error occured. Check if we can gracefully handle it.
229227
if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
230-
// No space left in output buffer. Double the size of the underlying
231-
// memory in the SmallVectorImpl, adjust pointer and length and continue
232-
// the conversion.
233-
const size_t Used = Capacity - OutputLength;
234-
Capacity = (Capacity < std::numeric_limits<size_t>::max() / 2)
235-
? 2 * Capacity
236-
: std::numeric_limits<size_t>::max();
237-
Result.resize_for_overwrite(Capacity);
238-
Output = static_cast<char *>(Result.data()) + Used;
239-
OutputLength = Capacity - Used;
228+
HandleOverflow(Capacity, Output, OutputLength, Result);
240229
return std::error_code();
241230
} else {
242231
// Some other error occured.
@@ -276,48 +265,7 @@ std::error_code CharSetConverterIconv::flush() const {
276265

277266
std::error_code
278267
CharSetConverterIconv::flush(SmallVectorImpl<char> &Result) const {
279-
char *Output = Result.data();
280-
size_t OutputLength = Result.capacity();
281-
size_t Capacity = Result.capacity();
282-
Result.resize_for_overwrite(Capacity);
283-
284-
// Handle errors returned from iconv().
285-
auto HandleError = [&Capacity, &Output, &OutputLength, &Result](size_t Ret) {
286-
if (Ret == static_cast<size_t>(-1)) {
287-
// An error occured. Check if we can gracefully handle it.
288-
if (errno == E2BIG && Capacity < std::numeric_limits<size_t>::max()) {
289-
// No space left in output buffer. Increase the size of the underlying
290-
// memory in the SmallVectorImpl by 2 bytes, adjust pointer and length
291-
// and continue the conversion.
292-
const size_t Used = Capacity - OutputLength;
293-
Capacity = (Capacity < std::numeric_limits<size_t>::max() - 2)
294-
? 2 + Capacity
295-
: std::numeric_limits<size_t>::max();
296-
Result.resize_for_overwrite(Capacity);
297-
Output = static_cast<char *>(Result.data()) + Used;
298-
OutputLength = Capacity - Used;
299-
return std::error_code();
300-
} else {
301-
// Some other error occured.
302-
return std::error_code(errno, std::generic_category());
303-
}
304-
} else {
305-
// A positive return value indicates that some characters were converted
306-
// in a nonreversible way, that is, replaced with a SUB symbol. Returning
307-
// an error in this case makes sure that both conversion routines behave
308-
// in the same way.
309-
return std::make_error_code(std::errc::illegal_byte_sequence);
310-
}
311-
};
312-
313-
size_t Ret;
314-
while ((Ret = iconv(ConvDesc, nullptr, nullptr, &Output, &OutputLength)))
315-
if (auto EC = HandleError(Ret))
316-
return EC;
317-
318-
// Re-adjust size to actual size.
319-
Result.resize(Capacity - OutputLength);
320-
return std::error_code();
268+
return convert(nullptr, Result);
321269
}
322270

323271
#endif // HAVE_ICONV

0 commit comments

Comments
 (0)