-
Notifications
You must be signed in to change notification settings - Fork 14.3k
Create a CharSetConverter class with both iconv and icu support #74516
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
faf0bcd
87991d3
226756d
76cc37a
53e185a
f032bc9
241a597
c08945e
a294096
9a55df0
fa06563
5eb3d5c
f19d93d
4ce5ee2
a3e9f45
5386a17
2e04f3d
d721e8b
f6e8d52
f4e3ec2
eeaf034
cfd8e5d
3ee9f4f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
//===-- CharSet.h - Character set conversion class ----------------*- C++ -*-=// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
/// | ||
/// \file | ||
/// This file provides a utility class to convert between different character | ||
/// set encodings. | ||
/// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_SUPPORT_CHARSET_H | ||
#define LLVM_SUPPORT_CHARSET_H | ||
|
||
#include "llvm/ADT/SmallString.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Config/config.h" | ||
#include "llvm/Support/ErrorOr.h" | ||
|
||
#include <string> | ||
#include <system_error> | ||
|
||
namespace llvm { | ||
|
||
template <typename T> class SmallVectorImpl; | ||
|
||
namespace details { | ||
class CharSetConverterImplBase { | ||
|
||
private: | ||
/// Converts a string. | ||
/// \param[in] Source source string | ||
/// \param[out] Result container for converted string | ||
/// \return error code in case something went wrong | ||
/// | ||
/// The following error codes can occur, among others: | ||
/// - std::errc::argument_list_too_long: The result requires more than | ||
/// std::numeric_limits<size_t>::max() bytes. | ||
/// - std::errc::illegal_byte_sequence: The input contains an invalid | ||
/// multibyte sequence. | ||
/// - std::errc::invalid_argument: The input contains an incomplete | ||
/// multibyte sequence. | ||
/// | ||
/// If the destination charset is a stateful character set, the shift state | ||
/// will be set to the initial state. | ||
/// | ||
/// In case of an error, the result string contains the successfully converted | ||
/// part of the input string. | ||
Comment on lines
+50
to
+51
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just noting that this is probably insufficiently tested. The functionality may be useful for printing at least the first part of static_assert messages where conversion to the encoding used for diagnostic messages fails. As a later improvement, a |
||
/// | ||
virtual std::error_code convertString(StringRef Source, | ||
SmallVectorImpl<char> &Result) = 0; | ||
|
||
/// Resets the converter to the initial state. | ||
virtual void reset() = 0; | ||
|
||
public: | ||
virtual ~CharSetConverterImplBase() = default; | ||
|
||
/// Converts a string and resets the converter to the initial state. | ||
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) { | ||
auto EC = convertString(Source, Result); | ||
reset(); | ||
return EC; | ||
} | ||
}; | ||
} // namespace details | ||
|
||
// Names inspired by https://wg21.link/p1885. | ||
namespace text_encoding {
/// Identifies a character set encoding.
enum class id {
  /// The UTF-8 encoding.
  UTF8,

  /// The IBM EBCDIC 1047 encoding.
  IBM1047
};
} // end namespace text_encoding
|
||
/// Utility class to convert between different character set encodings. | ||
class CharSetConverter { | ||
std::unique_ptr<details::CharSetConverterImplBase> Converter; | ||
|
||
CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter) | ||
: Converter(std::move(Converter)) {} | ||
|
||
public: | ||
/// Creates a CharSetConverter instance. | ||
abhina-sree marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// If the requested conversion is not supported, the result holds
/// std::errc::invalid_argument.
/// \param[in] CSFrom the source character encoding
/// \param[in] CSTo the target character encoding
/// \return a CharSetConverter instance or an error code
static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
                                        text_encoding::id CSTo);
|
||
/// Creates a CharSetConverter instance from two encoding names.
/// If the requested conversion is not supported, the result holds
/// std::errc::invalid_argument.
/// \param[in] CPFrom name of the source character encoding
/// \param[in] CPTo name of the target character encoding
/// \return a CharSetConverter instance or an error code
static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);
|
||
CharSetConverter(const CharSetConverter &) = delete; | ||
CharSetConverter &operator=(const CharSetConverter &) = delete; | ||
|
||
CharSetConverter(CharSetConverter &&Other) | ||
: Converter(std::move(Other.Converter)) {} | ||
|
||
CharSetConverter &operator=(CharSetConverter &&Other) { | ||
if (this != &Other) | ||
Converter = std::move(Other.Converter); | ||
return *this; | ||
} | ||
|
||
~CharSetConverter() = default; | ||
|
||
/// Converts \p Source by forwarding to the owned converter implementation.
/// \param[in] Source source string
/// \param[out] Result container for converted string
/// \return error code in case something went wrong
std::error_code convert(StringRef Source,
                        SmallVectorImpl<char> &Result) const {
  details::CharSetConverterImplBase &Impl = *Converter;
  return Impl.convert(Source, Result);
}
|
||
abhina-sree marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// Converts \p Source and returns the converted text as a std::string.
/// On failure only the error code is returned; whatever was written to the
/// temporary buffer is discarded.
/// \param[in] Source source string
/// \return the converted string or an error code
ErrorOr<std::string> convert(StringRef Source) const {
  SmallString<100> Buffer;
  if (std::error_code EC = Converter->convert(Source, Buffer))
    return EC;
  return std::string(Buffer);
}
}; | ||
|
||
} // namespace llvm | ||
|
||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My opinion is that we do want the
iconv
path enabled by default. For me, the main rationale is to increase the chances that at least half of the code being added is tested "widely". There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the community preferred ICU over iconv, so it would make more sense to enable ICU and disable iconv, or to disable both by default.