Skip to content

Create a CharSetConverter class with both iconv and icu support #74516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
faf0bcd
Create a CharSetConverter class with both iconv and icu support.
abhina-sree Dec 5, 2023
87991d3
address review comments
abhina-sree Jan 9, 2024
226756d
add LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV option
abhina-sree Jan 10, 2024
76cc37a
remove single char conversion function
abhina-sree Jan 31, 2024
53e185a
handle FORCE_ON, look for shared libraries only for ICU
abhina-sree Feb 23, 2024
f032bc9
only allow builtin iconv support
abhina-sree Apr 4, 2024
241a597
Update llvm/cmake/config-ix.cmake
abhina-sree Apr 17, 2024
c08945e
address comments
abhina-sree Apr 19, 2024
a294096
remove function to get shift back characters, address comments
abhina-sree Apr 22, 2024
9a55df0
remove other flush function as well
abhina-sree Apr 23, 2024
fa06563
update comments
abhina-sree Apr 24, 2024
5eb3d5c
reset iconv if failed, cause overflow in testcase
abhina-sree Apr 29, 2024
f19d93d
remove AutoFlush, remove stray comment
abhina-sree Apr 29, 2024
4ce5ee2
formatting nits
abhina-sree May 1, 2024
a3e9f45
Refactor ICU code
abhina-sree May 24, 2024
5386a17
refactor iconv
abhina-sree May 29, 2024
2e04f3d
resize output if error
abhina-sree May 31, 2024
d721e8b
address some comments
abhina-sree Jan 6, 2025
f6e8d52
add callback function to properly report errors
abhina-sree Jan 13, 2025
f4e3ec2
make non-const functions private
abhina-sree Jan 29, 2025
eeaf034
switch default to OFF, add error if both ICU/ICONV are turned on, onl…
abhina-sree Apr 3, 2025
cfd8e5d
add error checking for text_encoding function, address other comments
abhina-sree Apr 4, 2025
3ee9f4f
address comments, fix ICU error callback
abhina-sree Apr 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,10 @@ else()
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
endif()

set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for character conversion support if available. Can be ON, OFF, or FORCE_ON")

set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")
set(LLVM_ENABLE_ICONV "ON" CACHE STRING "Use iconv for character conversion support if available. Can be ON, OFF, or FORCE_ON")

My opinion is that we do want the iconv path enabled by default. For me, the main rationale is to increase the chances that at least half of the code being added is tested "widely".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the community preferred ICU over iconv, so it would make more sense to enable ICU and disable iconv, or disable both as the default


set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")

set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
Expand Down
35 changes: 35 additions & 0 deletions llvm/cmake/config-ix.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
endif()

if (LLVM_ENABLE_ICU STREQUAL FORCE_ON AND LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
endif()

# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
find_package(ICU REQUIRED COMPONENTS uc i18n)
if (NOT ICU_FOUND)
message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
endif()
else()
find_package(ICU COMPONENTS uc i18n)
endif()
set(HAVE_ICU ${ICU_FOUND})
set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
endif()

# Check for builtin iconv to avoid licensing issues.
if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
if (LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
find_package(Iconv REQUIRED)
if (NOT Iconv_FOUND OR NOT Iconv_IS_BUILT_IN)
message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
endif()
else()
find_package(Iconv)
endif()
if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
set(HAVE_ICONV 1)
endif()
endif()

# function checks
check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
find_package(Backtrace)
Expand Down
6 changes: 6 additions & 0 deletions llvm/include/llvm/Config/config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,12 @@
/* Have host's ___chkstk_ms */
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}

/* Define if ICU library is available */
#cmakedefine HAVE_ICU ${HAVE_ICU}

/* Define if iconv library is available */
#cmakedefine HAVE_ICONV ${HAVE_ICONV}

/* Linker version detected at compile time. */
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"

Expand Down
141 changes: 141 additions & 0 deletions llvm/include/llvm/Support/CharSet.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
//===-- CharSet.h - Characters set conversion class ---------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file provides a utility class to convert between different character
/// set encodings.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_SUPPORT_CHARSET_H
#define LLVM_SUPPORT_CHARSET_H

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ErrorOr.h"

#include <string>
#include <system_error>

namespace llvm {

template <typename T> class SmallVectorImpl;

namespace details {
class CharSetConverterImplBase {

private:
/// Converts a string.
/// \param[in] Source source string
/// \param[out] Result container for converted string
/// \return error code in case something went wrong
///
/// The following error codes can occur, among others:
/// - std::errc::argument_list_too_long: The result requires more than
/// std::numeric_limits<size_t>::max() bytes.
/// - std::errc::illegal_byte_sequence: The input contains an invalid
/// multibyte sequence.
/// - std::errc::invalid_argument: The input contains an incomplete
/// multibyte sequence.
///
/// If the destination charset is a stateful character set, the shift state
/// will be set to the initial state.
///
/// In case of an error, the result string contains the successfully converted
/// part of the input string.
Comment on lines +50 to +51
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting that this is probably insufficiently tested. The functionality may be useful for printing at least the first part of static_assert messages where conversion to the encoding used for diagnostic messages fails. As a later improvement, a StringRef to the unconverted portion of the input buffer would be helpful.

///
virtual std::error_code convertString(StringRef Source,
SmallVectorImpl<char> &Result) = 0;

/// Resets the converter to the initial state.
virtual void reset() = 0;

public:
virtual ~CharSetConverterImplBase() = default;

/// Converts a string and resets the converter to the initial state.
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
auto EC = convertString(Source, Result);
reset();
return EC;
}
};
} // namespace details

// Names inspired by https://wg21.link/p1885.
namespace text_encoding {
enum class id {
/// UTF-8 character set encoding.
UTF8,

/// IBM EBCDIC 1047 character set encoding.
IBM1047
};
} // end namespace text_encoding

/// Utility class to convert between different character set encodings.
class CharSetConverter {
std::unique_ptr<details::CharSetConverterImplBase> Converter;

CharSetConverter(std::unique_ptr<details::CharSetConverterImplBase> Converter)
: Converter(std::move(Converter)) {}

public:
/// Creates a CharSetConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
/// \param[in] CSFrom the source character encoding
/// \param[in] CSTo the target character encoding
/// \return a CharSetConverter instance or an error code
static ErrorOr<CharSetConverter> create(text_encoding::id CSFrom,
text_encoding::id CSTo);

/// Creates a CharSetConverter instance.
/// Returns std::errc::invalid_argument in case the requested conversion is
/// not supported.
/// \param[in] CPFrom name of the source character encoding
/// \param[in] CPTo name of the target character encoding
/// \return a CharSetConverter instance or an error code
static ErrorOr<CharSetConverter> create(StringRef CPFrom, StringRef CPTo);

CharSetConverter(const CharSetConverter &) = delete;
CharSetConverter &operator=(const CharSetConverter &) = delete;

CharSetConverter(CharSetConverter &&Other)
: Converter(std::move(Other.Converter)) {}

CharSetConverter &operator=(CharSetConverter &&Other) {
if (this != &Other)
Converter = std::move(Other.Converter);
return *this;
}

~CharSetConverter() = default;

/// Converts a string.
/// \param[in] Source source string
/// \param[out] Result container for converted string
/// \return error code in case something went wrong
std::error_code convert(StringRef Source,
SmallVectorImpl<char> &Result) const {
return Converter->convert(Source, Result);
}

ErrorOr<std::string> convert(StringRef Source) const {
SmallString<100> Result;
auto EC = Converter->convert(Source, Result);
if (!EC)
return std::string(Result);
return EC;
}
};

} // namespace llvm

#endif
9 changes: 9 additions & 0 deletions llvm/lib/Support/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ add_llvm_component_library(LLVMSupport
CachePruning.cpp
Caching.cpp
circular_raw_ostream.cpp
CharSet.cpp
Chrono.cpp
COM.cpp
CodeGenCoverage.cpp
Expand Down Expand Up @@ -315,6 +316,14 @@ add_llvm_component_library(LLVMSupport
Demangle
)

# Link ICU library if it is an external library.
if(ICU_FOUND)
target_link_libraries(LLVMSupport
PRIVATE
${ICU_LIBRARIES}
)
endif()

set(llvm_system_libs ${system_libs})

# This block is only needed for llvm-config. When we deprecate llvm-config and
Expand Down
Loading
Loading