Skip to content

[SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 #98652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions clang/include/clang/Basic/FileEntry.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ class FileEntryRef {
// Returns a const reference to the FileEntry obtained by dereferencing the
// FileEntry * stored in V of the base map entry's value.
const FileEntry &getFileEntry() const {
return *getBaseMapEntry().second->V.get<FileEntry *>();
}

// This function is used if the buffer size needs to be increased
// due to potential z/OS EBCDIC -> UTF-8 conversion
inline void updateFileEntryBufferSize(unsigned BufferSize);

// Returns the Dir member stored in this reference's map entry value.
DirectoryEntryRef getDir() const { return ME->second->Dir; }

inline off_t getSize() const;
Expand Down Expand Up @@ -323,6 +328,8 @@ class FileEntry {

// Trivial accessors: each getter below returns the like-named member as-is.
StringRef tryGetRealPathName() const { return RealPathName; }
// Returns the currently recorded Size (see setSize() below, which can
// overwrite it).
off_t getSize() const { return Size; }
// Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
// Overwrites the recorded Size with NewSize; no validation is performed here.
void setSize(off_t NewSize) { Size = NewSize; }
unsigned getUID() const { return UID; }
const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; }
time_t getModificationTime() const { return ModTime; }
Expand Down Expand Up @@ -353,6 +360,10 @@ bool FileEntryRef::isNamedPipe() const { return getFileEntry().isNamedPipe(); }

// Forwards to closeFile() on the FileEntry returned by getFileEntry().
void FileEntryRef::closeFile() const { getFileEntry().closeFile(); }

void FileEntryRef::updateFileEntryBufferSize(unsigned BufferSize) {
getBaseMapEntry().second->V.get<FileEntry *>()->setSize(BufferSize);
}

} // end namespace clang

#endif // LLVM_CLANG_BASIC_FILEENTRY_H
32 changes: 31 additions & 1 deletion clang/lib/Basic/SourceManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/AutoConvert.h"
#include "llvm/Support/Capacity.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
Expand Down Expand Up @@ -156,8 +157,11 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
// Unless this is a named pipe (in which case we can handle a mismatch),
// check that the file's size is the same as in the file entry (which may
// have come from a stat cache).
// The buffer will always be larger than the file size on z/OS in the presence
// of characters outside the base character set.
assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
if (!ContentsEntry->isNamedPipe() &&
Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();

return std::nullopt;
Expand Down Expand Up @@ -583,6 +587,18 @@ SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
FileCharacter);
}

/// Helper function to determine if an input file requires conversion.
///
/// On z/OS (__MVS__) this asks llvm::needzOSConversion() about \p Filename.
/// On every other platform it always returns false.
///
/// \param Filename name of the input to query.
/// \returns true only when the query succeeded and reported that conversion
/// is needed. A failed query (for example, \p Filename does not name a file
/// that can be examined) is treated as "no conversion" instead of being
/// asserted on: an assert alone would leave release builds dereferencing an
/// ErrorOr that holds an error, and would abort debug builds for such inputs.
bool needConversion(StringRef Filename) {
#ifdef __MVS__
  llvm::ErrorOr<bool> NeedConversion =
      llvm::needzOSConversion(Filename.str().c_str());
  // Only dereference the result when it actually holds a value.
  return NeedConversion && *NeedConversion;
#else
  return false;
#endif
}

/// createFileID - Create a new FileID for the specified ContentCache and
/// include position. This works regardless of whether the ContentCache
/// corresponds to a file or some other input source.
Expand All @@ -602,6 +618,20 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
return FileID::get(LoadedID);
}
unsigned FileSize = File.getSize();
bool NeedConversion = needConversion(Filename);
if (NeedConversion) {
// Buffer size may increase due to potential z/OS EBCDIC to UTF-8
// conversion.
if (std::optional<llvm::MemoryBufferRef> Buffer =
File.getBufferOrNone(Diag, getFileManager())) {
unsigned BufSize = Buffer->getBufferSize();
if (BufSize > FileSize) {
if (File.ContentsEntry.has_value())
File.ContentsEntry->updateFileEntryBufferSize(BufSize);
FileSize = BufSize;
}
}
}
if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
Diag.Report(IncludePos, diag::err_sloc_space_too_large);
Expand Down
11 changes: 9 additions & 2 deletions llvm/include/llvm/Support/AutoConvert.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#ifdef __MVS__
#include <_Ccsid.h>
#ifdef __cplusplus
#include "llvm/Support/ErrorOr.h"
#include <system_error>
#endif /* __cplusplus */

Expand Down Expand Up @@ -54,8 +55,14 @@ std::error_code restorezOSStdHandleAutoConversion(int FD);
/** \brief Set the tag information for a file descriptor. */
std::error_code setzOSFileTag(int FD, int CCSID, bool Text);

} /* namespace llvm */
#endif /* __cplusplus */
// Get the tag ccsid for a file name or a file descriptor.
ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);

// Query the file tag to determine if it needs conversion to UTF-8 codepage.
ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);

} // namespace llvm
#endif // __cplusplus

#endif /* __MVS__ */

Expand Down
40 changes: 39 additions & 1 deletion llvm/lib/Support/AutoConvert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <sys/stat.h>
#include <unistd.h>

using namespace llvm;

static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1};

int disablezOSAutoConversion(int FD) {
Expand Down Expand Up @@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
return std::error_code();
}

#endif // __MVS__
// Returns the ccsid tag for a file, or the errno-based error code of the
// failing system call.
//
// FileName: path handed to stat(); only consulted when FD is -1.
// FD: file descriptor to query; -1 means "no descriptor, use FileName".
ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
  // Without a descriptor, the tag has to come from stat() on the file path.
  if (FD == -1) {
    struct stat PathInfo;
    if (stat(FileName, &PathInfo) == -1)
      return std::error_code(errno, std::generic_category());
    return PathInfo.st_tag.ft_ccsid;
  }

  // With a descriptor, issue a QUERYCVT request and report the fccsid field
  // that the call fills in.
  struct f_cnvrt CvtQuery = {
      QUERYCVT, // cvtcmd
      0,        // pccsid
      0,        // fccsid
  };
  if (fcntl(FD, F_CONTROL_CVT, &CvtQuery) == -1)
    return std::error_code(errno, std::generic_category());
  return CvtQuery.fccsid;
}

// Reports whether a file's tag means its contents need conversion to the
// UTF-8 codepage.
//
// FileName / FD are forwarded unchanged to getzOSFileTag(). Any error from
// that lookup is returned to the caller as-is.
ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) {
  ErrorOr<__ccsid_t> Tag = getzOSFileTag(FileName, FD);
  if (std::error_code Err = Tag.getError())
    return Err;

  // We don't need conversion for UTF-8 tagged files or binary files.
  // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve
  // problems related to UTF-8 tagged source files.
  const __ccsid_t Ccsid = *Tag;
  const bool ConversionFree = Ccsid == CCSID_UTF_8 ||
                              Ccsid == CCSID_ISO8859_1 || Ccsid == FT_BINARY;
  return !ConversionFree;
}

#endif //__MVS__
16 changes: 14 additions & 2 deletions llvm/lib/Support/MemoryBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,11 @@ static bool shouldUseMmap(sys::fs::file_t FD,
bool RequiresNullTerminator,
int PageSize,
bool IsVolatile) {
#if defined(__MVS__)
// z/OS Enhanced ASCII auto-conversion does not support mmap.
return false;
#endif

// mmap may leave the buffer without null terminator if the file size changed
// by the time the last page is mapped in, so avoid it if the file size is
// likely to change.
Expand Down Expand Up @@ -503,9 +508,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
}

#ifdef __MVS__
// Set codepage auto-conversion for z/OS.
if (auto EC = llvm::enablezOSAutoConversion(FD))
ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
if (std::error_code EC = NeedConversion.getError())
return EC;
// File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
// cannot trust the file size and we create the memory buffer by copying
// off the stream.
// Note: This only works with the assumption of reading a full file (i.e.,
// Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
if (Offset == 0 && MapSize == FileSize && *NeedConversion)
return getMemoryBufferForStream(FD, Filename);
#endif

auto Buf =
Expand Down
Loading