Skip to content

Commit 04379c9

Browse files
authored
[SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 (#98652)
This fixes the following error when reading source and header files on z/OS: error: source file is not valid UTF-8
1 parent 0100c63 commit 04379c9

File tree

5 files changed

+104
-6
lines changed

5 files changed

+104
-6
lines changed

clang/include/clang/Basic/FileEntry.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ class FileEntryRef {
7070
/// Returns the FileEntry referenced by the base map entry of this reference.
const FileEntry &getFileEntry() const {
  // The base map entry's value is read through its FileEntry* alternative.
  const FileEntry *Entry = getBaseMapEntry().second->V.get<FileEntry *>();
  return *Entry;
}
73+
74+
// This function is used if the buffer size needs to be increased
75+
// due to potential z/OS EBCDIC -> UTF-8 conversion
76+
inline void updateFileEntryBufferSize(unsigned BufferSize);
77+
7378
/// Returns the Dir member of the map entry value this reference points at.
DirectoryEntryRef getDir() const {
  return ME->second->Dir;
}
7479

7580
inline off_t getSize() const;
@@ -323,6 +328,8 @@ class FileEntry {
323328

324329
/// Returns the stored RealPathName as-is.
StringRef tryGetRealPathName() const {
  return RealPathName;
}
325330
/// Returns the size currently recorded for this entry.
off_t getSize() const {
  return Size;
}
331+
// Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
332+
void setSize(off_t NewSize) { Size = NewSize; }
326333
/// Returns the UID stored in this entry.
unsigned getUID() const {
  return UID;
}
327334
/// Returns a reference to the UniqueID stored in this entry.
const llvm::sys::fs::UniqueID &getUniqueID() const {
  return UniqueID;
}
328335
/// Returns the modification time stored in this entry.
time_t getModificationTime() const {
  return ModTime;
}
@@ -353,6 +360,10 @@ bool FileEntryRef::isNamedPipe() const { return getFileEntry().isNamedPipe(); }
353360

354361
void FileEntryRef::closeFile() const { getFileEntry().closeFile(); }
355362

363+
void FileEntryRef::updateFileEntryBufferSize(unsigned BufferSize) {
364+
getBaseMapEntry().second->V.get<FileEntry *>()->setSize(BufferSize);
365+
}
366+
356367
} // end namespace clang
357368

358369
#endif // LLVM_CLANG_BASIC_FILEENTRY_H

clang/lib/Basic/SourceManager.cpp

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "llvm/ADT/StringRef.h"
2525
#include "llvm/ADT/StringSwitch.h"
2626
#include "llvm/Support/Allocator.h"
27+
#include "llvm/Support/AutoConvert.h"
2728
#include "llvm/Support/Capacity.h"
2829
#include "llvm/Support/Compiler.h"
2930
#include "llvm/Support/Endian.h"
@@ -156,8 +157,11 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
156157
// Unless this is a named pipe (in which case we can handle a mismatch),
157158
// check that the file's size is the same as in the file entry (which may
158159
// have come from a stat cache).
160+
// The buffer will always be larger than the file size on z/OS in the presence
161+
// of characters outside the base character set.
162+
assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
159163
if (!ContentsEntry->isNamedPipe() &&
160-
Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
164+
Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
161165
Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
162166

163167
return std::nullopt;
@@ -583,6 +587,18 @@ SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
583587
FileCharacter);
584588
}
585589

590+
/// Helper function to determine if an input file requires conversion
591+
bool needConversion(StringRef Filename) {
592+
#ifdef __MVS__
593+
llvm::ErrorOr<bool> NeedConversion =
594+
llvm::needzOSConversion(Filename.str().c_str());
595+
assert(NeedConversion && "Filename was not found");
596+
return *NeedConversion;
597+
#else
598+
return false;
599+
#endif
600+
}
601+
586602
/// createFileID - Create a new FileID for the specified ContentCache and
587603
/// include position. This works regardless of whether the ContentCache
588604
/// corresponds to a file or some other input source.
@@ -602,6 +618,20 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
602618
return FileID::get(LoadedID);
603619
}
604620
unsigned FileSize = File.getSize();
621+
bool NeedConversion = needConversion(Filename);
622+
if (NeedConversion) {
623+
// Buffer size may increase due to potential z/OS EBCDIC to UTF-8
624+
// conversion.
625+
if (std::optional<llvm::MemoryBufferRef> Buffer =
626+
File.getBufferOrNone(Diag, getFileManager())) {
627+
unsigned BufSize = Buffer->getBufferSize();
628+
if (BufSize > FileSize) {
629+
if (File.ContentsEntry.has_value())
630+
File.ContentsEntry->updateFileEntryBufferSize(BufSize);
631+
FileSize = BufSize;
632+
}
633+
}
634+
}
605635
if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
606636
NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
607637
Diag.Report(IncludePos, diag::err_sloc_space_too_large);

llvm/include/llvm/Support/AutoConvert.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#ifdef __MVS__
1818
#include <_Ccsid.h>
1919
#ifdef __cplusplus
20+
#include "llvm/Support/ErrorOr.h"
2021
#include <system_error>
2122
#endif /* __cplusplus */
2223

@@ -54,8 +55,14 @@ std::error_code restorezOSStdHandleAutoConversion(int FD);
5455
/** \brief Set the tag information for a file descriptor. */
5556
std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
5657

57-
} /* namespace llvm */
58-
#endif /* __cplusplus */
58+
// Get the tag ccsid for a file name or a file descriptor.
59+
ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
60+
61+
// Query the file tag to determine if it needs conversion to UTF-8 codepage.
62+
ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
63+
64+
} // namespace llvm
65+
#endif // __cplusplus
5966

6067
#endif /* __MVS__ */
6168

llvm/lib/Support/AutoConvert.cpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include <sys/stat.h>
2121
#include <unistd.h>
2222

23+
using namespace llvm;
24+
2325
static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1};
2426

2527
int disablezOSAutoConversion(int FD) {
@@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
116118
return std::error_code();
117119
}
118120

119-
#endif // __MVS__
121+
/// Get the tag ccsid for a file name or a file descriptor.
///
/// \param FileName path handed to stat(); only used when \p FD is -1.
/// \param FD file descriptor to query, or -1 to fall back to \p FileName.
/// \returns the ccsid read from the descriptor or from the stat() result, or
///          an error code built from errno if the underlying call fails.
ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
  // Without a descriptor the tag has to come from stat() on the file path.
  if (FD == -1) {
    struct stat PathInfo;
    if (stat(FileName, &PathInfo) == -1)
      return std::error_code(errno, std::generic_category());
    return PathInfo.st_tag.ft_ccsid;
  }

  // With a descriptor, issue a QUERYCVT request through fcntl() and report
  // the fccsid field it fills in.
  struct f_cnvrt Request = {
      QUERYCVT, // cvtcmd
      0,        // pccsid
      0,        // fccsid
  };
  const int Status = fcntl(FD, F_CONTROL_CVT, &Request);
  if (Status == -1)
    return std::error_code(errno, std::generic_category());
  return Request.fccsid;
}
139+
140+
/// Query the file tag to determine if it needs conversion to UTF-8 codepage.
///
/// \param FileName forwarded to getzOSFileTag().
/// \param FD forwarded to getzOSFileTag().
/// \returns the error from getzOSFileTag() if the tag lookup failed;
///          otherwise false for tags that are used as-is and true for
///          every other tag.
ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) {
  ErrorOr<__ccsid_t> Tag = getzOSFileTag(FileName, FD);
  if (!Tag)
    return Tag.getError();

  // We don't need conversion for UTF-8 tagged files or binary files.
  // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve
  // problems related to UTF-8 tagged source files.
  const __ccsid_t Ccsid = *Tag;
  const bool UsableAsIs = Ccsid == CCSID_UTF_8 || Ccsid == CCSID_ISO8859_1 ||
                          Ccsid == FT_BINARY;
  return !UsableAsIs;
}
156+
157+
#endif //__MVS__

llvm/lib/Support/MemoryBuffer.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,11 @@ static bool shouldUseMmap(sys::fs::file_t FD,
361361
bool RequiresNullTerminator,
362362
int PageSize,
363363
bool IsVolatile) {
364+
#if defined(__MVS__)
365+
// zOS Enhanced ASCII auto convert does not support mmap.
366+
return false;
367+
#endif
368+
364369
// mmap may leave the buffer without null terminator if the file size changed
365370
// by the time the last page is mapped in, so avoid it if the file size is
366371
// likely to change.
@@ -503,9 +508,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
503508
}
504509

505510
#ifdef __MVS__
506-
// Set codepage auto-conversion for z/OS.
507-
if (auto EC = llvm::enablezOSAutoConversion(FD))
511+
ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
512+
if (std::error_code EC = NeedConversion.getError())
508513
return EC;
514+
// File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
515+
// cannot trust the file size and we create the memory buffer by copying
516+
// off the stream.
517+
// Note: This only works with the assumption of reading a full file (i.e,
518+
// Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
519+
if (Offset == 0 && MapSize == FileSize && *NeedConversion)
520+
return getMemoryBufferForStream(FD, Filename);
509521
#endif
510522

511523
auto Buf =

0 commit comments

Comments
 (0)