Skip to content

Commit 4583cac

Browse files
committed
update autoconversion functionality to fix error: source file is not valid UTF-8
1 parent 1091fad commit 4583cac

File tree

5 files changed

+96
-6
lines changed

5 files changed

+96
-6
lines changed

clang/include/clang/Basic/FileEntry.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@ class FileEntryRef {
7070
const FileEntry &getFileEntry() const {
7171
return *getBaseMapEntry().second->V.get<FileEntry *>();
7272
}
73+
#ifdef __MVS__
74+
FileEntry &getFileEntry() {
75+
return *getBaseMapEntry().second->V.get<FileEntry *>();
76+
}
77+
#endif
7378
DirectoryEntryRef getDir() const { return ME->second->Dir; }
7479

7580
inline off_t getSize() const;
@@ -323,6 +328,10 @@ class FileEntry {
323328

324329
StringRef tryGetRealPathName() const { return RealPathName; }
325330
off_t getSize() const { return Size; }
331+
#ifdef __MVS__
332+
// Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
333+
void setSize(off_t NewSize) { Size = NewSize; }
334+
#endif
326335
unsigned getUID() const { return UID; }
327336
const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; }
328337
time_t getModificationTime() const { return ModTime; }

clang/lib/Basic/SourceManager.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "llvm/ADT/StringRef.h"
2525
#include "llvm/ADT/StringSwitch.h"
2626
#include "llvm/Support/Allocator.h"
27+
#include "llvm/Support/AutoConvert.h"
2728
#include "llvm/Support/Capacity.h"
2829
#include "llvm/Support/Compiler.h"
2930
#include "llvm/Support/Endian.h"
@@ -156,10 +157,16 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
156157
// Unless this is a named pipe (in which case we can handle a mismatch),
157158
// check that the file's size is the same as in the file entry (which may
158159
// have come from a stat cache).
160+
#ifndef __MVS__
159161
if (!ContentsEntry->isNamedPipe() &&
160162
Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
163+
#else
164+
// The buffer will always be larger than the file size on z/OS in the presence
165+
// of characters outside the base character set.
166+
if (!ContentsEntry->isNamedPipe() &&
167+
Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
168+
#endif
161169
Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
162-
163170
return std::nullopt;
164171
}
165172

@@ -602,6 +609,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
602609
return FileID::get(LoadedID);
603610
}
604611
unsigned FileSize = File.getSize();
612+
#ifdef __MVS__
613+
llvm::ErrorOr<bool> NeedConversion =
614+
llvm::needzOSConversion(Filename.str().c_str());
615+
if (NeedConversion && *NeedConversion) {
616+
// Buffer size may increase due to potential z/OS EBCDIC to UTF-8
617+
// conversion.
618+
if (std::optional<llvm::MemoryBufferRef> Buffer =
619+
File.getBufferOrNone(Diag, getFileManager())) {
620+
unsigned BufSize = Buffer->getBufferSize();
621+
if (BufSize > FileSize) {
622+
if (File.ContentsEntry.has_value())
623+
File.ContentsEntry->getFileEntry().setSize(BufSize);
624+
FileSize = BufSize;
625+
}
626+
}
627+
}
628+
#endif
605629
if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
606630
NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
607631
Diag.Report(IncludePos, diag::err_sloc_space_too_large);

llvm/include/llvm/Support/AutoConvert.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#ifdef __MVS__
1818
#include <_Ccsid.h>
1919
#ifdef __cplusplus
20+
#include "llvm/Support/ErrorOr.h"
2021
#include <system_error>
2122
#endif /* __cplusplus */
2223

@@ -54,8 +55,14 @@ std::error_code restorezOSStdHandleAutoConversion(int FD);
5455
/** \brief Set the tag information for a file descriptor. */
5556
std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
5657

57-
} /* namespace llvm */
58-
#endif /* __cplusplus */
58+
// Get the tag ccsid for a file name or a file descriptor.
59+
ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
60+
61+
// Query the file tag to determine if it needs conversion to UTF-8 codepage.
62+
ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
63+
64+
} // namespace llvm
65+
#endif // __cplusplus
5966

6067
#endif /* __MVS__ */
6168

llvm/lib/Support/AutoConvert.cpp

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include <sys/stat.h>
2121
#include <unistd.h>
2222

23+
using namespace llvm;
24+
2325
static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1};
2426

2527
int disablezOSAutoConversion(int FD) {
@@ -116,4 +118,40 @@ std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
116118
return std::error_code();
117119
}
118120

119-
#endif // __MVS__
121+
ErrorOr<__ccsid_t> llvm::getzOSFileTag(const char *FileName, const int FD) {
122+
// If we have a file descriptor, use it to find out file tagging. Otherwise we
123+
// need to use stat() with the file path.
124+
if (FD != -1) {
125+
struct f_cnvrt Query = {
126+
QUERYCVT, // cvtcmd
127+
0, // pccsid
128+
0, // fccsid
129+
};
130+
if (fcntl(FD, F_CONTROL_CVT, &Query) == -1)
131+
return std::error_code(errno, std::generic_category());
132+
return Query.fccsid;
133+
}
134+
struct stat Attr;
135+
if (stat(FileName, &Attr) == -1)
136+
return std::error_code(errno, std::generic_category());
137+
return Attr.st_tag.ft_ccsid;
138+
}
139+
140+
ErrorOr<bool> llvm::needzOSConversion(const char *FileName, const int FD) {
141+
ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD);
142+
if (std::error_code EC = Ccsid.getError())
143+
return EC;
144+
// We don't need conversion for UTF-8 tagged files or binary files.
145+
// TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve
146+
// problems related to UTF-8 tagged source files.
147+
switch (*Ccsid) {
148+
case CCSID_UTF_8:
149+
case CCSID_ISO8859_1:
150+
case FT_BINARY:
151+
return false;
152+
default:
153+
return true;
154+
}
155+
}
156+
157+
#endif //__MVS__

llvm/lib/Support/MemoryBuffer.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,11 @@ static bool shouldUseMmap(sys::fs::file_t FD,
361361
bool RequiresNullTerminator,
362362
int PageSize,
363363
bool IsVolatile) {
364+
#if defined(__MVS__)
365+
// z/OS Enhanced ASCII auto convert does not support mmap.
366+
return false;
367+
#endif
368+
364369
// mmap may leave the buffer without null terminator if the file size changed
365370
// by the time the last page is mapped in, so avoid it if the file size is
366371
// likely to change.
@@ -503,9 +508,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
503508
}
504509

505510
#ifdef __MVS__
506-
// Set codepage auto-conversion for z/OS.
507-
if (auto EC = llvm::enablezOSAutoConversion(FD))
511+
ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
512+
if (std::error_code EC = NeedConversion.getError())
508513
return EC;
514+
// File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
515+
// cannot trust the file size and we create the memory buffer by copying
516+
// off the stream.
517+
// Note: This only works with the assumption of reading a full file (i.e.,
518+
// Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
519+
if (Offset == 0 && MapSize == FileSize && *NeedConversion)
520+
return getMemoryBufferForStream(FD, Filename);
509521
#endif
510522

511523
auto Buf =

0 commit comments

Comments
 (0)