-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 #98652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SystemZ][z/OS] Update autoconversion functions to improve support for UTF-8 #98652
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-support Author: Abhina Sree (abhina-sree) ChangesThis fixes the following error when reading source and header files: error: source file is not valid UTF-8 Full diff: https://github.com/llvm/llvm-project/pull/98652.diff 6 Files Affected:
diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h
index 68d4bf6093003..56684cc8828d9 100644
--- a/clang/include/clang/Basic/FileEntry.h
+++ b/clang/include/clang/Basic/FileEntry.h
@@ -70,6 +70,11 @@ class FileEntryRef {
const FileEntry &getFileEntry() const {
return *getBaseMapEntry().second->V.get<FileEntry *>();
}
+#ifdef __MVS__
+ FileEntry &getFileEntry() {
+ return *getBaseMapEntry().second->V.get<FileEntry *>();
+ }
+#endif
DirectoryEntryRef getDir() const { return ME->second->Dir; }
inline off_t getSize() const;
@@ -323,6 +328,10 @@ class FileEntry {
StringRef tryGetRealPathName() const { return RealPathName; }
off_t getSize() const { return Size; }
+#ifdef __MVS__
+ // Size may increase due to potential z/OS EBCDIC -> UTF-8 conversion.
+ void setSize(off_t NewSize) { Size = NewSize; }
+#endif
unsigned getUID() const { return UID; }
const llvm::sys::fs::UniqueID &getUniqueID() const { return UniqueID; }
time_t getModificationTime() const { return ModTime; }
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 533a9fe88a215..1b6197e4fa054 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/AutoConvert.h"
#include "llvm/Support/Capacity.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
@@ -166,8 +167,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM,
// Unless this is a named pipe (in which case we can handle a mismatch),
// check that the file's size is the same as in the file entry (which may
// have come from a stat cache).
+#ifndef __MVS__
if (!ContentsEntry->isNamedPipe() &&
Buffer->getBufferSize() != (size_t)ContentsEntry->getSize()) {
+#else
+ // The buffer will always be larger than the file size on z/OS in the presence
+ // of characters outside the base character set.
+ if (!ContentsEntry->isNamedPipe() &&
+ Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
+#endif
if (Diag.isDiagnosticInFlight())
Diag.SetDelayedDiagnostic(diag::err_file_modified,
ContentsEntry->getName());
@@ -616,6 +624,23 @@ FileID SourceManager::createFileIDImpl(ContentCache &File, StringRef Filename,
return FileID::get(LoadedID);
}
unsigned FileSize = File.getSize();
+#ifdef __MVS__
+ llvm::ErrorOr<bool> NeedConversion =
+ llvm::needConversion(Filename.str().c_str());
+ if (NeedConversion && *NeedConversion) {
+ // Buffer size may increase due to potential z/OS EBCDIC to UTF-8
+ // conversion.
+ if (std::optional<llvm::MemoryBufferRef> Buffer =
+ File.getBufferOrNone(Diag, getFileManager())) {
+ unsigned BufSize = Buffer->getBufferSize();
+ if (BufSize > FileSize) {
+ if (File.ContentsEntry.has_value())
+ File.ContentsEntry->getFileEntry().setSize(BufSize);
+ FileSize = BufSize;
+ }
+ }
+ }
+#endif
if (!(NextLocalOffset + FileSize + 1 > NextLocalOffset &&
NextLocalOffset + FileSize + 1 <= CurrentLoadedOffset)) {
Diag.Report(IncludePos, diag::err_sloc_space_too_large);
diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index f4fe80b22a876..784d8f2d3bf75 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -17,6 +17,7 @@
#ifdef __MVS__
#include <_Ccsid.h>
#ifdef __cplusplus
+#include "llvm/Support/ErrorOr.h"
#include <system_error>
#endif // __cplusplus
@@ -52,6 +53,12 @@ std::error_code restoreStdHandleAutoConversion(int FD);
/// \brief Set the tag information for a file descriptor.
std::error_code setFileTag(int FD, int CCSID, bool Text);
+// Get the the tag ccsid for a file name or a file descriptor.
+ErrorOr<__ccsid_t> getFileTag(const char *FileName, const int FD = -1);
+
+// Query the file tag to determine if it needs conversion to UTF-8 codepage.
+ErrorOr<bool> needConversion(const char *FileName, const int FD = -1);
+
} // namespace llvm
#endif // __cplusplus
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index c509284ee916e..0792265733a43 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -20,6 +20,8 @@
#include <sys/stat.h>
#include <unistd.h>
+using namespace llvm;
+
static int savedStdHandleAutoConversionMode[3] = {-1, -1, -1};
int disableAutoConversion(int FD) {
@@ -116,4 +118,40 @@ std::error_code llvm::setFileTag(int FD, int CCSID, bool Text) {
return std::error_code();
}
-#endif // __MVS__
+ErrorOr<__ccsid_t> llvm::getFileTag(const char *FileName, const int FD) {
+ // If we have a file descriptor, use it to find out file tagging. Otherwise we
+ // need to use stat() with the file path.
+ if (FD != -1) {
+ struct f_cnvrt Query = {
+ QUERYCVT, // cvtcmd
+ 0, // pccsid
+ 0, // fccsid
+ };
+ if (fcntl(FD, F_CONTROL_CVT, &Query) == -1)
+ return std::error_code(errno, std::generic_category());
+ return Query.fccsid;
+ }
+ struct stat Attr;
+ if (stat(FileName, &Attr) == -1)
+ return std::error_code(errno, std::generic_category());
+ return Attr.st_tag.ft_ccsid;
+}
+
+ErrorOr<bool> llvm::needConversion(const char *FileName, const int FD) {
+ ErrorOr<__ccsid_t> Ccsid = getFileTag(FileName, FD);
+ if (std::error_code EC = Ccsid.getError())
+ return EC;
+ // We don't need conversion for UTF-8 tagged files or binary files.
+ // TODO: Remove the assumption of ISO8859-1 = UTF-8 here when we fully resolve
+ // problems related to UTF-8 tagged source files.
+ switch (*Ccsid) {
+ case CCSID_UTF_8:
+ case CCSID_ISO8859_1:
+ case FT_BINARY:
+ return false;
+ default:
+ return true;
+ }
+}
+
+#endif //__MVS__
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index fb7e804fd7e84..be351129879ea 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -362,6 +362,11 @@ static bool shouldUseMmap(sys::fs::file_t FD,
bool RequiresNullTerminator,
int PageSize,
bool IsVolatile) {
+#if defined(__MVS__)
+ // zOS Enhanced ASCII auto convert does not support mmap.
+ return false;
+#endif
+
// mmap may leave the buffer without null terminator if the file size changed
// by the time the last page is mapped in, so avoid it if the file size is
// likely to change.
@@ -504,9 +509,16 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
}
#ifdef __MVS__
- // Set codepage auto-conversion for z/OS.
- if (auto EC = llvm::enableAutoConversion(FD))
+ ErrorOr<bool> NeedConversion = needConversion(Filename.str().c_str(), FD);
+ if (std::error_code EC = NeedConversion.getError())
return EC;
+ // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
+ // cannot trust the file size and we create the memory buffer by copying
+ // off the stream.
+ // Note: This only works with the assumption of reading a full file (i.e,
+ // Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
+ if (Offset == 0 && MapSize == FileSize && *NeedConversion)
+ return getMemoryBufferForStream(FD, Filename);
#endif
auto Buf =
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index ce2bf2b4193a5..fde63ddc7e20a 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -326,7 +326,7 @@ ErrorOr<std::unique_ptr<File>>
RealFileSystem::openFileForRead(const Twine &Name) {
SmallString<256> RealName, Storage;
Expected<file_t> FDOrErr = sys::fs::openNativeFileForRead(
- adjustPath(Name, Storage), sys::fs::OF_None, &RealName);
+ adjustPath(Name, Storage), sys::fs::OF_Text, &RealName);
if (!FDOrErr)
return errorToErrorCode(FDOrErr.takeError());
return std::unique_ptr<File>(
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
539af08
to
716a8c9
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for this!
I have some concerns over le lack of tests, although I understand this would be difficult.
More importantly, I have concerns about the impact on reading non-source files, such as #embed files. Did you test that?
5d5f3d6
to
358fd3f
Compare
97862dd
to
bb32463
Compare
bb32463
to
d7798e2
Compare
d7798e2
to
74c4e2a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
55c3c72
to
4c52061
Compare
This fixes the following error when reading source and header files on z/OS: error: source file is not valid UTF-8