Commit ddeab07

[clang-repl][CUDA] Re-land: Initial interactive CUDA support for clang-repl
CUDA support can be enabled in clang-repl with the --cuda flag. Device code linking is not yet supported, and inline must be used with all __device__ functions.

Differential Revision: https://reviews.llvm.org/D146389
1 parent fe01c08

22 files changed: +591 -27 lines
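
As a quick orientation before the per-file diffs, a minimal session with the new flag might look like the following. This is illustrative only: the kernel and variable names are invented, and the availability of runtime calls such as cudaMalloc depends on the local CUDA SDK; note the required inline on the __device__ function.

    $ clang-repl --cuda
    clang-repl> extern "C" int printf(const char *, ...);
    clang-repl> inline __device__ int twice(int x) { return 2 * x; }
    clang-repl> __global__ void kernel(int *out) { *out = twice(21); }
    clang-repl> int *dp = nullptr; cudaMalloc((void **)&dp, sizeof(int));
    clang-repl> kernel<<<1, 1>>>(dp);
    clang-repl> int res = 0; cudaMemcpy(&res, dp, sizeof(int), cudaMemcpyDeviceToHost);
    clang-repl> printf("result: %d\n", res);
    result: 42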

clang/include/clang/Interpreter/Interpreter.h

Lines changed: 32 additions & 0 deletions

@@ -42,8 +42,34 @@ class IncrementalParser;
 /// Create a pre-configured \c CompilerInstance for incremental processing.
 class IncrementalCompilerBuilder {
 public:
+  IncrementalCompilerBuilder() {}
+
+  void SetCompilerArgs(const std::vector<const char *> &Args) {
+    UserArgs = Args;
+  }
+
+  // General C++
+  llvm::Expected<std::unique_ptr<CompilerInstance>> CreateCpp();
+
+  // Offload options
+  void SetOffloadArch(llvm::StringRef Arch) { OffloadArch = Arch; };
+
+  // CUDA specific
+  void SetCudaSDK(llvm::StringRef path) { CudaSDKPath = path; };
+
+  llvm::Expected<std::unique_ptr<CompilerInstance>> CreateCudaHost();
+  llvm::Expected<std::unique_ptr<CompilerInstance>> CreateCudaDevice();
+
+private:
   static llvm::Expected<std::unique_ptr<CompilerInstance>>
   create(std::vector<const char *> &ClangArgv);
+
+  llvm::Expected<std::unique_ptr<CompilerInstance>> createCuda(bool device);
+
+  std::vector<const char *> UserArgs;
+
+  llvm::StringRef OffloadArch;
+  llvm::StringRef CudaSDKPath;
 };

 /// Provides top-level interfaces for incremental compilation and execution.
@@ -52,6 +78,9 @@ class Interpreter {
   std::unique_ptr<IncrementalParser> IncrParser;
   std::unique_ptr<IncrementalExecutor> IncrExecutor;

+  // An optional parser for CUDA offloading
+  std::unique_ptr<IncrementalParser> DeviceParser;
+
   Interpreter(std::unique_ptr<CompilerInstance> CI, llvm::Error &Err);

   llvm::Error CreateExecutor();
@@ -66,6 +95,9 @@ class Interpreter {
   ~Interpreter();
   static llvm::Expected<std::unique_ptr<Interpreter>>
   create(std::unique_ptr<CompilerInstance> CI);
+  static llvm::Expected<std::unique_ptr<Interpreter>>
+  createWithCUDA(std::unique_ptr<CompilerInstance> CI,
+                 std::unique_ptr<CompilerInstance> DCI);
   const ASTContext &getASTContext() const;
   ASTContext &getASTContext();
   const CompilerInstance *getCompilerInstance() const;
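
Taken together, the new declarations sketch the embedding API. The following is a sketch based only on the signatures above; the architecture string, SDK path, function name, and error handling are illustrative, not part of the commit.

    #include "clang/Interpreter/Interpreter.h"

    llvm::Expected<std::unique_ptr<clang::Interpreter>> makeCudaInterpreter() {
      clang::IncrementalCompilerBuilder CB;
      CB.SetOffloadArch("sm_70");       // assumption: a Volta-class GPU
      CB.SetCudaSDK("/usr/local/cuda"); // assumption: default SDK location

      // Create both instances; Interpreter::createWithCUDA wires them together.
      auto DeviceCI = CB.CreateCudaDevice();
      if (!DeviceCI)
        return DeviceCI.takeError();
      auto HostCI = CB.CreateCudaHost();
      if (!HostCI)
        return HostCI.takeError();

      return clang::Interpreter::createWithCUDA(std::move(*HostCI),
                                                std::move(*DeviceCI));
    }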

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 4 additions & 2 deletions

@@ -24,6 +24,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/VirtualFileSystem.h"

 using namespace clang;
 using namespace CodeGen;
@@ -721,8 +722,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
   // handle so CUDA runtime can figure out what to call on the GPU side.
   std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
   if (!CudaGpuBinaryFileName.empty()) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
-        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
+    auto VFS = CGM.getFileSystem();
+    auto CudaGpuBinaryOrErr =
+        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
       CGM.getDiags().Report(diag::err_cannot_open_file)
           << CudaGpuBinaryFileName << EC.message();
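
The motivation for reading the GPU binary through the virtual file system instead of MemoryBuffer::getFileOrSTDIN is that the interpreter can then hand codegen a fatbin that exists only in memory. A standalone sketch of the mechanism, with placeholder file name and contents:

    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/VirtualFileSystem.h"

    void demo() {
      auto FS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
      // Publish an in-memory "file", as the device parser does for each fatbin.
      FS->addFile("/incr_module_0.fatbin", /*ModificationTime=*/0,
                  llvm::MemoryBuffer::getMemBuffer("<fatbin bytes>"));
      // The rewritten path above can now open it without any on-disk file; the
      // arguments mirror the call in makeModuleCtorFunction: file-size hint -1,
      // RequiresNullTerminator = false.
      auto BufOrErr = FS->getBufferForFile("/incr_module_0.fatbin", -1, false);
      (void)BufOrErr;
    }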

clang/lib/CodeGen/CodeGenAction.cpp

Lines changed: 2 additions & 0 deletions

@@ -264,6 +264,7 @@ namespace clang {
   // Links each entry in LinkModules into our module. Returns true on error.
   bool LinkInModules() {
     for (auto &LM : LinkModules) {
+      assert(LM.Module && "LinkModule does not actually have a module");
       if (LM.PropagateAttrs)
         for (Function &F : *LM.Module) {
           // Skip intrinsics. Keep consistent with how intrinsics are created
@@ -293,6 +294,7 @@ namespace clang {
       if (Err)
         return true;
     }
+    LinkModules.clear();
     return false; // success
   }

With incremental compilation, LinkInModules can now run once per REPL input on the same consumer; clearing LinkModules after a successful pass keeps later calls from re-linking modules that were already consumed.

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 4 additions & 0 deletions

@@ -6272,6 +6272,10 @@ void CodeGenModule::EmitLinkageSpec(const LinkageSpecDecl *LSD) {
 }

 void CodeGenModule::EmitTopLevelStmt(const TopLevelStmtDecl *D) {
+  // Device code should not be at top level.
+  if (LangOpts.CUDA && LangOpts.CUDAIsDevice)
+    return;
+
   std::unique_ptr<CodeGenFunction> &CurCGF =
       GlobalTopLevelStmtBlockInFlight.first;

clang/lib/CodeGen/ModuleBuilder.cpp

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ namespace {
   IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS; // Only used for debug info.
   const HeaderSearchOptions &HeaderSearchOpts; // Only used for debug info.
   const PreprocessorOptions &PreprocessorOpts; // Only used for debug info.
-  const CodeGenOptions CodeGenOpts; // Intentionally copied in.
+  const CodeGenOptions &CodeGenOpts;

   unsigned HandlingTopLevelDecls;

The options were previously copied at construction time; the device parser now rewrites CodeGenOptions::CudaGpuBinaryFileName after every device PTU (see DeviceOffload.cpp below), so the code generator must hold a reference in order to observe those updates.

clang/lib/Interpreter/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,7 @@
 set(LLVM_LINK_COMPONENTS
   core
   native
+  MC
   Option
   OrcJit
   OrcShared
@@ -11,6 +12,7 @@ set(LLVM_LINK_COMPONENTS
   )

 add_clang_library(clangInterpreter
+  DeviceOffload.cpp
   IncrementalExecutor.cpp
   IncrementalParser.cpp
   Interpreter.cpp

clang/lib/Interpreter/DeviceOffload.cpp

Lines changed: 176 additions & 0 deletions

@@ -0,0 +1,176 @@
+//===---------- DeviceOffload.cpp - Device Offloading------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements offloading to CUDA devices.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DeviceOffload.h"
+
+#include "clang/Basic/TargetOptions.h"
+#include "clang/CodeGen/ModuleBuilder.h"
+#include "clang/Frontend/CompilerInstance.h"
+
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace clang {
+
+IncrementalCUDADeviceParser::IncrementalCUDADeviceParser(
+    Interpreter &Interp, std::unique_ptr<CompilerInstance> Instance,
+    IncrementalParser &HostParser, llvm::LLVMContext &LLVMCtx,
+    llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS,
+    llvm::Error &Err)
+    : IncrementalParser(Interp, std::move(Instance), LLVMCtx, Err),
+      HostParser(HostParser), VFS(FS) {
+  if (Err)
+    return;
+  StringRef Arch = CI->getTargetOpts().CPU;
+  if (!Arch.starts_with("sm_") || Arch.substr(3).getAsInteger(10, SMVersion)) {
+    Err = llvm::joinErrors(std::move(Err),
+                           llvm::make_error<llvm::StringError>(
+                               "Invalid CUDA architecture",
+                               llvm::inconvertibleErrorCode()));
+    return;
+  }
+}
+
+llvm::Expected<PartialTranslationUnit &>
+IncrementalCUDADeviceParser::Parse(llvm::StringRef Input) {
+  auto PTU = IncrementalParser::Parse(Input);
+  if (!PTU)
+    return PTU.takeError();
+
+  auto PTX = GeneratePTX();
+  if (!PTX)
+    return PTX.takeError();
+
+  auto Err = GenerateFatbinary();
+  if (Err)
+    return std::move(Err);
+
+  std::string FatbinFileName =
+      "/incr_module_" + std::to_string(PTUs.size()) + ".fatbin";
+  VFS->addFile(FatbinFileName, 0,
+               llvm::MemoryBuffer::getMemBuffer(
+                   llvm::StringRef(FatbinContent.data(), FatbinContent.size()),
+                   "", false));
+
+  HostParser.getCI()->getCodeGenOpts().CudaGpuBinaryFileName = FatbinFileName;
+
+  FatbinContent.clear();
+
+  return PTU;
+}
+
+llvm::Expected<llvm::StringRef> IncrementalCUDADeviceParser::GeneratePTX() {
+  auto &PTU = PTUs.back();
+  std::string Error;
+
+  const llvm::Target *Target = llvm::TargetRegistry::lookupTarget(
+      PTU.TheModule->getTargetTriple(), Error);
+  if (!Target)
+    return llvm::make_error<llvm::StringError>(std::move(Error),
+                                               std::error_code());
+  llvm::TargetOptions TO = llvm::TargetOptions();
+  llvm::TargetMachine *TargetMachine = Target->createTargetMachine(
+      PTU.TheModule->getTargetTriple(), getCI()->getTargetOpts().CPU, "", TO,
+      llvm::Reloc::Model::PIC_);
+  PTU.TheModule->setDataLayout(TargetMachine->createDataLayout());
+
+  PTXCode.clear();
+  llvm::raw_svector_ostream dest(PTXCode);
+
+  llvm::legacy::PassManager PM;
+  if (TargetMachine->addPassesToEmitFile(PM, dest, nullptr,
+                                         llvm::CGFT_AssemblyFile)) {
+    return llvm::make_error<llvm::StringError>(
+        "NVPTX backend cannot produce PTX code.",
+        llvm::inconvertibleErrorCode());
+  }
+
+  if (!PM.run(*PTU.TheModule))
+    return llvm::make_error<llvm::StringError>(
+        "Failed to emit PTX code.", llvm::inconvertibleErrorCode());
+
+  PTXCode += '\0';
+  while (PTXCode.size() % 8)
+    PTXCode += '\0';
+  return PTXCode.str();
+}
+
+llvm::Error IncrementalCUDADeviceParser::GenerateFatbinary() {
+  enum FatBinFlags {
+    AddressSize64 = 0x01,
+    HasDebugInfo = 0x02,
+    ProducerCuda = 0x04,
+    HostLinux = 0x10,
+    HostMac = 0x20,
+    HostWindows = 0x40
+  };
+
+  struct FatBinInnerHeader {
+    uint16_t Kind;             // 0x00
+    uint16_t unknown02;        // 0x02
+    uint32_t HeaderSize;       // 0x04
+    uint32_t DataSize;         // 0x08
+    uint32_t unknown0c;        // 0x0c
+    uint32_t CompressedSize;   // 0x10
+    uint32_t SubHeaderSize;    // 0x14
+    uint16_t VersionMinor;     // 0x18
+    uint16_t VersionMajor;     // 0x1a
+    uint32_t CudaArch;         // 0x1c
+    uint32_t unknown20;        // 0x20
+    uint32_t unknown24;        // 0x24
+    uint32_t Flags;            // 0x28
+    uint32_t unknown2c;        // 0x2c
+    uint32_t unknown30;        // 0x30
+    uint32_t unknown34;        // 0x34
+    uint32_t UncompressedSize; // 0x38
+    uint32_t unknown3c;        // 0x3c
+    uint32_t unknown40;        // 0x40
+    uint32_t unknown44;        // 0x44
+    FatBinInnerHeader(uint32_t DataSize, uint32_t CudaArch, uint32_t Flags)
+        : Kind(1 /*PTX*/), unknown02(0x0101), HeaderSize(sizeof(*this)),
+          DataSize(DataSize), unknown0c(0), CompressedSize(0),
+          SubHeaderSize(HeaderSize - 8), VersionMinor(2), VersionMajor(4),
+          CudaArch(CudaArch), unknown20(0), unknown24(0), Flags(Flags),
+          unknown2c(0), unknown30(0), unknown34(0), UncompressedSize(0),
+          unknown3c(0), unknown40(0), unknown44(0) {}
+  };
+
+  struct FatBinHeader {
+    uint32_t Magic;      // 0x00
+    uint16_t Version;    // 0x04
+    uint16_t HeaderSize; // 0x06
+    uint32_t DataSize;   // 0x08
+    uint32_t unknown0c;  // 0x0c
+  public:
+    FatBinHeader(uint32_t DataSize)
+        : Magic(0xba55ed50), Version(1), HeaderSize(sizeof(*this)),
+          DataSize(DataSize), unknown0c(0) {}
+  };
+
+  FatBinHeader OuterHeader(sizeof(FatBinInnerHeader) + PTXCode.size());
+  FatbinContent.append((char *)&OuterHeader,
+                       ((char *)&OuterHeader) + OuterHeader.HeaderSize);
+
+  FatBinInnerHeader InnerHeader(PTXCode.size(), SMVersion,
+                                FatBinFlags::AddressSize64 |
+                                    FatBinFlags::HostLinux);
+  FatbinContent.append((char *)&InnerHeader,
+                       ((char *)&InnerHeader) + InnerHeader.HeaderSize);
+
+  FatbinContent.append(PTXCode.begin(), PTXCode.end());
+
+  return llvm::Error::success();
+}
+
+IncrementalCUDADeviceParser::~IncrementalCUDADeviceParser() {}
+
+} // namespace clang
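
One detail of GeneratePTX worth highlighting: the PTX string is NUL-terminated and then padded to an 8-byte boundary before GenerateFatbinary wraps it in the outer and inner headers. Isolated as a plain function (a restatement for clarity, not code from the commit):

    #include <string>

    // NUL-terminate the PTX, then pad with NULs until its size is a multiple
    // of 8, mirroring the loop at the end of GeneratePTX().
    std::string padPTX(std::string PTX) {
      PTX += '\0';
      while (PTX.size() % 8)
        PTX += '\0';
      return PTX;
    }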

clang/lib/Interpreter/DeviceOffload.h

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+//===----------- DeviceOffload.h - Device Offloading ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes required for offloading to CUDA devices.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_INTERPRETER_DEVICE_OFFLOAD_H
+#define LLVM_CLANG_LIB_INTERPRETER_DEVICE_OFFLOAD_H
+
+#include "IncrementalParser.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+namespace clang {
+
+class IncrementalCUDADeviceParser : public IncrementalParser {
+public:
+  IncrementalCUDADeviceParser(
+      Interpreter &Interp, std::unique_ptr<CompilerInstance> Instance,
+      IncrementalParser &HostParser, llvm::LLVMContext &LLVMCtx,
+      llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> VFS,
+      llvm::Error &Err);
+
+  llvm::Expected<PartialTranslationUnit &>
+  Parse(llvm::StringRef Input) override;
+
+  // Generate PTX for the last PTU
+  llvm::Expected<llvm::StringRef> GeneratePTX();
+
+  // Generate fatbinary contents in memory
+  llvm::Error GenerateFatbinary();
+
+  ~IncrementalCUDADeviceParser();
+
+protected:
+  IncrementalParser &HostParser;
+  int SMVersion;
+  llvm::SmallString<1024> PTXCode;
+  llvm::SmallVector<char, 1024> FatbinContent;
+  llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> VFS;
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_INTERPRETER_DEVICE_OFFLOAD_H
