codeplaysoftware
diff --git a/‎clang/include/clang/Basic/Attr.td
Lines changed: 8 additions & 0 deletions b/‎clang/include/clang/Basic/Attr.td
Lines changed: 8 additions & 0 deletions
diff --git a/‎clang/include/clang/Basic/AttrDocs.td
Lines changed: 11 additions & 0 deletions b/‎clang/include/clang/Basic/AttrDocs.td
Lines changed: 11 additions & 0 deletions
diff --git a/‎clang/include/clang/Basic/DiagnosticSemaKinds.td
Lines changed: 3 additions & 2 deletions b/‎clang/include/clang/Basic/DiagnosticSemaKinds.td
Lines changed: 3 additions & 2 deletions
diff --git a/‎clang/lib/CodeGen/CGCUDANV.cpp
Lines changed: 81 additions & 12 deletions b/‎clang/lib/CodeGen/CGCUDANV.cpp
Lines changed: 81 additions & 12 deletions
diff --git a/‎clang/lib/CodeGen/CGCUDARuntime.h
Lines changed: 5 additions & 2 deletions b/‎clang/lib/CodeGen/CGCUDARuntime.h
Lines changed: 5 additions & 2 deletions
diff --git a/‎clang/lib/CodeGen/CodeGenModule.cpp
Lines changed: 3 additions & 2 deletions b/‎clang/lib/CodeGen/CodeGenModule.cpp
Lines changed: 3 additions & 2 deletions
diff --git a/‎clang/lib/Sema/SemaDeclAttr.cpp
Lines changed: 31 additions & 2 deletions b/‎clang/lib/Sema/SemaDeclAttr.cpp
Lines changed: 31 additions & 2 deletions
diff --git a/‎clang/test/AST/Inputs/cuda.h
Lines changed: 54 additions & 0 deletions b/‎clang/test/AST/Inputs/cuda.h
Lines changed: 54 additions & 0 deletions
diff --git a/‎clang/test/AST/ast-dump-managed-var.cu
Lines changed: 28 additions & 0 deletions b/‎clang/test/AST/ast-dump-managed-var.cu
Lines changed: 28 additions & 0 deletions
diff --git a/‎clang/test/CodeGenCUDA/Inputs/cuda.h
Lines changed: 3 additions & 0 deletions b/‎clang/test/CodeGenCUDA/Inputs/cuda.h
Lines changed: 3 additions & 0 deletions
@@ -324,6 +324,7 @@ class LangOpt<string name, code customCode = [{}]> {
 def MicrosoftExt : LangOpt<"MicrosoftExt">;
 def Borland : LangOpt<"Borland">;
 def CUDA : LangOpt<"CUDA">;
+def HIP : LangOpt<"HIP">;
 def SYCL : LangOpt<"SYCLIsDevice">;
 def COnly : LangOpt<"", "!LangOpts.CPlusPlus">;
 def CPlusPlus : LangOpt<"CPlusPlus">;
@@ -1115,6 +1116,13 @@ def CUDAHost : InheritableAttr {
   let Documentation = [Undocumented];
 }
 
+def HIPManaged : InheritableAttr {
+  let Spellings = [GNU<"managed">, Declspec<"__managed__">];
+  let Subjects = SubjectList<[Var]>; // Applies to variable declarations only.
+  let LangOpts = [HIP]; // Gated on the HIP LangOpt.
+  let Documentation = [HIPManagedAttrDocs];
+}
+
 def CUDAInvalidTarget : InheritableAttr {
   let Spellings = [];
   let Subjects = SubjectList<[Function]>;
 
@@ -5419,6 +5419,17 @@ unbind runtime APIs.
   }];
 }
 
+def HIPManagedAttrDocs : Documentation {
+  let Category = DocCatDecl;
+  let Content = [{
+The ``__managed__`` attribute can be applied to a global variable declaration in HIP.
+A managed variable is emitted as an undefined global symbol in the device binary and is
+registered by ``__hipRegisterManagedVar`` in init functions. The HIP runtime allocates
+managed memory and uses it to define the symbol when loading the device binary.
+A managed variable can be accessed in both device and host code.
+  }];
+}
+
 def LifetimeOwnerDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
 
@@ -8237,7 +8237,7 @@ def err_cuda_device_exceptions : Error<
   "%select{__device__|__global__|__host__|__host__ __device__}1 function">;
 def err_dynamic_var_init : Error<
     "dynamic initialization is not supported for "
-    "__device__, __constant__, and __shared__ variables.">;
+    "__device__, __constant__, __shared__, and __managed__ variables.">;
 def err_shared_var_init : Error<
     "initialization is not supported for __shared__ variables.">;
 def err_cuda_vla : Error<
@@ -8247,7 +8247,8 @@ def err_cuda_extern_shared : Error<"__shared__ variable %0 cannot be 'extern'">;
 def err_cuda_host_shared : Error<
     "__shared__ local variables not allowed in "
     "%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
-def err_cuda_nonstatic_constdev: Error<"__constant__ and __device__ are not allowed on non-static local variables">;
+def err_cuda_nonstatic_constdev: Error<"__constant__, __device__, and "
+    "__managed__ are not allowed on non-static local variables">;
 def err_cuda_ovl_target : Error<
   "%select{__device__|__global__|__host__|__host__ __device__}0 function %1 "
   "cannot overload %select{__device__|__global__|__host__|__host__ __device__}2 function %3">;
 
@@ -21,6 +21,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/Format.h"
 
 using namespace clang;
@@ -128,21 +129,23 @@ class CGNVCUDARuntime : public CGCUDARuntime {
     DeviceVars.push_back({&Var,
                           VD,
                           {DeviceVarFlags::Variable, Extern, Constant,
-                           /*Normalized*/ false, /*Type*/ 0}});
+                           VD->hasAttr<HIPManagedAttr>(),
+                           /*Normalized*/ false, 0}});
   }
   void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                           bool Extern, int Type) override {
     DeviceVars.push_back({&Var,
                           VD,
                           {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
+                           /*Managed*/ false,
                            /*Normalized*/ false, Type}});
   }
   void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type, bool Normalized) override {
     DeviceVars.push_back({&Var,
                           VD,
                           {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
-                           Normalized, Type}});
+                           /*Managed*/ false, Normalized, Type}});
   }
 
   /// Creates module constructor function
@@ -380,6 +383,47 @@ void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
   CGF.EmitBlock(EndBlock);
 }
 
+// Rewrite every instruction that uses Var, directly or through a chain of
+// constant expressions, so that it uses the address loaded from ManagedVar,
+// which is populated by HIP runtime.
+static void replaceManagedVar(llvm::GlobalVariable *Var,
+                              llvm::GlobalVariable *ManagedVar) {
+  // A work item is a use chain: constant expressions, then the final user.
+  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
+  for (auto &&VarUse : Var->uses())
+    WorkList.push_back({VarUse.getUser()});
+  while (!WorkList.empty()) {
+    auto &&WorkItem = WorkList.pop_back_val();
+    auto *U = WorkItem.back();
+    if (isa<llvm::ConstantExpr>(U)) {
+      for (auto &&UU : U->uses()) {
+        WorkItem.push_back(UU.getUser());
+        WorkList.push_back(WorkItem);
+        WorkItem.pop_back();
+      }
+      continue;
+    }
+    auto *I = dyn_cast<llvm::Instruction>(U);
+    if (!I)
+      llvm_unreachable("Invalid use of managed variable");
+    llvm::Value *OldV = Var;
+    // An unset alignment falls back to 1; llvm::Align(0) would assert.
+    llvm::Instruction *NewV =
+        new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
+                           Var->getAlign().valueOrOne(), I);
+    WorkItem.pop_back();
+    // Turn each constant expression on the chain into an instruction.
+    for (auto &&Op : WorkItem) {
+      auto *CE = cast<llvm::ConstantExpr>(Op);
+      auto *NewInst = llvm::createReplacementInstr(CE, I);
+      NewInst->replaceUsesOfWith(OldV, NewV);
+      OldV = CE;
+      NewV = NewInst;
+    }
+    I->replaceUsesOfWith(OldV, NewV);
+  }
+}
+
 /// Creates a function that sets up state on the host side for CUDA objects that
 /// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
@@ -452,6 +496,13 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
   llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
       addUnderscoredPrefixToName("RegisterVar"));
+  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
+  //                              size_t, unsigned)
+  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
+                                            CharPtrTy,    VarSizeTy, IntTy};
+  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
+      addUnderscoredPrefixToName("RegisterManagedVar"));
   // void __cudaRegisterSurface(void **, const struct surfaceReference *,
   //                            const void **, const char *, int, int);
   llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
@@ -474,16 +525,34 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
     case DeviceVarFlags::Variable: {
       uint64_t VarSize =
           CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
-      llvm::Value *Args[] = {
-          &GpuBinaryHandlePtr,
-          Builder.CreateBitCast(Var, VoidPtrTy),
-          VarName,
-          VarName,
-          llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
-          llvm::ConstantInt::get(VarSizeTy, VarSize),
-          llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
-          llvm::ConstantInt::get(IntTy, 0)};
-      Builder.CreateCall(RegisterVar, Args);
+      if (Info.Flags.isManaged()) {
+        auto ManagedVar = new llvm::GlobalVariable(
+            CGM.getModule(), Var->getType(),
+            /*isConstant=*/false, Var->getLinkage(),
+            /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
+            Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
+            llvm::GlobalVariable::NotThreadLocal);
+        replaceManagedVar(Var, ManagedVar);
+        llvm::Value *Args[] = {
+            &GpuBinaryHandlePtr,
+            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
+            Builder.CreateBitCast(Var, VoidPtrTy),
+            VarName,
+            llvm::ConstantInt::get(VarSizeTy, VarSize),
+            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+        Builder.CreateCall(RegisterManagedVar, Args);
+      } else {
+        llvm::Value *Args[] = {
+            &GpuBinaryHandlePtr,
+            Builder.CreateBitCast(Var, VoidPtrTy),
+            VarName,
+            VarName,
+            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
+            llvm::ConstantInt::get(VarSizeTy, VarSize),
+            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
+            llvm::ConstantInt::get(IntTy, 0)};
+        Builder.CreateCall(RegisterVar, Args);
+      }
       break;
     }
     case DeviceVarFlags::Surface:
 
@@ -54,16 +54,19 @@ class CGCUDARuntime {
     unsigned Kind : 2;
     unsigned Extern : 1;
     unsigned Constant : 1;   // Constant variable.
+    unsigned Managed : 1;    // Managed variable.
     unsigned Normalized : 1; // Normalized texture.
     int SurfTexType;         // Type of surface/texutre.
 
   public:
-    DeviceVarFlags(DeviceVarKind K, bool E, bool C, bool N, int T)
-        : Kind(K), Extern(E), Constant(C), Normalized(N), SurfTexType(T) {}
+    DeviceVarFlags(DeviceVarKind K, bool E, bool C, bool M, bool N, int T)
+        : Kind(K), Extern(E), Constant(C), Managed(M), Normalized(N),
+          SurfTexType(T) {}
 
     DeviceVarKind getKind() const { return static_cast<DeviceVarKind>(Kind); }
     bool isExtern() const { return Extern; }
     bool isConstant() const { return Constant; }
+    bool isManaged() const { return Managed; }
     bool isNormalized() const { return Normalized; }
     int getSurfTexType() const { return SurfTexType; }
   };
 
@@ -4152,13 +4152,14 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
   // Shadows of initialized device-side global variables are also left
   // undefined.
   bool IsCUDAShadowVar =
-      !getLangOpts().CUDAIsDevice &&
+      !getLangOpts().CUDAIsDevice && !D->hasAttr<HIPManagedAttr>() &&
       (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>() ||
        D->hasAttr<CUDASharedAttr>());
   bool IsCUDADeviceShadowVar =
       getLangOpts().CUDAIsDevice &&
       (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
-       D->getType()->isCUDADeviceBuiltinTextureType());
+       D->getType()->isCUDADeviceBuiltinTextureType() ||
+       D->hasAttr<HIPManagedAttr>());
   // HIP pinned shadow of initialized host-side global variables are also
   // left undefined.
   if (getLangOpts().CUDA &&
 
@@ -4493,7 +4493,8 @@ static void handleOptimizeNoneAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }
 
 static void handleConstantAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (checkAttrMutualExclusion<CUDASharedAttr>(S, D, AL))
+  if (checkAttrMutualExclusion<CUDASharedAttr>(S, D, AL) ||
+      checkAttrMutualExclusion<HIPManagedAttr>(S, D, AL))
     return;
   const auto *VD = cast<VarDecl>(D);
   if (VD->hasLocalStorage()) {
@@ -4504,7 +4505,8 @@ static void handleConstantAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
 }
 
 static void handleSharedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (checkAttrMutualExclusion<CUDAConstantAttr>(S, D, AL))
+  if (checkAttrMutualExclusion<CUDAConstantAttr>(S, D, AL) ||
+      checkAttrMutualExclusion<HIPManagedAttr>(S, D, AL))
     return;
   const auto *VD = cast<VarDecl>(D);
   // extern __shared__ is only allowed on arrays with no length (e.g.
@@ -4569,9 +4571,33 @@ static void handleDeviceAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       return;
     }
   }
+
+  if (auto *A = D->getAttr<CUDADeviceAttr>()) {
+    if (!A->isImplicit())
+      return;
+    D->dropAttr<CUDADeviceAttr>();
+  }
   D->addAttr(::new (S.Context) CUDADeviceAttr(S.Context, AL));
 }
 
+// Attach HIPManagedAttr, plus an implicit CUDADeviceAttr when D has none,
+// unless D is __constant__/__shared__ or a variable with local storage.
+static void handleManagedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (checkAttrMutualExclusion<CUDAConstantAttr>(S, D, AL))
+    return;
+  if (checkAttrMutualExclusion<CUDASharedAttr>(S, D, AL))
+    return;
+  const auto *Variable = dyn_cast<VarDecl>(D);
+  if (Variable && Variable->hasLocalStorage()) {
+    S.Diag(AL.getLoc(), diag::err_cuda_nonstatic_constdev);
+    return;
+  }
+  if (!D->hasAttr<HIPManagedAttr>())
+    D->addAttr(::new (S.Context) HIPManagedAttr(S.Context, AL));
+  if (!D->hasAttr<CUDADeviceAttr>())
+    D->addAttr(CUDADeviceAttr::CreateImplicit(S.Context));
+}
+
 static void handleGNUInlineAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   const auto *Fn = cast<FunctionDecl>(D);
   if (!Fn->isInlineSpecified()) {
@@ -7793,6 +7819,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
   case ParsedAttr::AT_CUDAHost:
     handleSimpleAttributeWithExclusions<CUDAHostAttr, CUDAGlobalAttr>(S, D, AL);
     break;
+  case ParsedAttr::AT_HIPManaged:
+    handleManagedAttr(S, D, AL);
+    break;
   case ParsedAttr::AT_CUDADeviceBuiltinSurfaceType:
     handleSimpleAttributeWithExclusions<CUDADeviceBuiltinSurfaceTypeAttr,
                                         CUDADeviceBuiltinTextureTypeAttr>(S, D,
 
@@ -0,0 +1,54 @@
+/* Minimal declarations for CUDA support.  Testing purposes only. */
+
+#include <stddef.h>
+
+// Make this file work with nvcc, for testing compatibility.
+
+#ifndef __NVCC__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#define __managed__ __attribute__((managed))
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#ifdef __HIP__
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+#endif
+
+// Host- and device-side placement new overloads.
+void *operator new(__SIZE_TYPE__, void *p) { return p; }
+void *operator new[](__SIZE_TYPE__, void *p) { return p; }
+__device__ void *operator new(__SIZE_TYPE__, void *p) { return p; }
+__device__ void *operator new[](__SIZE_TYPE__, void *p) { return p; }
+
+#endif // !__NVCC__
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -ast-dump -x hip %s | FileCheck %s
+// RUN: %clang_cc1 -ast-dump -fcuda-is-device -x hip %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: VarDecl {{.*}} m1 'int'
+// CHECK-NEXT: HIPManagedAttr
+// CHECK-NEXT: CUDADeviceAttr {{.*}}Implicit
+__managed__ int m1;
+
+// CHECK-LABEL: VarDecl {{.*}} m2 'int'
+// CHECK-NEXT: HIPManagedAttr
+// CHECK-NEXT: CUDADeviceAttr {{.*}}Implicit
+// CHECK-NOT: HIPManagedAttr
+// CHECK-NOT: CUDADeviceAttr
+__managed__ __managed__ int m2;
+
+// CHECK-LABEL: VarDecl {{.*}} m3 'int'
+// CHECK-NEXT: HIPManagedAttr
+// CHECK-NEXT: CUDADeviceAttr {{.*}}line
+// CHECK-NOT: CUDADeviceAttr {{.*}}Implicit
+__managed__ __device__ int m3;
+
+// CHECK-LABEL: VarDecl {{.*}} m3a 'int'
+// CHECK-NEXT: CUDADeviceAttr {{.*}}cuda.h
+// CHECK-NEXT: HIPManagedAttr
+// CHECK-NOT: CUDADeviceAttr {{.*}}Implicit
+__device__ __managed__ int m3a;
@@ -7,6 +7,9 @@
 #define __global__ __attribute__((global))
 #define __host__ __attribute__((host))
 #define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
 
 struct dim3 {