[SYCL][HIP] Keep track of only shared USM allocations and prefetch only for those #11218

Closed: wants to merge 7 commits
49 changes: 49 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUAddGlobalForAtomicXor.cpp
@@ -0,0 +1,49 @@
//===- AMDGPUAddGlobalForAtomicXor.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Some AMDGPU atomic instructions require a prefetch in order to work
// correctly on memory allocated with hipMallocManaged. This pass scans a
// module for the problematic atomic instructions and, if one is present,
// creates a marker global (HipAtomicXorModuleNeedsPrefetch). This allows the
// prefetch to happen at runtime only for modules that actually contain the
// problematic instruction.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUAddGlobalForAtomicXor.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

#define NEW_GLOBAL_NAME "HipAtomicXorModuleNeedsPrefetch"

namespace {

bool moduleHasAtomicXor(Module &M) {
for (auto &F : M)
for (auto &I : instructions(F))
if (auto *AtomicInst = dyn_cast<AtomicRMWInst>(&I);
AtomicInst && AtomicInst->getOperation() == AtomicRMWInst::Xor)
return true;
return false;
}

} // end anonymous namespace

PreservedAnalyses
AMDGPUAddGlobalForAtomicXorPass::run(Module &M, ModuleAnalysisManager &AM) {
if (!moduleHasAtomicXor(M))
return PreservedAnalyses::all();
LLVMContext &Ctx = M.getContext();
M.getOrInsertGlobal(NEW_GLOBAL_NAME, Type::getInt1Ty(Ctx), [&] {
return new GlobalVariable(
M, Type::getInt1Ty(Ctx), true, GlobalValue::InternalLinkage,
Constant::getAllOnesValue(Type::getInt1Ty(Ctx)), NEW_GLOBAL_NAME);
});
return PreservedAnalyses::none();
}
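
For context, the scenario the pass description refers to can be shown with plain HIP: an atomic xor on hipMallocManaged memory that is prefetched to the device on the launch stream before the kernel runs. The snippet below is an illustrative sketch only; the kernel, variable names, and sizes are made up and are not part of this PR.

#include <hip/hip_runtime.h>

__global__ void xorKernel(int *Data) {
  // The problematic RMW: atomic xor on managed (shared USM) memory.
  atomicXor(Data, 1);
}

int main() {
  int Device = 0;
  (void)hipSetDevice(Device);

  int *Managed = nullptr;
  (void)hipMallocManaged((void **)&Managed, sizeof(int));
  *Managed = 0;

  hipStream_t Stream;
  (void)hipStreamCreate(&Stream);

  // Migrate the managed page to the GPU on the launch stream before the
  // kernel that performs the atomic xor on it runs.
  (void)hipMemPrefetchAsync(Managed, sizeof(int), Device, Stream);
  xorKernel<<<1, 1, 0, Stream>>>(Managed);
  (void)hipStreamSynchronize(Stream);

  (void)hipFree(Managed);
  (void)hipStreamDestroy(Stream);
  return 0;
}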
32 changes: 32 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUAddGlobalForAtomicXor.h
@@ -0,0 +1,32 @@
//===- AMDGPUAddGlobalForAtomicXor.h --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Some AMDGPU atomic instructions require a prefetch in order to work
// correctly on memory allocated with hipMallocManaged. This pass scans a
// module for the problematic atomic instructions and, if one is present,
// creates a marker global (HipAtomicXorModuleNeedsPrefetch). This allows the
// prefetch to happen at runtime only for modules that actually contain the
// problematic instruction.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_ADDGLOBALFORATOMICXOR_H
#define LLVM_LIB_TARGET_AMDGPU_ADDGLOBALFORATOMICXOR_H

#include "llvm/IR/PassManager.h"

namespace llvm {

class AMDGPUAddGlobalForAtomicXorPass
: public PassInfoMixin<AMDGPUAddGlobalForAtomicXorPass> {
public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

} // namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_ADDGLOBALFORATOMICXOR_H
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,6 +14,7 @@

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAddGlobalForAtomicXor.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
@@ -618,6 +619,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUAlwaysInlinePass());
return true;
}
if (PassName == "amdgpu-add-global-for-atomic-xor") {
PM.addPass(AMDGPUAddGlobalForAtomicXorPass());
return true;
}
if (PassName == "amdgpu-lower-module-lds") {
PM.addPass(AMDGPULowerModuleLDSPass(*this));
return true;
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -41,6 +41,7 @@ tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
add_public_tablegen_target(InstCombineTableGen)

add_llvm_target(AMDGPUCodeGen
AMDGPUAddGlobalForAtomicXor.cpp
AMDGPUAliasAnalysis.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
@@ -0,0 +1,11 @@
; RUN: sed -e s/.T1://g %s | opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-add-global-for-atomic-xor -S | FileCheck %s --check-prefix=CHECK1
; RUN: sed -e s/.T2://g %s | opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-add-global-for-atomic-xor -S | FileCheck %s --check-prefix=CHECK2

define i32 @test(ptr %p) {
; CHECK1: @HipAtomicXorModuleNeedsPrefetch
; CHECK2-NOT: @HipAtomicXorModuleNeedsPrefetch
;T1: %1 = atomicrmw volatile xor ptr %p, i32 1 syncscope("agent-one-as") monotonic, align 4
;T2: %1 = atomicrmw volatile add ptr %p, i32 1 syncscope("agent-one-as") monotonic, align 4
ret i32 %1
}

52 changes: 0 additions & 52 deletions sycl/plugins/unified_runtime/ur/adapters/hip/context.hpp
@@ -8,7 +8,6 @@
#pragma once

#include <set>
#include <unordered_map>

#include "common.hpp"
#include "device.hpp"
@@ -104,57 +103,6 @@ struct ur_context_handle_t_ {

ur_usm_pool_handle_t getOwningURPool(umf_memory_pool_t *UMFPool);

/// We need to keep track of USM mappings in AMD HIP, as certain extra
/// synchronization *is* actually required for correctness.
/// During kernel enqueue we must dispatch a prefetch for each kernel argument
/// that points to a USM mapping to ensure the mapping is correctly
/// populated on the device (https://github.com/intel/llvm/issues/7252). Thus,
/// we keep track of mappings in the context, and then check against them just
/// before the kernel is launched. The stream against which the kernel is
/// launched is not known until enqueue time, but the USM mappings can happen
/// at any time. Thus, they are tracked on the context used for the urUSM*
/// mapping.
///
/// The three utility functions are simple wrappers around a mapping from a
/// pointer to a size.
void addUSMMapping(void *Ptr, size_t Size) {
std::lock_guard<std::mutex> Guard(Mutex);
assert(USMMappings.find(Ptr) == USMMappings.end() &&
"mapping already exists");
USMMappings[Ptr] = Size;
}

void removeUSMMapping(const void *Ptr) {
std::lock_guard<std::mutex> guard(Mutex);
auto It = USMMappings.find(Ptr);
if (It != USMMappings.end())
USMMappings.erase(It);
}

std::pair<const void *, size_t> getUSMMapping(const void *Ptr) {
std::lock_guard<std::mutex> Guard(Mutex);
auto It = USMMappings.find(Ptr);
// The simple case is the fast case...
if (It != USMMappings.end())
return *It;

// ... but in the failure case we have to fall back to a full scan to search
// for "offset" pointers in case the user passes in the middle of an
// allocation. We have to do some not-so-ordained-by-the-standard ordered
// comparisons of pointers here, but it'll work on all platforms we support.
uintptr_t PtrVal = (uintptr_t)Ptr;
for (std::pair<const void *, size_t> Pair : USMMappings) {
uintptr_t BaseAddr = (uintptr_t)Pair.first;
uintptr_t EndAddr = BaseAddr + Pair.second;
if (PtrVal > BaseAddr && PtrVal < EndAddr) {
// If we've found something now, offset *must* be nonzero
assert(Pair.second);
return Pair;
}
}
return {nullptr, 0};
}

private:
std::mutex Mutex;
std::vector<deleter_data> ExtendedDeleters;
17 changes: 8 additions & 9 deletions sycl/plugins/unified_runtime/ur/adapters/hip/enqueue.cpp
@@ -254,7 +254,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
try {
ur_device_handle_t Dev = hQueue->getDevice();
ScopedContext Active(Dev);
ur_context_handle_t Ctx = hQueue->getContext();

uint32_t StreamToken;
ur_stream_quard Guard;
@@ -263,15 +262,15 @@
hipFunction_t HIPFunc = hKernel->get();

hipDevice_t HIPDev = Dev->get();
for (const void *P : hKernel->getPtrArgs()) {
auto [Addr, Size] = Ctx->getUSMMapping(P);
if (!Addr)
continue;
if (hipMemPrefetchAsync(Addr, Size, HIPDev, HIPStream) != hipSuccess)
return UR_RESULT_ERROR_INVALID_KERNEL_ARGS;

// Some args using shared USM require prefetch
for (auto [Ptr, Size] : hKernel->Args.PtrArgsRequiringPrefetch) {
if (Ptr && Size) {
UR_CHECK_ERROR(hipMemPrefetchAsync(Ptr, Size, HIPDev, HIPStream));
}
}
Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
phEventWaitList);
UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
phEventWaitList));

// Set the implicit global offset parameter if kernel has offset variant
if (hKernel->getWithOffsetParameter()) {
15 changes: 13 additions & 2 deletions sycl/plugins/unified_runtime/ur/adapters/hip/kernel.hpp
@@ -57,6 +57,8 @@ struct ur_kernel_handle_t_ {
args_index_t Indices;
args_size_t OffsetPerIndex;
std::set<const void *> PtrArgs;
// Pointer args that need a prefetch, stored as {Ptr, size of the allocation}
std::set<std::pair<const void *, size_t>> PtrArgsRequiringPrefetch;

std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};

@@ -177,11 +179,20 @@ struct ur_kernel_handle_t_ {
Args.addArg(Index, Size, Arg);
}

/// We track all pointer arguments to be able to issue prefetches at enqueue
/// time
void setKernelPtrArg(int Index, size_t Size, const void *PtrArg) {
Args.PtrArgs.insert(*static_cast<void *const *>(PtrArg));
setKernelArg(Index, Size, PtrArg);
// Ptr args pointing to managed memory may require a prefetch.
hipPointerAttribute_t Attribs;
// We only use hipPointerGetAttributes to check whether the pointer refers
// to a managed memory allocation, in which case it may need a prefetch at
// kernel launch. If the call fails, the pointer was most likely a plain
// host pointer, which needs no prefetch and is not an error here.
if (hipPointerGetAttributes(&Attribs, PtrArg) == hipSuccess &&
Attribs.isManaged) {
Args.PtrArgsRequiringPrefetch.insert(
{*static_cast<void *const *>(PtrArg), Size});
}
}

bool isPtrArg(const void *ptr) {
Expand Down