Skip to content

Commit a0de2d7

Browse files
Petr Vesely and callumfare
authored and committed
[SYCL][CUDA][PI][UR] Fix PR review comments
1 parent 9b3448a commit a0de2d7

File tree

13 files changed

+85
-94
lines changed

13 files changed

+85
-94
lines changed

sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
2121

2222
/// UR context mapping to a CUDA context object.
2323
///
24-
/// There is no direct mapping between a CUDA context and a UR context,
25-
/// main differences described below:
24+
/// There is no direct mapping between a CUDA context and a UR context.
25+
/// The main differences are described below:
2626
///
2727
/// <b> CUDA context vs UR context </b>
2828
///
@@ -32,21 +32,21 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
3232
/// with a given device and control access to said device from the user side.
3333
/// UR API context are objects that are passed to functions, and not bound
3434
/// to threads.
35-
/// The _ur_context object doesn't implement this behavior, only holds the
36-
/// CUDA context data. The RAII object \ref ScopedContext implements the active
37-
/// context behavior.
35+
/// The ur_context_handle_t_ object doesn't implement this behavior. It only
36+
/// holds the CUDA context data. The RAII object \ref ScopedContext implements
37+
/// the active context behavior.
3838
///
3939
/// <b> Primary vs User-defined context </b>
4040
///
4141
/// CUDA has two different types of context, the Primary context,
4242
/// which is usable by all threads on a given process for a given device, and
4343
/// the aforementioned custom contexts.
44-
/// CUDA documentation, and performance analysis, indicates it is recommended
45-
/// to use Primary context whenever possible.
46-
/// Primary context is used as well by the CUDA Runtime API.
44+
/// The CUDA documentation, confirmed with performance analysis, suggest using
45+
/// the Primary context whenever possible.
46+
/// The Primary context is also used by the CUDA Runtime API.
4747
/// For UR applications to interop with CUDA Runtime API, they have to use
4848
/// the primary context - and make that active in the thread.
49-
/// The `_ur_context` object can be constructed with a `kind` parameter
49+
/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter
5050
/// that allows to construct a Primary or `user-defined` context, so that
5151
/// the UR object interface is always the same.
5252
///
@@ -56,6 +56,7 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
5656
/// the PI Context can store a number of callback functions that will be
5757
/// called upon destruction of the UR Context.
5858
/// See proposal for details.
59+
/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
5960
///
6061
struct ur_context_handle_t_ {
6162

sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ struct ur_event_handle_t_ {
101101
uint32_t StreamToken);
102102

103103
// This constructor is private to force programmers to use the
104-
// makeWithNative for event introp
104+
// makeWithNative for event interop
105105
ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative);
106106

107107
ur_command_t CommandType; // The type of command associated with event.
@@ -117,33 +117,34 @@ struct ur_event_handle_t_ {
117117
bool IsRecorded; // Signifies wether a native CUDA event has been recorded
118118
// yet.
119119
bool IsStarted; // Signifies wether the operation associated with the
120-
// PI event has started or not
120+
// UR event has started or not
121121

122122
uint32_t StreamToken;
123123
uint32_t EventID; // Queue identifier of the event.
124124

125-
native_type EvEnd; // CUDA event handle. If this _pi_event represents a user
126-
// event, this will be nullptr.
125+
native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents
126+
// a user event, this will be nullptr.
127127

128128
native_type EvStart; // CUDA event handle associated with the start
129129

130130
native_type EvQueued; // CUDA event handle associated with the time
131131
// the command was enqueued
132132

133-
ur_queue_handle_t Queue; // pi_queue associated with the event. If this is a
134-
// user event, this will be nullptr.
133+
ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If
134+
// this is a user event, this will be nullptr.
135135

136136
CUstream Stream; // CUstream associated with the event. If this is a user
137137
// event, this will be uninitialized.
138138

139-
ur_context_handle_t Context; // pi_context associated with the event. If this
140-
// is a native event, this will be the same
141-
// context associated with the queue_ member.
139+
ur_context_handle_t Context; // ur_context_handle_t associated with the event.
140+
// If this is a native event, this will be the
141+
// same context associated with the queue member.
142142
};
143143

144-
// Iterates over the event wait list, returns correct ur_result_t error codes.
145-
// Invokes the callback for the latest event of each queue in the wait list.
146-
// The callback must take a single pi_event argument and return a ur_result_t.
144+
// Iterate over `event_wait_list` and apply the given callback `f` to the
145+
// latest event on each queue therein. The callback must take a single
146+
// ur_event_handle_t argument and return a ur_result_t. If the callback returns
147+
// an error, the iteration terminates and the error is returned.
147148
template <typename Func>
148149
ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList,
149150
std::size_t NumEventsInWaitList, Func &&F) {
@@ -169,14 +170,13 @@ ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList,
169170
Event0->getEventID() > Event1->getEventID());
170171
});
171172

172-
bool First = true;
173173
CUstream LastSeenStream = 0;
174-
for (ur_event_handle_t Event : Events) {
175-
if (!Event || (!First && Event->getStream() == LastSeenStream)) {
174+
for (size_t i = 0; i < Events.size(); i++) {
175+
auto Event = Events[i];
176+
if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) {
176177
continue;
177178
}
178179

179-
First = false;
180180
LastSeenStream = Event->getStream();
181181

182182
auto Result = F(Event);

sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
6666
void *pPropValue, size_t *pPropSizeRet) {
6767
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
6868

69-
// Here we want to query about a kernel's cuda blocks!
7069
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
7170

7271
switch (propName) {
@@ -356,6 +355,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
356355
ur_program_handle_t hProgram,
357356
const ur_kernel_native_properties_t *pProperties,
358357
ur_kernel_handle_t *phKernel) {
358+
std::ignore = hNativeKernel;
359+
std::ignore = hContext;
360+
std::ignore = hProgram;
361+
std::ignore = pProperties;
362+
std::ignore = phKernel;
359363
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
360364
}
361365

sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,14 @@
2323
/// invocation. This is not the case of CUFunction objects,
2424
/// which are simply passed together with the arguments on the invocation.
2525
/// The UR Kernel implementation for CUDA stores the list of arguments,
26-
/// argument sizes and offsets to emulate the interface of UR Kernel,
26+
/// argument sizes, and offsets to emulate the interface of UR Kernel,
2727
/// saving the arguments for the later dispatch.
2828
/// Note that in UR API, the Local memory is specified as a size per
2929
/// individual argument, but in CUDA only the total usage of shared
3030
/// memory is required since it is not passed as a parameter.
3131
/// A compiler pass converts the UR API local memory model into the
3232
/// CUDA shared model. This object simply calculates the total of
3333
/// shared memory, and the initial offsets of each parameter.
34-
///
3534
struct ur_kernel_handle_t_ {
3635
using native_type = CUfunction;
3736

@@ -68,7 +67,7 @@ struct ur_kernel_handle_t_ {
6867
Indices.emplace_back(&ImplicitOffsetArgs);
6968
}
7069

71-
/// Adds an argument to the kernel.
70+
/// Add an argument to the kernel.
7271
/// If the argument existed before, it is replaced.
7372
/// Otherwise, it is added.
7473
/// Gaps are filled with empty arguments.
@@ -104,8 +103,9 @@ struct ur_kernel_handle_t_ {
104103

105104
// align the argument
106105
size_t AlignedLocalOffset = LocalOffset;
107-
if (LocalOffset % Alignment != 0) {
108-
AlignedLocalOffset += Alignment - (LocalOffset % Alignment);
106+
size_t Pad = LocalOffset % Alignment;
107+
if (Pad != 0) {
108+
AlignedLocalOffset += Alignment - Pad;
109109
}
110110

111111
addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
@@ -171,7 +171,7 @@ struct ur_kernel_handle_t_ {
171171

172172
const char *getName() const noexcept { return Name.c_str(); }
173173

174-
/// Returns the number of arguments, excluding the implicit global offset.
174+
/// Get the number of kernel arguments, excluding the implicit global offset.
175175
/// Note this only returns the current known number of arguments, not the
176176
/// real one required by the kernel, since this cannot be queried from
177177
/// the CUDA Driver API

sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
/// Creates a UR Memory object using a CUDA memory allocation.
1616
/// Can trigger a manual copy depending on the mode.
17-
/// \TODO Implement USE_HOST_PTR using cuHostRegister
17+
/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789
1818
///
1919
UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
2020
ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size,
@@ -109,7 +109,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
109109
/// Decreases the reference count of the Mem object.
110110
/// If this is zero, calls the relevant CUDA Free function
111111
/// \return UR_RESULT_SUCCESS unless deallocation error
112-
///
113112
UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
114113
UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
115114

@@ -435,7 +434,6 @@ urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType,
435434
/// Implements a buffer partition in the CUDA backend.
436435
/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented
437436
/// as an offset over an existing CUDA allocation.
438-
///
439437
UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
440438
ur_mem_handle_t hBuffer, ur_mem_flags_t flags,
441439
ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion,

sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
/// Keeps tracks of all mapped regions used for Map/Unmap calls.
1919
/// Only one region can be active at the same time per allocation.
2020
struct ur_mem_handle_t_ {
21-
// Context where the memory object is accessibles
21+
// Context where the memory object is accessible
2222
ur_context_handle_t Context;
2323

2424
/// Reference counting of the handler
@@ -31,7 +31,7 @@ struct ur_mem_handle_t_ {
3131
/// A UR Memory object represents either plain memory allocations ("Buffers"
3232
/// in OpenCL) or typed allocations ("Images" in OpenCL).
3333
/// In CUDA their API handlers are different. Whereas "Buffers" are allocated
34-
/// as pointer-like structs, "Images" are stored in Textures or Surfaces
34+
/// as pointer-like structs, "Images" are stored in Textures or Surfaces.
3535
/// This union allows implementation to use either from the same handler.
3636
union MemImpl {
3737
// Handler for plain, pointer-based CUDA allocations
@@ -80,7 +80,6 @@ struct ur_mem_handle_t_ {
8080
/// Returns a pointer to data visible on the host that contains
8181
/// the data on the device associated with this allocation.
8282
/// The offset is used to index into the CUDA allocation.
83-
///
8483
void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept {
8584
assert(MapPtr == nullptr);
8685
MapOffset = Offset;
@@ -152,7 +151,6 @@ struct ur_mem_handle_t_ {
152151
ur_mem_type_t ImageType, void *HostPtr)
153152
: Context{Context}, RefCount{1}, MemType{Type::Surface},
154153
MemFlags{MemFlags} {
155-
// Ignore unused parameter
156154
(void)HostPtr;
157155

158156
Mem.SurfaceMem.Array = Array;
@@ -162,16 +160,13 @@ struct ur_mem_handle_t_ {
162160
}
163161

164162
~ur_mem_handle_t_() {
165-
if (MemType == Type::Buffer) {
166-
if (isSubBuffer()) {
167-
urMemRelease(Mem.BufferMem.Parent);
168-
return;
169-
}
163+
if (isBuffer() && isSubBuffer()) {
164+
urMemRelease(Mem.BufferMem.Parent);
165+
return;
170166
}
171167
urContextRelease(Context);
172168
}
173169

174-
// TODO: Move as many shared funcs up as possible
175170
bool isBuffer() const noexcept { return MemType == Type::Buffer; }
176171

177172
bool isSubBuffer() const noexcept {

sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
5656
///
5757
/// However because multiple devices in a context is not currently supported,
5858
/// place each device in a separate platform.
59-
///
6059
UR_DLLEXPORT ur_result_t UR_APICALL
6160
urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms,
6261
uint32_t *pNumPlatforms) {
@@ -183,7 +182,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) {
183182
return UR_RESULT_SUCCESS;
184183
}
185184

186-
// Returns plugin specific backend option.
185+
// Get CUDA plugin specific backend option.
187186
// Current support is only for optimization options.
188187
// Return empty string for cuda.
189188
// TODO: Determine correct string to be passed.

sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
159159
/// CUDA driver API doesn't expose an operation for this.
160160
/// Note: This is currently only being used by the SYCL program class for the
161161
/// has_kernel method, so an alternative would be to move the has_kernel
162-
/// query to PI and use cuModuleGetFunction to check for a kernel.
162+
/// query to UR and use cuModuleGetFunction to check for a kernel.
163163
/// Note: Another alternative is to add kernel names as metadata, like with
164164
/// reqd_work_group_size.
165165
ur_result_t getKernelNames(ur_program_handle_t) {
@@ -169,7 +169,6 @@ ur_result_t getKernelNames(ur_program_handle_t) {
169169
/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
170170
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
171171
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
172-
///
173172
UR_APIEXPORT ur_result_t UR_APICALL
174173
urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
175174
size_t length, const ur_program_properties_t *pProperties,
@@ -186,7 +185,6 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
186185
/// CUDA will handle the PTX/CUBIN binaries internally through a call to
187186
/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent
188187
/// in terms of CUDA adapter. \TODO Implement asynchronous compilation
189-
///
190188
UR_APIEXPORT ur_result_t UR_APICALL
191189
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
192190
const char *pOptions) {
@@ -196,7 +194,6 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
196194
/// Loads the images from a UR program into a CUmodule that can be
197195
/// used later on to extract functions (kernels).
198196
/// See \ref ur_program_handle_t for implementation details.
199-
///
200197
UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
201198
ur_program_handle_t hProgram,
202199
const char *pOptions) {
@@ -218,7 +215,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
218215
/// Creates a new UR program object that is the outcome of linking all input
219216
/// programs.
220217
/// \TODO Implement linker options, requires mapping of OpenCL to CUDA
221-
///
222218
UR_APIEXPORT ur_result_t UR_APICALL
223219
urProgramLink(ur_context_handle_t hContext, uint32_t count,
224220
const ur_program_handle_t *phPrograms, const char *pOptions,
@@ -390,10 +386,10 @@ urProgramRelease(ur_program_handle_t hProgram) {
390386

391387
/// Gets the native CUDA handle of a UR program object
392388
///
393-
/// \param[in] program The PI program to get the native CUDA object of.
394-
/// \param[out] nativeHandle Set to the native handle of the PI program object.
389+
/// \param[in] program The UR program handle to get the native CUDA object of.
390+
/// \param[out] nativeHandle Set to the native handle of the UR program object.
395391
///
396-
/// \return TBD
392+
/// \return ur_result_t
397393
UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
398394
ur_program_handle_t program, ur_native_handle_t *nativeHandle) {
399395
UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE);

sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ CUstream ur_queue_handle_t_::getNextTransferStream() {
115115
/// Valid properties
116116
/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT
117117
/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING
118-
///
119118
UR_APIEXPORT ur_result_t UR_APICALL
120119
urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
121120
const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {
@@ -294,7 +293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
294293

295294
UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet);
296295

297-
switch (uint32_t{propName}) {
296+
switch (propName) {
298297
case UR_QUEUE_INFO_CONTEXT:
299298
return ReturnValue(hQueue->Context);
300299
case UR_QUEUE_INFO_DEVICE:
@@ -324,7 +323,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
324323
}
325324
}
326325
default:
327-
break;
326+
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
328327
}
329328

330329
return UR_RESULT_ERROR_INVALID_ENUMERATION;

0 commit comments

Comments (0)