[SYCL][CUDA] Updated documentation for CUDA backend (#2042)

Ruyk · web-flow · commit b90385496f89 · 2020-07-10T10:40:09.000+03:00
Minor updates to the CUDA backend documentation and
starting guide.

Signed-off-by: Ruyman Reyes <ruyman@codeplay.com>
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md
@@ -129,7 +129,8 @@ the system, refer to
 
 Currently, the only combination tested is Ubuntu 18.04 with CUDA 10.2 using
 a Titan RTX GPU (SM 71), but it should work on any GPU compatible with SM 50 or
-above.
+above. The default SM for the NVIDIA CUDA backend is 5.0. Users can specify
+lower values, but some features may not be supported.
 
 ### Deployment
 
@@ -513,11 +514,10 @@ class CUDASelector : public cl::sycl::device_selector {
   public:
     int operator()(const cl::sycl::device &Device) const override {
       using namespace cl::sycl::info;
+      const std::string DriverVersion = Device.get_info<device::driver_version>();
 
-      const std::string DeviceName = Device.get_info<device::name>();
-      const std::string DeviceVendor = Device.get_info<device::vendor>();
-
-      if (Device.is_gpu() && (DeviceName.find("NVIDIA") != std::string::npos)) {
+      if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+        std::cout << " CUDA device found " << std::endl;
         return 1;
       };
       return -1;
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
@@ -176,28 +176,47 @@ struct _pi_context {
   std::vector<deleter_data> extended_deleters_;
 };
 
-/// PI Mem mapping to a CUDA memory allocation
-///
+/// PI Mem mapping to CUDA memory allocations, both data and texture/surface.
+/// \brief Represents non-SVM allocations on the CUDA backend.
+/// Keeps track of all mapped regions used for Map/Unmap calls.
+/// Only one region can be active at the same time per allocation.
 struct _pi_mem {
 
   // TODO: Move as much shared data up as possible
   using pi_context = _pi_context *;
 
+  // Context where the memory object is accessible
   pi_context context_;
+
+  /// Reference counting of the handler
   std::atomic_uint32_t refCount_;
   enum class mem_type { buffer, surface } mem_type_;
 
+  /// A PI Memory object represents either plain memory allocations ("Buffers"
+  /// in OpenCL) or typed allocations ("Images" in OpenCL).
+  /// In CUDA their API handlers are different. Whereas "Buffers" are allocated
+  /// as pointer-like structs, "Images" are stored in Textures or Surfaces.
+  /// This union allows implementation to use either from the same handler.
   union mem_ {
+    // Handler for plain, pointer-based CUDA allocations
     struct buffer_mem_ {
       using native_type = CUdeviceptr;
 
+      // If this allocation is a sub-buffer (i.e., a view on an existing
+      // allocation), this is the pointer to the parent handler structure
       pi_mem parent_;
+      // CUDA handler for the pointer
       native_type ptr_;
+
+      /// Pointer associated with this device on the host
       void *hostPtr_;
+      /// Size of the allocation in bytes
       size_t size_;
-
+      /// Offset of the active mapped region.
       size_t mapOffset_;
+      /// Pointer to the active mapped region, if any
       void *mapPtr_;
+      /// Original flags for the mapped region
       cl_map_flags mapFlags_;
 
       /** alloc_mode
@@ -222,6 +241,10 @@ struct _pi_mem {
 
       size_t get_map_offset(void *ptr) const noexcept { return mapOffset_; }
 
+      /// Returns a pointer to data visible on the host that contains
+      /// the data on the device associated with this allocation.
+      /// The offset is used to index into the CUDA allocation.
+      ///
       void *map_to_ptr(size_t offset, cl_map_flags flags) noexcept {
         assert(mapPtr_ == nullptr);
         mapOffset_ = offset;
@@ -235,6 +258,7 @@ struct _pi_mem {
         return mapPtr_;
       }
 
+      /// Detach the allocation from the host memory.
       void unmap(void *ptr) noexcept {
         assert(mapPtr_ != nullptr);
 
@@ -251,6 +275,7 @@ struct _pi_mem {
       }
     } buffer_mem_;
 
+    // Handler data for surface object (i.e. Images)
     struct surface_mem_ {
       CUarray array_;
       CUsurfObject surfObj_;
@@ -264,7 +289,7 @@ struct _pi_mem {
     } surface_mem_;
   } mem_;
 
-  // Buffer constructor
+  /// Constructs the PI MEM handler for a non-typed allocation ("buffer")
   _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode,
           CUdeviceptr ptr, void *host_ptr, size_t size)
       : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} {
@@ -283,7 +308,7 @@ struct _pi_mem {
     }
   };
 
-  // Surface constructor
+  /// Constructs the PI allocation for an Image object (surface in CUDA)
   _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf,
           pi_mem_type image_type, void *host_ptr)
       : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} {