Skip to content

Commit 0c2f97c

Browse files
jataylopragupta
authored and committed
[ROCm] Intra-node all reduce initial implementation (#1435)
* Initial commit to port intra_node_comm to ROCm (cherry picked from commit 48d1c33) * gpt-fast running now with intra-node comm (cherry picked from commit 618c54e) --------- Co-authored-by: Prachi Gupta <[email protected]>
1 parent 5187ca9 commit 0c2f97c

File tree

3 files changed

+66
-15
lines changed

3 files changed

+66
-15
lines changed

caffe2/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,10 @@ if(USE_ROCM)
609609
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
610610
if(NOT WIN32)
611611
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
612+
set_source_files_properties(
613+
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
614+
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
615+
)
612616
endif()
613617
endif()
614618
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.

torch/csrc/distributed/c10d/intra_node_comm.cpp

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
1919
#include <c10/cuda/driver_api.h>
2020
#include <nvml.h>
21+
#else
22+
#include <rocm_smi/rocm_smi.h>
2123
#endif
2224

2325
#include <cuda_runtime.h>
@@ -146,7 +148,26 @@ static NvlMesh getNvlMesh(const std::vector<std::string>& rankToBusId) {
146148
}
147149
return nvlMesh;
148150
#else
149-
return {};
151+
NvlMesh nvlMesh = {};
152+
const auto worldSize = rankToBusId.size();
153+
// For each device, loop over devices connected to it
154+
for (size_t idx = 0; idx < worldSize; ++idx) {
155+
for (size_t link = 0; link < kMaxDevices; ++link) {
156+
if(idx == link) continue;
157+
158+
bool conn = false;
159+
auto ret = rsmi_is_P2P_accessible(idx, link, &conn);
160+
if (ret != RSMI_STATUS_SUCCESS){
161+
LOG(ERROR) << "IntraNodeComm: getNvlMesh: rsmi_is_P2P_accessible returned error ret=" << ret;
162+
return {};
163+
}
164+
165+
if (conn){
166+
nvlMesh[idx][link] += 1;
167+
}
168+
}
169+
}
170+
return nvlMesh;
150171
#endif
151172
}
152173

@@ -272,7 +293,6 @@ bool IntraNodeComm::rendezvous() {
272293
if (isInitialized_) {
273294
return true;
274295
}
275-
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
276296
if (!isIntraNodeCommSupported() || worldSize_ < 2 ||
277297
worldSize_ > kMaxDevices) {
278298
return false;
@@ -289,12 +309,28 @@ bool IntraNodeComm::rendezvous() {
289309

290310
DevInfo devInfo{};
291311
gethostname(devInfo.hostname, sizeof(devInfo.hostname));
312+
313+
#if defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
314+
auto ret = rsmi_init(0);
315+
if (ret != RSMI_STATUS_SUCCESS) {
316+
LOG(ERROR) << "IntraNodeComm:: rendezvous failed in rsmi_init, ret=" << ret;
317+
return false;
318+
}
319+
#endif
320+
292321
cudaDeviceProp prop{};
293322
AT_CUDA_CHECK(cudaGetDeviceProperties(&prop, deviceIdx));
323+
324+
#if defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
325+
auto pci_format = "%08X:%02X:%02X.0";
326+
#else
327+
auto pci_format = NVML_DEVICE_PCI_BUS_ID_FMT;
328+
#endif
329+
294330
snprintf(
295331
devInfo.busId,
296332
sizeof(devInfo.busId),
297-
NVML_DEVICE_PCI_BUS_ID_FMT,
333+
pci_format,
298334
prop.pciDomainID,
299335
prop.pciBusID,
300336
prop.pciDeviceID);
@@ -344,8 +380,6 @@ bool IntraNodeComm::rendezvous() {
344380
buffersDev_ = symmetricMemory_->get_buffer_ptrs_dev();
345381
topoInfo_ = topoInfo;
346382
return true;
347-
#endif
348-
return false;
349383
}
350384

351385
} // namespace c10d::intra_node_comm

torch/csrc/distributed/c10d/intra_node_comm.cu

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
#include <ATen/cuda/CUDAContext.h>
55
#include <c10/cuda/CUDAGuard.h>
66

7+
#if defined(USE_ROCM)
8+
#include <hip/amd_detail/amd_hip_bf16.h>
9+
#include <hip/amd_detail/amd_hip_atomic.h>
10+
#include <hip/amd_detail/hip_ldg.h>
11+
#endif
12+
713
namespace c10d {
814
namespace intra_node_comm {
915

@@ -17,7 +23,7 @@ static constexpr size_t kOneShotThreshBytes = 256 * 1024;
1723
static constexpr size_t kTwoShotThreshBytes = 10 * 1024 * 1024;
1824

1925
#if defined(USE_ROCM)
20-
using __nv_bfloat162 = uint32_t;
26+
using __nv_bfloat162 = __hip_bfloat162;
2127
#endif
2228

2329
struct __align__(16) bf16x8 {
@@ -28,10 +34,7 @@ struct __align__(16) bf16x8 {
2834

2935
DEVICE_INLINE __nv_bfloat162
3036
bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
31-
#if defined(USE_ROCM)
32-
CUDA_KERNEL_ASSERT(false);
33-
return 0;
34-
#elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
37+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
3538
CUDA_KERNEL_ASSERT(false);
3639
__nv_bfloat162 res;
3740
return res;
@@ -70,8 +73,12 @@ DEVICE_INLINE bf16x8 add_bf16x8(bf16x8 a, bf16x8 b) {
7073
*/
7174
template <typename T>
7275
DEVICE_INLINE void streamLoad128(bf16x8& val, const T* addr) {
73-
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
76+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
7477
CUDA_KERNEL_ASSERT(false);
78+
#elif defined(USE_ROCM)
79+
ulonglong2 l_val = __ldg(reinterpret_cast<const ulonglong2*>(addr));
80+
reinterpret_cast<unsigned long long*>(&val)[0] = l_val.data[0];
81+
reinterpret_cast<unsigned long long*>(&val)[1] = l_val.data[1];
7582
#else
7683
unsigned long long int low, high;
7784
asm("ld.global.nc.v2.u64 {%0, %1}, [%2];"
@@ -83,8 +90,13 @@ DEVICE_INLINE void streamLoad128(bf16x8& val, const T* addr) {
8390
}
8491

8592
__device__ inline void streamStore128(at::BFloat16* addr, const bf16x8& val) {
86-
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
93+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
8794
CUDA_KERNEL_ASSERT(false);
95+
#elif defined(USE_ROCM)
96+
for (int i = 0; i < 8; i++)
97+
{
98+
addr[i] = reinterpret_cast<const at::BFloat16*>(&val)[i];
99+
}
88100
#else
89101
unsigned long long int low, high;
90102
low = reinterpret_cast<const unsigned long long int*>(&val)[0];
@@ -104,15 +116,16 @@ DEVICE_INLINE void store128(T* addr, const bf16x8& val) {
104116
}
105117

106118
DEVICE_INLINE void releaseSignal(uint32_t* addr) {
107-
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
119+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
108120
CUDA_KERNEL_ASSERT(false);
109121
#else
110122
atomicAdd_system(addr, 1);
123+
__threadfence_system();
111124
#endif
112125
}
113126

114127
DEVICE_INLINE void acquireSignal(uint32_t* addr) {
115-
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
128+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
116129
CUDA_KERNEL_ASSERT(false);
117130
#else
118131
volatile uint32_t* signal = addr;
@@ -473,7 +486,7 @@ static void getLaunchConfig(
473486
}
474487

475488
bool isIntraNodeCommSupported() {
476-
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
489+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
477490
return false;
478491
#else
479492
return true;

0 commit comments

Comments
 (0)