Skip to content

Commit 0e96f1f

Browse files
committed
CONSOLIDATED COMMITS: Enable tensorpipe with hip_basic backend
============================================================== Enable tensorpipe with hip_basic backend (#1135) * Add hip_basic tensorpipe support to PyTorch * Enabling hip_basic for Tensorpipe for PyTorch * removing upstream tensorpipe module * Adding ROCm specific tensorpipe submodule * tensorpipe submodule updated * Update the hip invalid device string * Added ignore for tensorpipe git submodule * Moved include of tensorpipe_cuda.h to hipify * Updates based on review comments * Defining the variable __HIP_PLATFORM_AMD__ * Enabling the UTs Co-authored-by: Ronak Malik <[email protected]> Update tensorpipe submodule to support ROCm 6.0
1 parent b966e44 commit 0e96f1f

File tree

6 files changed

+23
-11
lines changed

6 files changed

+23
-11
lines changed

.gitmodules

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,6 @@
8686
ignore = dirty
8787
path = third_party/fmt
8888
url = https://github.com/fmtlib/fmt.git
89-
[submodule "third_party/tensorpipe"]
90-
ignore = dirty
91-
path = third_party/tensorpipe
92-
url = https://github.com/pytorch/tensorpipe.git
9389
[submodule "third_party/cudnn_frontend"]
9490
path = third_party/cudnn_frontend
9591
url = https://github.com/NVIDIA/cudnn-frontend.git
@@ -134,3 +130,8 @@
134130
[submodule "third_party/kleidiai"]
135131
path = third_party/kleidiai
136132
url = https://git.gitlab.arm.com/kleidi/kleidiai.git
133+
[submodule "third_party/tensorpipe"]
134+
ignore = dirty
135+
path = third_party/tensorpipe
136+
url = https://github.com/ROCmSoftwarePlatform/tensorpipe.git
137+
branch = tp_rocm_60

cmake/Dependencies.cmake

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,6 +1144,14 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
11441144
set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
11451145
set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
11461146
endif()
1147+
if(USE_ROCM)
1148+
add_compile_options(-D__HIP_PLATFORM_AMD__=1)
1149+
set(TP_USE_ROCM ON CACHE BOOL "" FORCE)
1150+
set(TP_ENABLE_HIP_IPC OFF CACHE BOOL "" FORCE)
1151+
set(TP_ENABLE_HIP_XTH OFF CACHE BOOL "" FORCE)
1152+
set(TP_ENABLE_HIP_GDR OFF CACHE BOOL "" FORCE)
1153+
set(TP_ENABLE_IBV OFF CACHE BOOL "" FORCE)
1154+
endif()
11471155
set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
11481156
add_compile_options(-DTORCH_USE_LIBUV)
11491157
include_directories(BEFORE SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/tensorpipe/third_party/libuv/include)
@@ -1158,9 +1166,9 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
11581166
if(USE_CUDA)
11591167
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda)
11601168
elseif(USE_ROCM)
1161-
message(WARNING "TensorPipe doesn't yet support ROCm")
1169+
message(WARNING "TensorPipe is supported on ROCm")
11621170
# Not yet...
1163-
# list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
1171+
list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
11641172
endif()
11651173
endif()
11661174
endif()

test/run_test.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,7 @@ def __contains__(self, item):
166166
] + FSDP_TEST
167167

168168
ROCM_BLOCKLIST = [
169-
"distributed/rpc/test_faulty_agent",
170-
"distributed/rpc/test_tensorpipe_agent",
171169
"distributed/rpc/test_share_memory",
172-
"distributed/rpc/cuda/test_tensorpipe_agent",
173170
"distributed/_shard/checkpoint/test_checkpoint"
174171
"distributed/_shard/checkpoint/test_file_system_checkpoint"
175172
"distributed/_shard/sharding_spec/test_sharding_spec",

third_party/tensorpipe

torch/csrc/distributed/rpc/tensorpipe_cuda.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#include <torch/csrc/distributed/rpc/tensorpipe_agent.h>
22
#include <torch/csrc/distributed/rpc/tensorpipe_utils.h>
33

4-
#if defined(USE_TENSORPIPE) && !defined(USE_ROCM)
4+
#if defined(USE_TENSORPIPE)
55

66
#include <c10/cuda/CUDACachingAllocator.h>
77
#include <c10/cuda/CUDAGuard.h>
@@ -48,6 +48,8 @@ C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_gdr, makeCudaGdrChannel);
4848

4949
#endif
5050

51+
#if TENSORPIPE_HAS_CUDA_XTH_CHANNEL
52+
5153
std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
5254
auto context = tensorpipe::channel::cuda_xth::create();
5355
return std::make_unique<ChannelRegistration>(
@@ -57,6 +59,8 @@ std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
5759
// The cuda_xth channel supports same-process GPU-to-GPU comm
5860
C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_xth, makeCudaXthChannel);
5961

62+
#endif
63+
6064
std::unique_ptr<ChannelRegistration> makeCudaBasicChannel() {
6165
auto context = tensorpipe::channel::cuda_basic::create(
6266
tensorpipe::channel::basic::create());

torch/testing/_internal/distributed/rpc/rpc_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
skip_if_lt_x_gpu,
3333
captured_output,
3434
tp_transports,
35+
skip_if_rocm,
3536
)
3637
from torch.testing._internal.common_utils import (
3738
IS_MACOS,
@@ -5054,6 +5055,7 @@ def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank(self):
50545055

50555056
# Dynamic RPC existing ranks can communicate with new ranks using CUDA rpc
50565057
@skip_if_lt_x_gpu(2)
5058+
@skip_if_rocm
50575059
@dist_init(setup_rpc=False)
50585060
def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda(self):
50595061
initialize_pg(self.file_init_method, self.rank, self.world_size)

0 commit comments

Comments
 (0)