Enable gesvda for ROCM >= 6.1 (#1339)

xinyazhang · dnikolaev-amd · commit e3f58635346c · 2025-04-15T21:45:27.000Z
This also fixes a problem in the gesvd driver when U and V are not needed (i.e. `compute_uv` is false). (cherry picked from commit 4ce57ec)
diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp
@@ -321,11 +321,11 @@ static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S
   // gesvd just knows how to handle m >= n, so in the other case we need to transpose A
   const auto not_A_H = A.size(-2) >= A.size(-1);
   Tensor Vcopy = V; // Shallow copy
-#ifdef USE_ROCM
+#ifdef ROCM_VERSION
   // Similar to the case in svd_magma(), experiments have shown Vh tensor is
   // not guaranteed to be column major on ROCM, we have to create a copy to
   // deal with this
-  if (!not_A_H) {
+  if (compute_uv && !not_A_H) {
     Vcopy = at::empty_like(V.mT(),
                            V.options()
                            .device(V.device())
@@ -340,8 +340,8 @@ static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S
                                        infos,
                                        full_matrices, compute_uv, calculate_all_batches, batches);
   });
-#ifdef USE_ROCM
-  if (!not_A_H) {
+#ifdef ROCM_VERSION
+  if (compute_uv && !not_A_H) {
     V.copy_(Vcopy);
   }
 #endif
@@ -515,8 +515,8 @@ static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const T
 template<typename scalar_t>
 static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
-#ifndef CUDART_VERSION
-  TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend.")
+#if !defined(CUDART_VERSION) && !(defined(ROCM_VERSION) && ROCM_VERSION >= 60100) // exact negation of the guard on the gesvdaStridedBatched wrappers in CUDASolver.cpp
+  TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend or ROCM >= 6.1.")
 #else
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
   int m = cuda_int_cast(A.size(-2), "m");
@@ -654,7 +654,7 @@ void svd_cusolver(const Tensor& A,
   static const char* check_svd_doc = "Check doc at https://pytorch.org/docs/stable/generated/torch.linalg.svd.html";
 
   // The default heuristic is to use gesvdj driver
-#ifdef USE_ROCM
+#if defined(ROCM_VERSION) && ROCM_VERSION < 60100
   const auto driver_v = std::string_view("gesvdj");
 #else
   const auto driver_v = driver.value_or("gesvdj");
diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp
@@ -470,8 +470,8 @@ void gesvdjBatched<c10::complex<double>>(
 }
 
 
-// ROCM does not implement gesdva yet
-#ifdef CUDART_VERSION
+// ROCM does not implement gesvda correctly before 6.1
+#if defined(CUDART_VERSION) || defined(ROCM_VERSION) && ROCM_VERSION >= 60100
 template<>
 void gesvdaStridedBatched_buffersize<float>(
     cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, float *A, int lda, long long int strideA,

Original file line number	Diff line number	Diff line change
`@@ -470,8 +470,8 @@ void gesvdjBatched<c10::complex<double>>(`
`470`	`470`	`}`
`471`	`471`
`472`	`472`
`473`		`-// ROCM does not implement gesdva yet`
`474`		`-#ifdef CUDART_VERSION`
	`473`	`+// ROCM does not implement gesvda correctly before 6.1`
	`474`	`+#if defined(CUDART_VERSION) \|\| defined(ROCM_VERSION) && ROCM_VERSION >= 60100`
`475`	`475`	`template<>`
`476`	`476`	`void gesvdaStridedBatched_buffersize<float>(`
`477`	`477`	`cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, float *A, int lda, long long int strideA,`