Commit 96be048

cyyever authored and pytorchmergebot committed
[1/N] Avoid copy in std::get (pytorch#141812)
Fixes #ISSUE_NUMBER
Pull Request resolved: pytorch#141812
Approved by: https://github.com/Skylion007
1 parent c2fa544 commit 96be048

File tree

18 files changed: +138 -202 lines changed

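The theme of the patch: calling `std::get<I>` on a named tuple yields a reference, but storing the result in a plain `auto` variable copies the element. The hunks below replace such copies with structured bindings, reference bindings (`auto&`), or `std::move` of the element. A minimal standalone sketch of the three variants follows; the `Noisy` type is a hypothetical illustration, not code from this PR.

#include <iostream>
#include <tuple>
#include <utility>

// Hypothetical type that reports copies and moves, for illustration only.
struct Noisy {
  Noisy() = default;
  Noisy(const Noisy&) { std::cout << "copy\n"; }
  Noisy(Noisy&&) noexcept { std::cout << "move\n"; }
};

int main() {
  std::tuple<Noisy, int> t;

  // Before: `auto` deduces a value type, so the reference returned by
  // std::get is used to copy-construct `copied`.
  auto copied = std::get<0>(t);            // prints "copy"

  // After, option 1: bind a reference; no new Noisy is created.
  auto& ref = std::get<0>(t);              // prints nothing

  // After, option 2: structured bindings name the elements in place.
  auto& [n, i] = t;                        // prints nothing

  // After, option 3: move the element out once the tuple is no longer needed.
  auto moved = std::move(std::get<0>(t));  // prints "move"

  (void)copied; (void)ref; (void)n; (void)i; (void)moved;
  return 0;
}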

aten/src/ATen/functorch/BatchRulesBinaryOps.cpp

Lines changed: 11 additions & 16 deletions
@@ -14,18 +14,15 @@
 namespace at::functorch {
 
 template <typename F, F Func, typename... ExtraArgs>
-std::tuple<Tensor, std::optional<int64_t>> _binary_pointwise_batch_rule(
+static Tensor _binary_pointwise_batch_rule(
     const Tensor& tensor, std::optional<int64_t> tensor_batch_dim,
     const Tensor& other, std::optional<int64_t> other_batch_dim,
     ExtraArgs... extra_args) {
 
-  auto tensor_other = _binary_pointwise_helper(
+  auto [tensor_, other_] = _binary_pointwise_helper(
       tensor, tensor_batch_dim, other, other_batch_dim);
-  auto tensor_ = std::get<0>(tensor_other);
-  auto other_ = std::get<1>(tensor_other);
 
-  auto result = Func(tensor_, other_, std::forward<ExtraArgs>(extra_args)...);
-  return std::make_tuple(result, 0);
+  return Func(tensor_, std::move(other_), std::forward<ExtraArgs>(extra_args)...);
 }
 
 template <typename A, A a, typename C>
@@ -37,9 +34,9 @@ struct BinaryPointwiseBatchRuleHelper<F, Func, typelist<T1, T2, T...>> {
       const Tensor& tensor, std::optional<int64_t> tensor_batch_dim,
       const Tensor& other, std::optional<int64_t> other_batch_dim,
       T... extra_args) {
-    return _binary_pointwise_batch_rule<F, Func, T...>(
+    return std::tuple(_binary_pointwise_batch_rule<F, Func, T...>(
         tensor, tensor_batch_dim, other, other_batch_dim,
-        std::forward<T>(extra_args)...);
+        std::forward<T>(extra_args)...), 0);
   }
 };
 
@@ -82,7 +79,7 @@ struct BinaryRandomPointwiseBatchRuleHelper<F, Func, typelist<T1, T2, T...>> {
     auto res = _binary_pointwise_batch_rule<F, Func, T...>(
         tensor_value, tensor_bdim, other_value, other_bdim,
         std::forward<T>(extra_args)...);
-    return makeBatched(std::get<0>(res), std::get<1>(res), cur_level);
+    return makeBatched(std::move(res), 0, cur_level);
   }
 };
 
@@ -93,7 +90,7 @@
     c10::guts::function_traits<decltype(fn)>::parameter_types>::apply)
 
 template <typename M, M Meth, typename... ExtraArgs>
-void binary_pointwise_inplace_batch_rule(
+static void binary_pointwise_inplace_batch_rule(
     Tensor& tensor, std::optional<int64_t> tensor_batch_dim,
     const Tensor& other, std::optional<int64_t> other_batch_dim,
     ExtraArgs... extra_args) {
@@ -120,7 +117,7 @@ void binary_pointwise_inplace_batch_rule(
 }
 
 template <typename F, F Func>
-std::tuple<Tensor, std::optional<int64_t>> comparison_pointwise_batch_rule(
+static std::tuple<Tensor, std::optional<int64_t>> comparison_pointwise_batch_rule(
     const Tensor& tensor, std::optional<int64_t> tensor_batch_dim,
     const Tensor& other, std::optional<int64_t> other_batch_dim) {
   // compute max logical rank
@@ -165,9 +162,7 @@ static std::tuple<Tensor, std::optional<int64_t>> gelu_backward_batch_rule(
     c10::string_view approximate) {
 
   // repeat the preprocessing from _binary_pointwise_batch_rule
-  const auto tensor_other = _binary_pointwise_helper(grad_out, grad_out_bdim, input, input_bdim);
-  auto grad_out_ = std::get<0>(tensor_other);
-  auto input_ = std::get<1>(tensor_other);
+  auto [grad_out_, input_] = _binary_pointwise_helper(grad_out, grad_out_bdim, input, input_bdim);
 
   // gelu_backward doesn't broadcast well so we need to insist all inputs have a bdim
   const auto batch_size = get_bdim_size2(grad_out, grad_out_bdim, input, input_bdim);
@@ -243,8 +238,8 @@ static std::tuple<Tensor, std::optional<int64_t>> cdist_backward_batch_rule(
   // We need to apply the same preprocessing on x1 and x2 as in the forward pass
   // _binary_pointwise_batch_rule
   auto x12 = _binary_pointwise_helper(x1_, x1_bdim, x2, x2_bdim);
-  x1_ = std::get<0>(x12);
-  auto x2_ = std::get<1>(x12);
+  x1_ = std::move(std::get<0>(x12));
+  auto& x2_ = std::get<1>(x12);
 
   auto grad_ = moveBatchDimToFront(grad, grad_bdim);
   if ((x1_bdim || x2_bdim) && !grad_bdim) {
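
Note how the first two hunks split responsibilities: `_binary_pointwise_batch_rule` now returns the `Tensor` alone, and `BinaryPointwiseBatchRuleHelper` re-attaches the batch dimension with `std::tuple(result, 0)`. That expression relies on class template argument deduction plus `std::tuple`'s converting constructor; a hedged sketch with toy types (not PR code):

#include <cstdint>
#include <optional>
#include <tuple>

// Stand-in for the batch rule: computes the result without wrapping it.
static double rule(double x) { return x * 2.0; }

// Stand-in for the wrapper: CTAD deduces std::tuple<double, int> from
// std::tuple(rule(x), 0), which then converts to the declared return type
// std::tuple<double, std::optional<int64_t>>.
static std::tuple<double, std::optional<int64_t>> apply(double x) {
  return std::tuple(rule(x), 0);
}

int main() {
  return std::get<1>(apply(3.0)).value_or(-1) == 0 ? 0 : 1;
}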

aten/src/ATen/functorch/BatchRulesConvolution.cpp

Lines changed: 19 additions & 20 deletions
@@ -106,8 +106,7 @@ convolution_batch_rule(const Tensor& lhs, std::optional<int64_t> lhs_bdim, const
     result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), std::nullopt);
   }
   if (separate_bias) {
-    auto A = std::get<0>(result);
-    auto A_batch_dim = std::get<1>(result);
+    auto& [A, A_batch_dim] = result;
     auto B = *bias;
     auto B_batch_dim = bias_bdim;
     A = moveBatchDimToFront(A, A_batch_dim);
@@ -273,12 +272,12 @@
       const auto grad_output_ = reshape_dim_into(*grad_output_bdim, 1, grad_output);
       const auto out_ch_dim = transposed ? 1 : 0;
       const auto dummy_weight = make_dummy(weight, weight_bdim, out_ch_dim, batch_size);
-      const auto result = at::convolution_backward_symint(
+      auto result = at::convolution_backward_symint(
           grad_output_, input, dummy_weight, std::nullopt, stride, padding,
           dilation, transposed, output_padding, groups, mask);
-      auto grad_weight = std::get<1>(result);
+      auto& grad_weight = std::get<1>(result);
       grad_weight = reshape_dim_outof_symint(out_ch_dim, batch_size, grad_weight);
-      return std::make_tuple(grad_weight, out_ch_dim);
+      return std::make_tuple(std::move(grad_weight), out_ch_dim);
     } else {
       auto grad_output_ = moveBatchDimToFront(grad_output, grad_output_bdim); // BN(GO)
       grad_output_ = reshape_dim_outof_symint(2, groups, grad_output_); // BNGO
@@ -287,23 +286,23 @@
       if (!transposed) {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GBO)I
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward_symint(
+        auto result = at::convolution_backward_symint(
             grad_output_, input, dummy_weight, std::nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
-        auto grad_weight = std::get<1>(result);
+        auto& grad_weight = std::get<1>(result);
         grad_weight = grad_weight.unflatten_symint(0, { groups, batch_size, -1 }); // GBOI
         grad_weight = grad_weight.transpose(0, 1); // BGOI
         grad_weight = grad_weight.flatten(1, 2); // B(GO)I
-        return std::make_tuple(grad_weight, 0);
+        return std::make_tuple(std::move(grad_weight), 0);
       } else {
         // BN(GO), N(GI) -> N(GBO), N(GI) -> (GI)(BO)
         const auto dummy_weight = make_dummy(weight, weight_bdim, 1, batch_size);
-        const auto result = at::convolution_backward_symint(
+        auto result = at::convolution_backward_symint(
             grad_output_, input, dummy_weight, std::nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
-        auto grad_weight = std::get<1>(result);
+        auto& grad_weight = std::get<1>(result);
         grad_weight = reshape_dim_outof_symint(1, batch_size, grad_weight);
-        return std::make_tuple(grad_weight, 1);
+        return std::make_tuple(std::move(grad_weight), 1);
       }
     }
   } else if (!grad_output_bdim && input_bdim) {
@@ -314,12 +313,12 @@
       const auto input_ = reshape_dim_into(*input_bdim, 1, input);
       const auto in_ch_dim = transposed ? 0 : 1;
       const auto dummy_weight = make_dummy(weight, weight_bdim, in_ch_dim, batch_size);
-      const auto result = at::convolution_backward_symint(
+      auto result = at::convolution_backward_symint(
          grad_output, input_, dummy_weight, std::nullopt, stride, padding,
          dilation, transposed, output_padding, groups, mask);
-      auto grad_weight = std::get<1>(result);
+      auto& grad_weight = std::get<1>(result);
       grad_weight = reshape_dim_outof_symint(in_ch_dim, batch_size, grad_weight);
-      return std::make_tuple(grad_weight, in_ch_dim);
+      return std::make_tuple(std::move(grad_weight), in_ch_dim);
     } else {
       auto input_ = moveBatchDimToFront(input, input_bdim); // BN(GI)
       input_ = reshape_dim_outof_symint(2, groups, input_); // BNGI
@@ -337,23 +336,23 @@
       } else {
         // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O
         const auto dummy_weight = make_dummy(weight, weight_bdim, 0, batch_size);
-        const auto result = at::convolution_backward_symint(
+        auto result = at::convolution_backward_symint(
             grad_output, input_, dummy_weight, std::nullopt, stride, padding,
             dilation, transposed, output_padding, groups, mask);
-        auto grad_weight = std::get<1>(result);
+        auto& grad_weight = std::get<1>(result);
         grad_weight = grad_weight.unflatten_symint(0, { groups, batch_size, -1 }); // GBIO
         grad_weight = grad_weight.transpose(0, 1); // BGIO
         grad_weight = grad_weight.flatten(1, 2); // B(GI)O
-        return std::make_tuple(grad_weight, 0);
+        return std::make_tuple(std::move(grad_weight), 0);
       }
     }
   } else {
     TORCH_INTERNAL_ASSERT(weight_bdim);
     const auto dummy_weight = make_dummy(weight, weight_bdim, 0, 1);
-    const auto result = at::convolution_backward_symint(
+    auto result = at::convolution_backward_symint(
         grad_output, input, dummy_weight, std::nullopt, stride, padding,
         dilation, transposed, output_padding, groups, mask);
-    return std::make_tuple(std::get<1>(result), std::nullopt);
+    return std::make_tuple(std::move(std::get<1>(result)), std::nullopt);
 
   }
 }
@@ -424,7 +423,7 @@ static std::tuple<Tensor,Tensor,Tensor> convolution_backward_plumbing(
   Tensor grad_input;
   if (output_mask[0]) {
     c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
-    const auto result = convolution_backward_input_batch_rule(
+    auto result = convolution_backward_input_batch_rule(
         grad_output, grad_output_bdim,
         input, input_bdim,
         weight, weight_bdim,
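
A recurring detail in the hunks above: `result` loses its `const`. Both follow-up operations need a mutable tuple, since an `auto&` binding to an element of a `const` tuple would itself be `const` and could be neither reassigned nor moved from. A hedged sketch with standard-library types (not PR code):

#include <string>
#include <tuple>
#include <utility>

// Produces a tuple whose first element is expensive to copy.
std::tuple<std::string, int> produce() {
  return {std::string(64, 'x'), 1};
}

std::tuple<std::string, int> transform() {
  auto result = produce();                  // must not be const
  auto& s = std::get<0>(result);            // mutable alias, no copy
  s += "-suffix";                           // modify the element in place
  return std::make_tuple(std::move(s), 2);  // move out, no copy
}

int main() {
  return std::get<0>(transform()).size() > 64 ? 0 : 1;
}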

aten/src/ATen/functorch/BatchRulesHelper.h

Lines changed: 4 additions & 9 deletions
@@ -145,7 +145,7 @@ void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::S
     const auto& ivalue = arguments[idx];
     if (ivalue.isTensor()) {
       auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
-      tensor_inputs.emplace_back(tensor_value, tensor_bdim);
+      tensor_inputs.emplace_back(std::move(tensor_value), tensor_bdim);
       tensor_pos.push_back(static_cast<int64_t>(idx));
     }
   }
@@ -220,8 +220,7 @@ inline void find_and_unpack_tensors(
       continue;
     }
     auto unpacked = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
-    const auto& tensor_value = std::get<0>(unpacked);
-    const auto tensor_bdim = std::get<1>(unpacked);
+    const auto& [tensor_value, tensor_bdim] = unpacked;
     if (tensor_bdim.has_value()) {
       auto candidate_batch_size = tensor_value.size(*tensor_bdim);
       if (computed_batch_size == -1) {
@@ -265,13 +264,9 @@ inline void boxed_existing_bdim_all_batch_rule(
 
   // for each tensor, ensure it has a bdim and reshape it.
   for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) {
-    const auto& value = std::get<0>(tensor_inputs[tensor_idx]);
-    auto bdim = std::get<1>(tensor_inputs[tensor_idx]);
+    const auto& [value, bdim] = tensor_inputs[tensor_idx];
     auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size);
-    if (!bdim.has_value()) {
-      bdim = 0;
-    }
-    (*stack)[args_begin + tensor_pos[tensor_idx]] = reshape_dim_into(*bdim, 0, value_);
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = reshape_dim_into(bdim.value_or(0), 0, value_);
   }
 
   op.callBoxed(stack);
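
The last hunk also folds a four-line conditional into `std::optional::value_or`, which returns the contained value or the fallback without mutating the optional. A minimal sketch of that simplification (the function name is illustrative, not from the PR):

#include <cstdint>
#include <optional>

// Mirrors the change above: instead of `if (!bdim.has_value()) bdim = 0;`
// followed by `*bdim`, value_or(0) yields the value or the fallback in one
// expression and leaves the optional untouched.
int64_t dim_or_zero(std::optional<int64_t> bdim) {
  return bdim.value_or(0);
}

int main() {
  return dim_or_zero(std::nullopt) == 0 && dim_or_zero(3) == 3 ? 0 : 1;
}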
