Enable NHWC batchnorm for miopen (#1400)

dnikolaev-amd · dnikolaev-amd · commit 4c94122829bb · 2024-09-16T14:38:12.000Z
* Enable batchnorm NHWC for MIOpen

* cleanup

* test to compare NHWC MIOpen batchnorm with CPU

* fix 'use_miopen' condition for nhwc miopen

* fix includes

* use native nhwc batchnorm to verify miopen

* remove extra spaces

* remove empty lines

* set PYTORCH_MIOPEN_SUGGEST_NHWC=1 for all test_nn.py tests
diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
@@ -516,6 +516,11 @@ BatchNormBackend _select_batch_norm_backend(
     return BatchNormBackend::Cudnn;
   }
 
+  // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
+  // See #64427
+  // Intentionally not static: the variable is re-read on every call so tests can change it at runtime
+  bool PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(false);
+
   if (
       input.is_cuda()
       && input.dim() <= MIOPEN_DIM_MAX
@@ -528,8 +533,8 @@ BatchNormBackend _select_batch_norm_backend(
       && (input.dim() >= 3)
       && detail::getCUDAHooks().compiledWithMIOpen()
       && cudnn_enabled
-      && input.suggest_memory_format() != MemoryFormat::ChannelsLast
-      && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d
+      && (input.suggest_memory_format() == MemoryFormat::Contiguous
+        || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC))
   ) {
     return BatchNormBackend::Miopen;
   }
@@ -609,7 +614,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
   if (backend == BatchNormBackend::Miopen) {
     return std::tuple_cat(
              at::miopen_batch_norm(
-               input.contiguous(), weight.contiguous(), bias.contiguous(),
+               input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(),
                running_mean.defined() ? running_mean.contiguous() : running_mean,
                running_var.defined() ? running_var.contiguous() : running_var,
                training, momentum, eps),
diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
@@ -100,7 +100,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
     mode = miopenBNSpatial;
   }
 
-  auto output_t = at::empty(input->sizes(), input->options());
+  auto output_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
   TensorArg output{ output_t, "output", 0 };
 
   auto handle = getMiopenHandle();
@@ -177,8 +177,10 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
   const Tensor& save_var_t =
       c10::value_or_else(save_var_t_opt, [] { return Tensor(); });
 
+  auto grad_output_contig =
+      grad_output_t.contiguous(input_t.suggest_memory_format());
   TensorArg input{ input_t, "input", 1 },
-            grad_output{ grad_output_t, "grad_output", 2 },
+            grad_output{ grad_output_contig, "grad_output", 2 },
             weight{ weight_t, "weight", 3 },
             save_mean{ save_mean_t, "save_mean", 4 },
             save_var{ save_var_t, "save_var", 5 };
@@ -193,7 +195,9 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
   }
   checkAllSameType(c, {input, grad_output});
   checkAllSameType(c, {weight, save_mean, save_var});
-  checkAllContiguous(c, {input, grad_output, save_mean, save_var});
+  checkAllContiguous(c, {save_mean, save_var});
+  TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
+  TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
   checkDimRange(c, input, 2, 6 /* exclusive */);
   checkSameSize(c, input, grad_output);
   auto num_features = input->size(1);
@@ -208,7 +212,8 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
     mode = miopenBNSpatial;
   }
 
-  auto grad_input_t  = at::empty(input->sizes(), input->options());
+  auto grad_input_t = at::empty(
+      input->sizes(), input->options(), input->suggest_memory_format());
   auto grad_weight_t = at::empty(weight->sizes(), weight->options());
   auto grad_bias_t   = at::empty(weight->sizes(), weight->options());
 
diff --git a/test/test_nn.py b/test/test_nn.py
@@ -9,6 +9,7 @@
 import warnings
 import pickle
 import re
+import os
 from copy import deepcopy
 from itertools import product
 from functools import partial
@@ -4928,6 +4929,54 @@ def run_test(input, grad_output):
         grad = grad.permute(0, 2, 1, 3)
         run_test(input, grad)
 
+    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
+    @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
+    def test_batchnorm_nhwc_miopen(self):
+        # Compares channels_last BatchNorm2d run with the cudnn flag on against the same module with it off.
+        def run_test(input, grad_output):
+            c = input.size(1)
+            mod = nn.BatchNorm2d(c).cuda().float()
+            mod.weight.data.uniform_()
+            mod.bias.data.uniform_()
+            ref_input = input.detach().clone(memory_format=torch.preserve_format).requires_grad_(True)
+            ref_grad = grad_output.detach().clone(memory_format=torch.preserve_format)
+            ref_mod = nn.BatchNorm2d(c).cuda().float()
+            ref_mod.load_state_dict(mod.state_dict())
+            out = mod(input)
+            out.backward(grad_output)
+            with torch.backends.cudnn.flags(enabled=False):  # force the native nhwc batchnorm as the reference
+                ref_out = ref_mod(ref_input)
+                ref_out.backward(ref_grad)
+            self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
+            self.assertTrue(ref_out.is_contiguous(memory_format=torch.channels_last))
+            self.assertEqual(out, ref_out)
+            self.assertEqual(mod.weight.grad, ref_mod.weight.grad)
+            self.assertEqual(mod.bias.grad, ref_mod.bias.grad)
+            self.assertEqual(input.grad, ref_input.grad)
+
+        # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
+        PYTORCH_MIOPEN_SUGGEST_NHWC = "PYTORCH_MIOPEN_SUGGEST_NHWC"
+        prev_val = os.getenv(PYTORCH_MIOPEN_SUGGEST_NHWC)
+        try:
+            os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC] = "1"
+            input = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device="cuda")
+            input = input.contiguous(memory_format=torch.channels_last).detach().requires_grad_()
+            grad = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device="cuda")
+            grad = grad.contiguous(memory_format=torch.channels_last)
+            run_test(input, grad)
+            # see #42588: grad is channels_last contiguous, but grad.suggest_memory_format (rightly) returns
+            # "contiguous", not channels_last
+            input = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device="cuda")
+            input = input.contiguous(memory_format=torch.channels_last).detach().requires_grad_()
+            grad = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device="cuda")
+            grad = grad.permute(0, 2, 1, 3)
+            run_test(input, grad)
+        finally:
+            # Restore the previous environment; pop() does not raise if the key is already absent.
+            os.environ.pop(PYTORCH_MIOPEN_SUGGEST_NHWC, None)
+            if prev_val is not None:
+                os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC] = prev_val
+
     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     def test_batchnorm_cudnn_half(self):
         # THNN
@@ -13023,4 +13072,14 @@ def __init__(self) -> None:
 
 if __name__ == '__main__':
     TestCase._default_dtype_check_enabled = True
-    run_tests()
+    # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
+    PYTORCH_MIOPEN_SUGGEST_NHWC = "PYTORCH_MIOPEN_SUGGEST_NHWC"
+    prev_val = os.getenv(PYTORCH_MIOPEN_SUGGEST_NHWC)
+    try:
+        os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC] = "1"
+        run_tests()
+    finally:
+        # pop() instead of del: a test may have removed the variable, and a KeyError here would mask the real exit.
+        os.environ.pop(PYTORCH_MIOPEN_SUGGEST_NHWC, None)
+        if prev_val is not None:
+            os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC] = prev_val