Skip to content

Commit d768940

Browse files
Add test_batchnorm_nhwc_miopen_cuda_float32 (#1561)
New tests introduced for testing NHWC and NCHW batchnorm on MIOpen : - test_batchnorm_nhwc_miopen_cuda_float32 - test_batchnorm_nchw_miopen_cuda_float32 This test verifies weight and bias gradients, running_mean and running_var We can add other dtypes later How to run: `MIOPEN_ENABLE_LOGGING_CMD=1 python -u test/test_nn.py -v -k test_batchnorm_nhwc_miopen_cuda_float32` There is a difference in running_variance for NHWC batchnorm fp32 between MIOpen and native ``` MIOPEN_ENABLE_LOGGING_CMD=1 python -u test/test_nn.py -v -k test_batchnorm_nhwc_miopen_cuda_float32 ... self.assertEqual(mod.running_var, ref_mod.running_var) AssertionError: Tensor-likes are not close! Mismatched elements: 8 / 8 (100.0%) Greatest absolute difference: 0.05455732345581055 at index (5,) (up to 1e-05 allowed) Greatest relative difference: 0.030772637575864792 at index (5,) (up to 1.3e-06 allowed) ```
1 parent bfdb3cd commit d768940

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

test/test_nn.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8238,6 +8238,64 @@ def test_affine_3d_rotateRandom(self, device):
82388238

82398239
self.assertEqual(scipy_ary, gridsample_ary.reshape_as(scipy_ary))
82408240

8241+
def batchnorm2d_miopen(self, dtype, memory_format):
    """Compare BatchNorm2d on the MIOpen path against the native CUDA kernel.

    Runs a forward + backward pass through ``nn.BatchNorm2d`` twice — once on
    the default (MIOpen-eligible) path and once with cuDNN/MIOpen disabled to
    force the native implementation — and asserts that outputs, input/weight/
    bias gradients, and the running statistics all match, and that the output
    preserves ``memory_format``.

    Args:
        dtype: tensor dtype to test (e.g. ``torch.float``).
        memory_format: ``torch.channels_last`` (NHWC) or
            ``torch.contiguous_format`` (NCHW).
    """
    def run_test(input, grad_output):
        c = input.size(1)
        mod = nn.BatchNorm2d(c).cuda().to(dtype=input.dtype)
        # Randomize affine parameters so the comparison is non-trivial.
        mod.weight.data.uniform_()
        mod.bias.data.uniform_()
        ref_input = input.detach().clone(memory_format=torch.preserve_format).requires_grad_(True)
        # BUGFIX: clone the grad_output parameter, not the enclosing-scope
        # `grad` variable — they only coincided for the existing call sites.
        ref_grad = grad_output.detach().clone(memory_format=torch.preserve_format)
        # Reference module starts from identical parameters/buffers.
        ref_mod = nn.BatchNorm2d(c).cuda().to(dtype=input.dtype)
        ref_mod.load_state_dict(mod.state_dict())
        out = mod(input)
        out.backward(grad_output)
        with torch.backends.cudnn.flags(enabled=False):  # force to use native nhwc batchnorm
            ref_out = ref_mod(ref_input)
            ref_out.backward(ref_grad)
        self.assertTrue(out.is_contiguous(memory_format=memory_format))
        self.assertTrue(ref_out.is_contiguous(memory_format=memory_format))
        self.assertEqual(out, ref_out)
        self.assertEqual(mod.weight.grad, ref_mod.weight.grad)
        self.assertEqual(mod.bias.grad, ref_mod.bias.grad)
        self.assertEqual(mod.running_mean, ref_mod.running_mean)
        self.assertEqual(mod.running_var, ref_mod.running_var)
        self.assertEqual(input.grad, ref_input.grad)

    # Integer-valued inputs keep the comparison exact in low precision.
    size = (4, 8, 2, 2)
    input = torch.randint(1, 10, size=size, dtype=dtype, device="cuda")
    input = input.contiguous(memory_format=memory_format).detach().requires_grad_()
    grad = torch.randint(1, 10, size=size, dtype=dtype, device="cuda")
    grad = grad.contiguous(memory_format=memory_format)
    run_test(input, grad)
    # see #42588, grad is channels_last contiguous, but grad.suggest_memory_format (rightly) return "contiguous"
    # not channels_last
    input = torch.randint(1, 10, (2, 8, 8, 1), dtype=dtype, device="cuda")
    input = input.contiguous(memory_format=memory_format).detach().requires_grad_()
    grad = torch.randint(1, 10, (2, 8, 8, 1), dtype=dtype, device="cuda")
    grad = grad.permute(0, 2, 1, 3)
    run_test(input, grad)
@onlyCUDA
@dtypes(torch.float)
def test_batchnorm_nhwc_miopen(self, dtype):
    """Run the NHWC (channels_last) BatchNorm2d MIOpen-vs-native comparison.

    Temporarily opts in to NHWC MIOpen batchnorm via the environment and
    restores the previous environment state afterwards, even on failure.
    """
    # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
    env_key = "PYTORCH_MIOPEN_SUGGEST_NHWC"
    saved_value = os.environ.get(env_key)
    try:
        os.environ[env_key] = "1"
        self.batchnorm2d_miopen(dtype, torch.channels_last)
    finally:
        # Restore the caller's environment exactly: drop the key if it was
        # absent before, otherwise put the original value back.
        if saved_value is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = saved_value
@onlyCUDA
@dtypes(torch.float)
def test_batchnorm_nchw_miopen(self, dtype):
    """Run the NCHW (contiguous) BatchNorm2d MIOpen-vs-native comparison."""
    self.batchnorm2d_miopen(dtype, memory_format=torch.contiguous_format)
82418299

82428300
@onlyCUDA
82438301
@dtypes(torch.float, torch.half)

0 commit comments

Comments
 (0)