
Commit ae268ac

Fix channel last 3d support for batch_norm (#642)
1 parent a61732e commit ae268ac
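For context, here is a minimal sketch of the user-facing case this commit fixes: BatchNorm3d applied to a 5D (NCDHW) input held in channels_last_3d memory format, with the layout preserved through forward and backward. It assumes intel_extension_for_pytorch is installed and imported so the extension's CPU batch_norm kernels are dispatched; the shapes are arbitrary.

import torch
import torch.nn as nn
import intel_extension_for_pytorch  # noqa: F401  (registers the extension's CPU kernels)

# 5D NCDHW input converted to channels_last_3d; sizes are arbitrary.
m = nn.BatchNorm3d(10)
x = torch.randn(3, 10, 25, 25, 25).to(memory_format=torch.channels_last_3d).requires_grad_()

y = m(x)
y.mean().backward()

# With this fix, the channels-last-3d layout is preserved for both the
# output and the input gradient instead of falling back to a contiguous path.
assert y.is_contiguous(memory_format=torch.channels_last_3d)
assert x.grad.is_contiguous(memory_format=torch.channels_last_3d)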

3 files changed: 62 additions, 37 deletions

intel_extension_for_pytorch/csrc/aten/cpu/Normalization.cpp

Lines changed: 5 additions & 2 deletions
@@ -68,7 +68,8 @@ struct Var {
 };
 
 static inline bool is_contiguous(const at::Tensor& t) {
-  return t.is_contiguous() || t.is_contiguous(at::MemoryFormat::ChannelsLast);
+  return t.is_contiguous() || t.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+      t.is_contiguous(at::MemoryFormat::ChannelsLast3d);
 }
 
 // For some ambiguous cases, it is possible a channels last contiguous
@@ -78,7 +79,9 @@ static inline bool is_contiguous(const at::Tensor& t) {
 static inline at::MemoryFormat suggest_memory_format_contig(
     const at::Tensor& t) {
   return t.is_contiguous() ? at::MemoryFormat::Contiguous
-                           : at::MemoryFormat::ChannelsLast;
+                           : (t.is_contiguous(at::MemoryFormat::ChannelsLast3d)
+                                  ? at::MemoryFormat::ChannelsLast3d
+                                  : at::MemoryFormat::ChannelsLast);
 }
 
 template <typename scalar_t, typename param_t>
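The ChannelsLast3d branch added to suggest_memory_format_contig matters because a tensor can report as contiguous under more than one memory format at once (for example when the channel dimension has size 1), so the helper must pick a single layout deterministically. Below is a rough Python analogue of that selection logic using stock PyTorch tensors; the Python function name simply mirrors the C++ helper above and is not part of any public API.

import torch

def suggest_memory_format_contig(t: torch.Tensor) -> torch.memory_format:
    # Mirrors the C++ helper: prefer plain contiguous, then
    # channels-last-3d, and fall back to channels-last.
    if t.is_contiguous():
        return torch.contiguous_format
    if t.is_contiguous(memory_format=torch.channels_last_3d):
        return torch.channels_last_3d
    return torch.channels_last

# Ambiguous case: with C == 1 the same strides satisfy both the default
# and the channels-last-3d contiguity checks.
x = torch.randn(2, 1, 4, 4, 4)
print(x.is_contiguous())                                      # True
print(x.is_contiguous(memory_format=torch.channels_last_3d))  # True
print(suggest_memory_format_contig(x))                        # torch.contiguous_format

# An unambiguous channels-last-3d tensor resolves to ChannelsLast3d.
y = torch.randn(2, 3, 4, 4, 4).to(memory_format=torch.channels_last_3d)
print(suggest_memory_format_contig(y))                        # torch.channels_last_3d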

intel_extension_for_pytorch/csrc/aten/cpu/kernels/NormalizationKrnl.cpp

Lines changed: 9 additions & 3 deletions
@@ -1267,7 +1267,9 @@ void batch_norm_cpu_kernel_impl(
               eps);
         }
       });
-  } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+  } else if (
+      input.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+      input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) {
     AT_DISPATCH_FLOATING_TYPES_AND(
         at::ScalarType::BFloat16,
         input.scalar_type(),
@@ -1338,7 +1340,9 @@ void batch_norm_cpu_collect_stats_kernel_impl(
           }
         }
       });
-  } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+  } else if (
+      input.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+      input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) {
     AT_DISPATCH_FLOATING_TYPES_AND(
         at::ScalarType::BFloat16,
         input.scalar_type(),
@@ -1445,7 +1449,9 @@ void batch_norm_cpu_backward_kernel_impl(
          }
        }
      });
-  } else if (input.is_contiguous(at::MemoryFormat::ChannelsLast)) {
+  } else if (
+      input.is_contiguous(at::MemoryFormat::ChannelsLast) ||
+      input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) {
     AT_DISPATCH_FLOATING_TYPES_AND(
         at::ScalarType::BFloat16,
         input.scalar_type(),

tests/cpu/test_cpu_ops.py

Lines changed: 48 additions & 32 deletions
@@ -13,6 +13,8 @@
 HAS_TORCHVISION = False
 skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision")
 
+bn_m = {1 : nn.BatchNorm1d, 2 : nn.BatchNorm2d, 3 : nn.BatchNorm3d}
+
 class CPUOPsTester(TestCase):
 
     def test_channelshuffle(self):
@@ -142,38 +144,52 @@ def test_pixel_shuffle_nhwc_cpu(self):
         self.assertEqual(input.grad, ref_input.grad)
 
     def test_batch_norm(self):
-        m = nn.BatchNorm2d(100)
-        x = torch.randn(20, 100, 35, 45)
-        x1 = x.clone().detach().requires_grad_()
-        y1 = m(x1)
-        y1.mean().backward()
-
-        # test channels last
-        x2 = x.clone().detach().to(memory_format=torch.channels_last).requires_grad_()
-        y2 = m(x2)
-        y2.mean().backward()
-        self.assertTrue(y2.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(y1, y2)
-        self.assertTrue(x2.grad.is_contiguous(memory_format=torch.channels_last))
-        self.assertEqual(x1.grad, x2.grad)
-
-        # test bfloat16
-        x3 = x.clone().detach().bfloat16().requires_grad_()
-        y3 = m(x3)
-        y3.mean().backward()
-        self.assertTrue(y3.dtype == torch.bfloat16)
-        self.assertEqual(y1, y3, prec=0.1)
-        self.assertTrue(x3.grad.dtype == torch.bfloat16)
-        self.assertEqual(x1.grad, x3.grad)
-
-        # test autocast
-        with torch.cpu.amp.autocast():
-            for datatype in (torch.bfloat16, torch.float32):
-                x4 = x.clone().detach().to(datatype).requires_grad_()
-                y4 = m(x4)
-                y4.mean().backward()
-                self.assertTrue(y4.dtype == datatype)
-                self.assertTrue(x4.grad.dtype == datatype)
+        for dim in [2, 3]:
+            m = bn_m[dim](10)
+            input_size = [3, 10, 25, 25]
+            if dim == 3:
+                input_size.append(25)
+            x = torch.randn(input_size)
+            x1 = x.clone().detach().requires_grad_()
+            y1 = m(x1)
+            y1.mean().backward()
+
+            # test channels last
+            suggest_memory_format = torch.channels_last if dim == 2 else torch.channels_last_3d
+            x2 = x.clone().detach().to(memory_format=suggest_memory_format).requires_grad_()
+
+            y2 = m(x2)
+            y2.mean().backward()
+            self.assertTrue(y2.is_contiguous(memory_format=suggest_memory_format))
+            self.assertEqual(y1, y2)
+            self.assertTrue(x2.grad.is_contiguous(memory_format=suggest_memory_format))
+            self.assertEqual(x1.grad, x2.grad)
+
+            # test bfloat16
+            x3 = x.clone().detach().bfloat16().requires_grad_()
+            y3 = m(x3)
+            y3.mean().backward()
+            self.assertTrue(y3.dtype == torch.bfloat16)
+            self.assertEqual(y1, y3, prec=0.1)
+            self.assertTrue(x3.grad.dtype == torch.bfloat16)
+            self.assertEqual(x1.grad, x3.grad)
+
+            # test autocast
+            with torch.cpu.amp.autocast():
+                for datatype in (torch.bfloat16, torch.float32):
+                    x4 = x.clone().detach().to(datatype).requires_grad_()
+                    y4 = m(x4)
+                    y4.mean().backward()
+                    self.assertTrue(y4.dtype == datatype)
+                    self.assertTrue(x4.grad.dtype == datatype)
+
+                    x5 = x.clone().detach().to(datatype).to(memory_format=suggest_memory_format).requires_grad_()
+                    y5 = m(x5)
+                    y5.mean().backward()
+                    self.assertTrue(y5.dtype == datatype)
+                    self.assertTrue(x5.grad.dtype == datatype)
+                    self.assertTrue(y5.is_contiguous(memory_format=suggest_memory_format))
+                    self.assertTrue(x5.grad.is_contiguous(memory_format=suggest_memory_format))
 
     def test_adaptive_avg_pool2d(self):
         m = nn.AdaptiveAvgPool2d((5,7))
