@@ -148,8 +148,9 @@ class ArmBackend final : public PyTorchBackendInterface {
       if (both_char and permuted_input_shape) {
         // permuted byte copy CHW to HWC
         permute_CHW_to_HWC(
-            scratch_addr,
             tensor_in.mutable_data_ptr<char>(),
+            scratch_addr,
+            tensor_in.size(1),
             tensor_in.size(2),
             tensor_in.size(3));
       } else if (both_char or both_int) {
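
In this first hunk, the call's first two arguments are swapped so the tensor's CHW bytes become the source and scratch_addr the destination, and the channel count tensor_in.size(1) is now passed explicitly to match the generalized permute_CHW_to_HWC(input, output, C, H, W) signature introduced in the final hunk. For a C x H x W char buffer the copy amounts to dst[(h * W + w) * C + c] = src[c * H * W + h * W + w].
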
@@ -204,13 +205,31 @@ class ArmBackend final : public PyTorchBackendInterface {
       // Process input EValue into scratch
       // Outputs are in the index immediately after inputs
       auto tensor_out = args[handles.inputs->count + i]->toTensor();
-      for (int j = 0; j < tensor_out.numel(); j++) {
-        if (tensor_out.scalar_type() == ScalarType::Char) {
-          char* output_address = (char*)output_addr;
-          tensor_out.mutable_data_ptr<char>()[j] = output_address[j];
-        } else {
-          int* output_address = (int*)output_addr;
-          tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+      bool permuted_output_shape;
+      ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute(
+          i,
+          tensor_out,
+          &handles.outputs->io[i],
+          execution_handle->permuted_io_flag,
+          &permuted_output_shape));
+      if (tensor_out.scalar_type() == ScalarType::Char and
+          permuted_output_shape) {
+        char* output_address = (char*)output_addr;
+        permute_HWC_to_CHW(
+            output_address,
+            tensor_out.mutable_data_ptr<char>(),
+            tensor_out.size(1),
+            tensor_out.size(2),
+            tensor_out.size(3));
+      } else {
+        for (int j = 0; j < tensor_out.numel(); j++) {
+          if (tensor_out.scalar_type() == ScalarType::Char) {
+            char* output_address = (char*)output_addr;
+            tensor_out.mutable_data_ptr<char>()[j] = output_address[j];
+          } else {
+            int* output_address = (int*)output_addr;
+            tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
+          }
         }
       }
     }
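
The second hunk gives outputs the same treatment inputs already receive: each output tensor is first run through check_requires_permute, and a 4-D char output flagged as permuted is converted from the NPU's HWC layout back to CHW in one pass via the new permute_HWC_to_CHW; everything else falls through to the original elementwise copy, choosing char or int by scalar type.
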
@@ -225,51 +244,62 @@ class ArmBackend final : public PyTorchBackendInterface {
  private:
   Error check_requires_permute(
       int index,
-      const exec_aten::Tensor tensor_in,
-      VelaIO* input,
+      const exec_aten::Tensor tensor,
+      VelaIO* io,
       bool permuted_io_flag,
       bool* is_permuted) const {
-    bool permuted_input_shape = false;
-    if (tensor_in.dim() == 4) {
+    bool permuted_shape = false;
+    if (tensor.dim() == 4) {
       // special case for NHWC workaround in AOT; as the compilation has
       // permuted to channel last in an undetectable way, we assume here
-      // that the application has similarly permuted any input tensors.
-      permuted_input_shape = tensor_in.size(0) == input->shape[0] &&
-          tensor_in.size(1) == input->shape[3] &&
-          tensor_in.size(2) == input->shape[1] &&
-          tensor_in.size(3) == input->shape[2];
-      if (permuted_input_shape) {
-        ET_LOG(Info, "Tensor input %d will be permuted", index);
+      // that the application has similarly permuted any input/output tensors.
+      permuted_shape = tensor.size(0) == io->shape[0] &&
+          tensor.size(1) == io->shape[3] && tensor.size(2) == io->shape[1] &&
+          tensor.size(3) == io->shape[2];
+      if (permuted_shape) {
+        ET_LOG(Info, "Tensor input/output %d will be permuted", index);
       }
-      if (permuted_io_flag != permuted_input_shape) {
-        ET_LOG(Error, "Permute compile flag and permuted input don't agree");
+      if (permuted_io_flag != permuted_shape) {
+        ET_LOG(
+            Error,
+            "Permute compile flag and permuted input/output don't agree");
         return Error::InvalidProgram;
       }
     }
-    if (!permuted_input_shape) {
+    if (!permuted_shape) {
       // Error check matching shapes in the general case
-      for (int i = 0; i < tensor_in.dim(); i++) {
-        if (tensor_in.size(i) != input->shape[i]) {
-          ET_LOG(Error, "Tensor input %d mismatched shape", index);
+      for (int i = 0; i < tensor.dim(); i++) {
+        if (tensor.size(i) != io->shape[i]) {
+          ET_LOG(Error, "Tensor input/output %d mismatched shape", index);
           ET_LOG(
               Error,
               "dimension %d mismatch, %zd != %d",
               index,
-              tensor_in.size(i),
-              input->shape[i]);
+              tensor.size(i),
+              io->shape[i]);
           return Error::InvalidProgram;
         }
       }
     }
-    *is_permuted = permuted_input_shape;
+    *is_permuted = permuted_shape;
     return Error::Ok;
   }
 
-  void permute_CHW_to_HWC(char* input, char* output, int H, int W) const {
+  void permute_CHW_to_HWC(char* input, char* output, int C, int H, int W)
+      const {
     for (int i = 0; i != H * W; ++i) {
-      output[i * 3 + 0] = input[i + 0 * W * H];
-      output[i * 3 + 1] = input[i + 1 * W * H];
-      output[i * 3 + 2] = input[i + 2 * W * H];
+      for (int j = 0; j < C; ++j) {
+        output[i * C + j] = input[i + j * W * H];
+      }
+    }
+  }
+
+  void permute_HWC_to_CHW(char* input, char* output, int C, int H, int W)
+      const {
+    for (int i = 0; i != H * W; ++i) {
+      for (int j = 0; j < C; ++j) {
+        output[i + j * W * H] = input[i * C + j];
+      }
     }
   }
 };
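
As a sanity check on the generalized helpers, here is a minimal standalone sketch; the loop bodies mirror the diff, while main(), the 2x3x4 shape, and the const/static qualifiers are illustrative additions, not part of the commit:

    #include <cassert>
    #include <cstdio>

    // Copies of the two helpers from the diff (free functions for the sketch).
    static void permute_CHW_to_HWC(const char* input, char* output, int C, int H, int W) {
      for (int i = 0; i != H * W; ++i) {
        for (int j = 0; j < C; ++j) {
          output[i * C + j] = input[i + j * W * H]; // HWC <- CHW
        }
      }
    }

    static void permute_HWC_to_CHW(const char* input, char* output, int C, int H, int W) {
      for (int i = 0; i != H * W; ++i) {
        for (int j = 0; j < C; ++j) {
          output[i + j * W * H] = input[i * C + j]; // CHW <- HWC
        }
      }
    }

    int main() {
      const int C = 2, H = 3, W = 4;
      char chw[C * H * W], hwc[C * H * W], back[C * H * W];
      for (int k = 0; k < C * H * W; ++k)
        chw[k] = static_cast<char>(k);
      permute_CHW_to_HWC(chw, hwc, C, H, W);
      permute_HWC_to_CHW(hwc, back, C, H, W);
      for (int k = 0; k < C * H * W; ++k)
        assert(back[k] == chw[k]); // the two permutations are inverses
      std::printf("round trip OK\n");
      return 0;
    }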