Integrate Corstone-300 in unit tests #4015

Closed · 5 commits
26 changes: 12 additions & 14 deletions backends/arm/arm_backend.py
@@ -94,9 +94,9 @@ def tosa_compile_spec(self):
self.output_format = "tosa"
return self

def dump_intermediate_tosa(self, output_path: str):
def dump_intermediate_artifacts_to(self, output_path: str):
"""
Output intermediate .tosa file
Sets a path for dumping intermediate results during compilation, such as tosa and pte.
"""
self.path_for_intermediates = output_path
return self
@@ -131,7 +131,7 @@ def build(self):

if self.path_for_intermediates is not None:
self.compile_spec.append(
CompileSpec("debug_tosa_path", self.path_for_intermediates.encode())
CompileSpec("debug_artifact_path", self.path_for_intermediates.encode())
)

if self.permute_nhwc:
@@ -161,7 +161,7 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool:

def get_intermediate_path(compile_spec: List[CompileSpec]) -> str:
for spec in compile_spec:
if spec.key == "debug_tosa_path":
if spec.key == "debug_artifact_path":
return spec.value.decode()
return None

@@ -198,7 +198,7 @@ def generate_tosa_compile_spec(
ArmCompileSpecBuilder()
.tosa_compile_spec()
.set_permute_memory_format(permute_memory_to_nhwc)
.dump_intermediate_tosa(output_path)
.dump_intermediate_artifacts_to(output_path)
.build()
)

@@ -213,15 +213,13 @@ def preprocess( # noqa: C901
logger.info("ArmBackend::preprocess")

# if a debug/test build capture output files from TOSA stage
path = None
debug_output = False
artifact_path = None
output_format = ""
compile_flags = []
permute_memory_to_nhwc = False
for spec in compile_spec:
if spec.key == "debug_tosa_path":
path = spec.value.decode()
debug_output = True
if spec.key == "debug_artifact_path":
artifact_path = spec.value.decode()
if spec.key == "output_format":
output_format = spec.value.decode()
if spec.key == "compile_flags":
@@ -242,7 +240,7 @@

# Converted output for this subgraph, serializer needs path early as it emits
# const data directly. Path created and data written only in debug builds.
tosa_graph = ts.TosaSerializer(path)
tosa_graph = ts.TosaSerializer(artifact_path)

node_visitors = get_node_visitors(edge_program)

@@ -317,13 +315,13 @@ def preprocess( # noqa: C901
else:
# This will only happen if an unpartitioned graph is passed without
# any checking of compatibility.
dbg_fail(node, tosa_graph, path)
dbg_fail(node, tosa_graph, artifact_path)

# TODO: It would be awesome if this dump could somehow be done on top level and not here.
# Problem is that the desc.json has to be created on the tosa_graph object, which we can't
# access from top level.
if debug_output is True:
dbg_tosa_dump(tosa_graph, path)
if artifact_path is not None:
dbg_tosa_dump(tosa_graph, artifact_path)

# Serialize and return the program. While we have always produced TOSA
# output as an intermediate, some flows compile to device binaries in
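
Taken together, the hunks above rename dump_intermediate_tosa to dump_intermediate_artifacts_to and the spec key debug_tosa_path to debug_artifact_path. A minimal sketch of the resulting builder usage, assembled only from the code shown in this file (the temporary directory and the permute flag value are illustrative, not prescribed by the PR):

import tempfile

from executorch.backends.arm.arm_backend import (
    ArmCompileSpecBuilder,
    get_intermediate_path,
)

# Illustrative output directory; any writable path works.
artifact_dir = tempfile.mkdtemp(prefix="arm_tosa_")

compile_spec = (
    ArmCompileSpecBuilder()
    .tosa_compile_spec()
    .set_permute_memory_format(True)
    .dump_intermediate_artifacts_to(artifact_dir)  # formerly dump_intermediate_tosa()
    .build()
)

# The path travels in the renamed "debug_artifact_path" CompileSpec entry,
# which preprocess() reads back as artifact_path.
assert get_intermediate_path(compile_spec) == artifact_dir
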
14 changes: 10 additions & 4 deletions backends/arm/runtime/ArmBackendEthosU.cpp
@@ -124,8 +124,9 @@ class ArmBackend final : public PyTorchBackendInterface {
if (!supported) {
ET_LOG(
Error,
"Input %d expected Integer (4 byte) or Char (1 byte) integer inputs",
i);
"Input %d expected Integer (4 byte) or Char (1 byte) integer inputs, got ScalarType id %d",
i,
tensor_in.scalar_type());
return Error::InvalidProgram;
}

@@ -199,11 +200,16 @@
const char* output_addr =
handles.scratch_data + handles.outputs->io[i].offset;
// Process input EValue into scratch
int* output_address = (int*)output_addr;
// Outputs are in the index immediately after inputs
auto tensor_out = args[handles.inputs->count + i]->toTensor();
for (int j = 0; j < tensor_out.numel(); j++) {
tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
if (tensor_out.scalar_type() == ScalarType::Char) {
char* output_address = (char*)output_addr;
tensor_out.mutable_data_ptr<char>()[j] = output_address[j];
} else {
int* output_address = (int*)output_addr;
tensor_out.mutable_data_ptr<int>()[j] = output_address[j];
}
}
}

82 changes: 80 additions & 2 deletions backends/arm/test/common.py
@@ -5,10 +5,85 @@
# LICENSE file in the root directory of this source tree.

import os
import shutil
import subprocess
import tempfile

import pytest

import torch

from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

_enabled_options: list[str] = []

# ==== Pytest hooks ====


def pytest_addoption(parser):
parser.addoption("--arm_quantize_io", action="store_true")
parser.addoption("--arm_run_corstone300", action="store_true")


def pytest_configure(config):
if config.option.arm_quantize_io:
load_libquantized_ops_aot_lib()
_enabled_options.append("quantize_io")
if config.option.arm_run_corstone300:
corstone300_exists = shutil.which("FVP_Corstone_SSE-300_Ethos-U55")
if not corstone300_exists:
raise RuntimeError(
"Tests are run with --arm_run_corstone300 but corstone300 FVP is not installed."
)
_enabled_options.append("corstone300")


def pytest_collection_modifyitems(config, items):
if not config.option.arm_quantize_io:
skip_if_aot_lib_not_loaded = pytest.mark.skip(
"u55 tests can only run with quantize_io=True."
)

for item in items:
if "u55" in item.name:
item.add_marker(skip_if_aot_lib_not_loaded)


# ==== End of Pytest hooks =====


def load_libquantized_ops_aot_lib():
find_lib_cmd = [
"find",
"cmake-out-aot-lib",
"-name",
"libquantized_ops_aot_lib.so",
]
res = subprocess.run(find_lib_cmd, capture_output=True)
if res.returncode == 0:
library_path = res.stdout.decode().strip()
torch.ops.load_library(library_path)


def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool:
"""
Returns whether an option is successfully enabled, i.e. if the flag was
given to pytest and the necessary requirements are available.
Implemented options are:
- corstone300.
- quantize_io.

The optional parameter 'fail_if_not_enabled' makes the function raise
a RuntimeError instead of returning False.
"""
if option.lower() in _enabled_options:
return True
else:
if fail_if_not_enabled:
raise RuntimeError(f"Required option '{option}' for test is not enabled")
else:
return False


def get_tosa_compile_spec(permute_memory_to_nhwc=False, custom_path=None):
"""
@@ -21,16 +96,17 @@ def get_tosa_compile_spec(permute_memory_to_nhwc=False, custom_path=None):
ArmCompileSpecBuilder()
.tosa_compile_spec()
.set_permute_memory_format(permute_memory_to_nhwc)
.dump_intermediate_tosa(intermediate_path)
.dump_intermediate_artifacts_to(intermediate_path)
.build()
)
return compile_spec


def get_u55_compile_spec(permute_memory_to_nhwc=False):
def get_u55_compile_spec(permute_memory_to_nhwc=False, custom_path=None):
"""
Default compile spec for Ethos-U55 tests.
"""
artifact_path = custom_path or tempfile.mkdtemp(prefix="arm_u55_")
compile_spec = (
ArmCompileSpecBuilder()
.ethosu_compile_spec(
@@ -39,7 +115,9 @@
memory_mode="Shared_Sram",
extra_flags=None,
)
.set_quantize_io(is_option_enabled("quantize_io"))
.set_permute_memory_format(permute_memory_to_nhwc)
.dump_intermediate_artifacts_to(artifact_path)
.build()
)
return compile_spec
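
The hooks above wire two new pytest flags (--arm_quantize_io and --arm_run_corstone300) into _enabled_options, and is_option_enabled() lets individual tests query them. A minimal sketch of the intended consumption, using only names defined in this file (the launch command is an example invocation, not mandated by the PR):

# Example launch (assumption):
#   pytest backends/arm/test --arm_quantize_io --arm_run_corstone300
from executorch.backends.arm.test import common

# quantize_io is switched on inside get_u55_compile_spec() via
# is_option_enabled("quantize_io"), so the spec reflects the pytest flag.
compile_spec = common.get_u55_compile_spec(permute_memory_to_nhwc=True)

# Tests that need the FVP consult the flag; pytest_configure() has already
# verified that FVP_Corstone_SSE-300_Ethos-U55 is on PATH when it is set.
if common.is_option_enabled("corstone300"):
    print("Corstone-300 runs enabled for this session")
else:
    # Passing fail_if_not_enabled=True instead raises a RuntimeError here.
    print("Corstone-300 runs disabled; tests stop after serialization")
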
21 changes: 16 additions & 5 deletions backends/arm/test/ops/test_add.py
@@ -12,7 +12,6 @@

import torch
from executorch.backends.arm.test import common

from executorch.backends.arm.test.tester.arm_tester import ArmTester
from executorch.exir import EdgeCompileConfig
from parameterized import parameterized
@@ -24,9 +23,11 @@
class TestSimpleAdd(unittest.TestCase):
class Add(torch.nn.Module):
test_parameters = [
(torch.ones(5),),
(torch.FloatTensor([1, 2, 3, 5, 7]),),
(3 * torch.ones(8),),
(10 * torch.randn(8),),
(torch.ones(1, 1, 4, 4),),
(torch.ones(1, 3, 4, 2),),
]

def __init__(self):
Expand All @@ -38,6 +39,10 @@ def forward(self, x):

class Add2(torch.nn.Module):
test_parameters = [
(
torch.FloatTensor([1, 2, 3, 5, 7]),
(torch.FloatTensor([2, 1, 2, 1, 10])),
),
(torch.ones(1, 1, 4, 4), torch.ones(1, 1, 4, 4)),
(torch.randn(1, 1, 4, 4), torch.ones(1, 1, 4, 1)),
(torch.randn(1, 1, 4, 4), torch.randn(1, 1, 4, 1)),
@@ -95,9 +100,11 @@ def _test_add_tosa_BI_pipeline(
)

def _test_add_u55_BI_pipeline(
self, module: torch.nn.Module, test_data: Tuple[torch.Tensor]
self,
module: torch.nn.Module,
test_data: Tuple[torch.Tensor],
):
(
tester = (
ArmTester(
module,
example_inputs=test_data,
@@ -107,12 +114,16 @@ def _test_add_u55_BI_pipeline(
.export()
.check_count({"torch.ops.aten.add.Tensor": 1})
.check(["torch.ops.quantized_decomposed"])
.to_edge(config=self._edge_compile_config)
.to_edge()
.partition()
.check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
.to_executorch()
.serialize()
)

if common.is_option_enabled("corstone300"):
tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)

@parameterized.expand(Add.test_parameters)
def test_add_tosa_MI(self, test_data: torch.Tensor):
test_data = (test_data,)
3 changes: 3 additions & 0 deletions backends/arm/test/ops/test_conv.py
@@ -239,6 +239,9 @@ def forward(self, x):
testsuite_u55.remove(("2x2_3x1x40x40_nobias", conv2d_2x2_3x1x40x40_nobias))
testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1))

# Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191.
testsuite_u55.remove(("2x2_1x1x14x14_st2", conv2d_2x2_1x1x14x14_st2))


class TestConv2D(unittest.TestCase):
def _test_conv2d_tosa_MI_pipeline(
6 changes: 5 additions & 1 deletion backends/arm/test/ops/test_depthwise_conv.py
@@ -125,6 +125,9 @@
)
testsuite_u55.remove(("two_dw_conv2d", two_dw_conv2d))

# Fails when enabling CompileSpec.set_quantize_io(True). MLETORCH-191.
testsuite_u55.remove(("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1))


class TestDepthwiseConv2D(unittest.TestCase):
def _test_dw_conv2d_tosa_MI_pipeline(
@@ -190,6 +193,7 @@ def test_dw_conv2d_tosa_MI(self, test_name, model):
def test_dw_conv2d_tosa_BI(self, test_name, model):
self._test_dw_conv2d_tosa_BI_pipeline(model, model.get_inputs())

@parameterized.expand(testsuite_u55)
@parameterized.expand(testsuite_u55, skip_on_empty=True)
@unittest.expectedFailure
def test_dw_conv2d_u55_BI(self, test_name, model):
self._test_dw_conv2d_u55_BI_pipeline(model, model.get_inputs())