
Commit 80ff73f

Update base for Update on "Dont quantize the current token for attention"
Differential Revision: [D63497872](https://our.internmc.facebook.com/intern/diff/D63497872/)

[ghstack-poisoned]
2 parents bbf7b76 + 92d1d1e

115 files changed: +3,062 / -2,036 lines


.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-aec9b2ab77389967ef39bb9c10662fd0fe3e185a
+5ba404f68775bb06a1125a100687f86b6d6de6a8

.github/workflows/android-perf.yml

Lines changed: 3 additions & 7 deletions
@@ -176,8 +176,8 @@ jobs:
           fi
           echo "::endgroup::"
 
-  build-llm-demo:
-    name: build-llm-demo
+  build-benchmark-app:
+    name: build-benchmark-app
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
     with:
@@ -211,7 +211,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     needs:
       - set-parameters
-      - build-llm-demo
+      - build-benchmark-app
       - export-models
     strategy:
       matrix:
@@ -228,10 +228,6 @@ jobs:
       # This is the ARN of ExecuTorch project on AWS
      project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
      device-pool-arn: ${{ matrix.device }}
-      # Uploaded to S3 from the previous job, the name of the app comes from the project itself.
-      # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
-      # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
-      # one app+flavor that could load and run the model.
      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
      # NB: Need to set the default spec here so that it works for periodic too

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -727,7 +727,7 @@ if(EXECUTORCH_BUILD_PYBIND)
     util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
   )
   target_compile_options(util PUBLIC ${_pybind_compile_options})
-  target_link_libraries(util PRIVATE torch c10 executorch)
+  target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)
 
   # pybind portable_lib
   pybind11_add_module(portable_lib SHARED extension/pybindings/pybindings.cpp)

backends/apple/mps/setup.md

Lines changed: 2 additions & 2 deletions
@@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp
 ```
 
 ### Profiling:
-1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model.
+1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model.
 ```bash
 cd executorch
 python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b
 ```
-2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md).
+2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md).
 ```
 ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
 ```

backends/arm/test/common.py

Lines changed: 8 additions & 1 deletion
@@ -6,6 +6,7 @@
 
 import logging
 import os
+import platform
 import shutil
 import subprocess
 import sys
@@ -57,11 +58,17 @@ def pytest_collection_modifyitems(config, items):
 
 
 def load_libquantized_ops_aot_lib():
+    so_ext = {
+        "Darwin": "dylib",
+        "Linux": "so",
+        "Windows": "dll",
+    }.get(platform.system(), None)
+
     find_lib_cmd = [
         "find",
         "cmake-out-aot-lib",
         "-name",
-        "libquantized_ops_aot_lib.so",
+        f"libquantized_ops_aot_lib.{so_ext}",
     ]
     res = subprocess.run(find_lib_cmd, capture_output=True)
     if res.returncode == 0:
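For context on the change above: `platform.system()` reports the host OS name, which the new dict uses to pick the shared-library suffix. A minimal standalone sketch of the same idea (the helper name and the `.so` fallback are illustrative, not part of this commit, which falls back to `None`):

```python
import platform


def shared_lib_filename(base: str) -> str:
    """Pick the shared-library suffix for the host OS (hypothetical helper)."""
    ext = {
        "Darwin": "dylib",  # macOS
        "Linux": "so",
        "Windows": "dll",
    }.get(platform.system(), "so")  # illustrative fallback for unknown platforms
    return f"{base}.{ext}"


# e.g. "libquantized_ops_aot_lib.so" on Linux
print(shared_lib_filename("libquantized_ops_aot_lib"))
```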

backends/arm/test/models/test_mobilenet_v2_arm.py

Lines changed: 7 additions & 2 deletions
@@ -100,11 +100,11 @@ def test_mv2_u55_BI(self):
         )
         if common.is_option_enabled("corstone300"):
             tester.run_method_and_compare_outputs(
-                atol=1.0, qtol=1, inputs=self.model_inputs
+                atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-300"
             )
 
     def test_mv2_u85_BI(self):
-        (
+        tester = (
             ArmTester(
                 self.mv2,
                 example_inputs=self.model_inputs,
@@ -116,4 +116,9 @@ def test_mv2_u85_BI(self):
             .check(list(self.operators_after_quantization))
             .partition()
             .to_executorch()
+            .serialize()
         )
+        if common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(
+                atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-320"
+            )

backends/arm/test/ops/test_add.py

Lines changed: 16 additions & 4 deletions
@@ -137,16 +137,22 @@ def test_add_u55_BI(self, test_data: torch.Tensor):
             test_data,
         )
         if common.is_option_enabled("corstone300"):
-            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
+            tester.run_method_and_compare_outputs(
+                qtol=1, inputs=test_data, target_board="corstone-300"
+            )
 
     @parameterized.expand(Add.test_parameters)
     def test_add_u85_BI(self, test_data: torch.Tensor):
         test_data = (test_data,)
-        self._test_add_ethos_BI_pipeline(
+        tester = self._test_add_ethos_BI_pipeline(
             self.Add(),
             common.get_u85_compile_spec(permute_memory_to_nhwc=True),
             test_data,
         )
+        if common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(
+                qtol=1, inputs=test_data, target_board="corstone-320"
+            )
 
     @parameterized.expand(Add2.test_parameters)
     def test_add2_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
@@ -165,11 +171,17 @@ def test_add2_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
             self.Add2(), common.get_u55_compile_spec(), test_data
         )
         if common.is_option_enabled("corstone300"):
-            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
+            tester.run_method_and_compare_outputs(
+                qtol=1, inputs=test_data, target_board="corstone-300"
+            )
 
     @parameterized.expand(Add2.test_parameters)
     def test_add2_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
         test_data = (operand1, operand2)
-        self._test_add_ethos_BI_pipeline(
+        tester = self._test_add_ethos_BI_pipeline(
             self.Add2(), common.get_u55_compile_spec(), test_data
         )
+        if common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(
+                qtol=1, inputs=test_data, target_board="corstone-320"
+            )

backends/arm/test/ops/test_conv_combos.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@
 
 from typing import Tuple
 
+import pytest
+
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
@@ -311,6 +313,8 @@ def test_block_bottleneck_residual_tosa_MI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs())
 
+    # TODO: Investigate flakyness (MLTORCH-307)
+    @pytest.mark.flaky(reruns=3)
     def test_block_bottleneck_residual_tosa_BI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs())
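A `flaky` marker taking a `reruns` argument is what the pytest-rerunfailures plugin provides, which is presumably what this file relies on (the diff itself only imports `pytest`). A minimal sketch of how such a marker behaves, under that assumption:

```python
# Minimal sketch, assuming the pytest-rerunfailures plugin is installed;
# it registers pytest.mark.flaky with a `reruns` (and `reruns_delay`) kwarg.
import random

import pytest


@pytest.mark.flaky(reruns=3)  # retry up to 3 times before reporting a failure
def test_occasionally_noisy():
    # Stand-in for a nondeterministic quantized-model comparison.
    assert random.random() > 0.1
```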

backends/arm/test/ops/test_split.py

Lines changed: 4 additions & 1 deletion
@@ -124,8 +124,11 @@ def test_split_with_sizes_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitWithSizes(), test_data)
 
     @parameterized.expand(Split.test_data)
-    def test_split_n_out_tosa_MI(self, test_data: test_data_t):
+    def test_split_one_out_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitSingleOut(), test_data)
+
+    @parameterized.expand(Split.test_data)
+    def test_split_two_out_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitTwoOut(), test_data)
 
     @parameterized.expand(Split.test_data)
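As background on the split-up tests: `parameterized.expand` (from the `parameterized` package, which the file already uses) generates one test case per data tuple, so each variant above runs once per entry in `Split.test_data`. A self-contained sketch of the mechanism, with hypothetical data:

```python
import unittest

from parameterized import parameterized


class SplitLikeTests(unittest.TestCase):
    # One generated test per tuple; the first element names the case.
    @parameterized.expand([("one_out", 1), ("two_out", 2)])
    def test_split(self, _name: str, num_outputs: int):
        self.assertGreaterEqual(num_outputs, 1)
```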

backends/arm/test/runner_utils.py

Lines changed: 70 additions & 31 deletions
@@ -177,6 +177,7 @@ def __init__(
         self.qp_input: list[QuantizationParams] = None
         self.qp_output: QuantizationParams = None
         self.timeout = 120
+        self.target_board: str = None
 
         self._has_init_run = False
 
@@ -185,11 +186,17 @@ def init_run(
         exported_program: ExportedProgram,
         edge_program: ExportedProgram,
         is_quantized: bool,
+        target_board: str,
     ):
+
+        if target_board not in ["corstone-300", "corstone-320"]:
+            raise RuntimeError(f"Unknown target board: {target_board}")
+
         self.input_names = _get_input_names(edge_program)
         self.output_node = _get_output_node(exported_program)
         self.output_name = self.output_node.name
         self.is_quantized = is_quantized
+        self.target_board = target_board
 
         if is_quantized:
             self.qp_input = _get_input_quantization_params(exported_program)
@@ -205,7 +212,7 @@ def init_run(
     def set_timeout(self, timeout: int):
         self.timeout = timeout
 
-    def run_corstone300(
+    def run_corstone(
         self,
         inputs: Tuple[torch.Tensor],
     ) -> list[torch.Tensor]:
@@ -231,7 +238,7 @@ def run_corstone300(
         )
         elf_path = os.path.join(
             "cmake-out",
-            "arm_semihosting_executor_runner_corstone-300",
+            f"arm_semihosting_executor_runner_{self.target_board}",
             "arm_executor_runner",
         )
         assert os.path.exists(
@@ -242,32 +249,66 @@ def run_corstone300(
         for input_path in input_paths:
             cmd_line += f" -i {input_path}"
 
-        command_args = [
-            "FVP_Corstone_SSE-300_Ethos-U55",
-            "-C",
-            "ethosu.num_macs=128",
-            "-C",
-            "mps3_board.visualisation.disable-visualisation=1",
-            "-C",
-            "mps3_board.telnetterminal0.start_telnet=0",
-            "-C",
-            "mps3_board.uart0.out_file='-'",
-            "-C",
-            "cpu0.CFGITCMSZ=11",
-            "-C",
-            "cpu0.semihosting-enable=1",
-            "-C",
-            "cpu0.semihosting-stack_base=0",
-            "-C",
-            "cpu0.semihosting-heap_limit=0",
-            "-C",
-            f"cpu0.semihosting-cmd_line='{cmd_line}'",
-            "-a",
-            elf_path,
-            "--timelimit",
-            f"{self.timeout}",
-        ]
-        result = _run_cmd(command_args, check=False)
+        command_args = {
+            "corstone-300": [
+                "FVP_Corstone_SSE-300_Ethos-U55",
+                "-C",
+                "ethosu.num_macs=128",
+                "-C",
+                "mps3_board.visualisation.disable-visualisation=1",
+                "-C",
+                "mps3_board.telnetterminal0.start_telnet=0",
+                "-C",
+                "mps3_board.uart0.out_file='-'",
+                "-C",
+                "cpu0.CFGITCMSZ=11",
+                "-C",
+                "cpu0.semihosting-enable=1",
+                "-C",
+                "cpu0.semihosting-stack_base=0",
+                "-C",
+                "cpu0.semihosting-heap_limit=0",
+                "-C",
+                f"cpu0.semihosting-cmd_line='{cmd_line}'",
+                "-a",
+                elf_path,
+                "--timelimit",
+                f"{self.timeout}",
+            ],
+            "corstone-320": [
+                "FVP_Corstone_SSE-320",
+                "-C",
+                "mps4_board.subsystem.ethosu.num_macs=128",
+                "-C",
+                "mps4_board.visualisation.disable-visualisation=1",
+                "-C",
+                "mps4_board.telnetterminal0.start_telnet=0",
+                "-C",
+                "mps4_board.uart0.out_file='-'",
+                "-C",
+                "mps4_board.uart0.unbuffered_output=1",
+                "-C",
+                "mps4_board.uart0.shutdown_on_eot=1",
+                "-C",
+                "mps4_board.subsystem.cpu0.semihosting-enable=1",
+                "-C",
+                "mps4_board.subsystem.cpu0.semihosting-stack_base=0",
+                "-C",
+                "mps4_board.subsystem.cpu0.semihosting-heap_limit=0",
+                "-C",
+                f"mps4_board.subsystem.cpu0.semihosting-cmd_line='{cmd_line}'",
+                "-a",
+                elf_path,
+                "--timelimit",
+                f"{self.timeout}",
+            ],
+        }
+
+        result = _run_cmd(command_args[self.target_board], check=False)
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Failed to run {command_args[self.target_board]}\nError: {result.stderr.decode()}"
+            )
         result_stdout = result.stdout.decode()
 
         error_regex = r"(^[EF][: ].*$)|(^.*Hard fault.*$)|(^.*Assertion.*$)"
@@ -276,10 +317,8 @@ def run_corstone300(
         # regex to check for error or fault messages in stdout from FVP
         if re.compile(error_regex, re.MULTILINE).search(result_stdout):
             raise RuntimeError(
-                f"Corstone simulation failed, log: \n {result_stdout}\n{result.stderr.decode()}"
+                f"Corstone simulation failed:\ncmd: {command_args[self.target_board]}\n, log: \n {result_stdout}\n{result.stderr.decode()}"
             )
-        elif "E [" in result_stdout:
-            logger.error(result_stdout)
 
         tosa_ref_output = np.fromfile(out_path_with_suffix, dtype=np.float32)
         output_shape = self.output_node.args[0][0].meta["val"].shape
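The rewritten runner keys the full FVP invocation on `self.target_board`. A stripped-down sketch of the same dispatch pattern, as a hypothetical standalone helper (the real code above also wires up semihosting, UART, and MAC-count options):

```python
import subprocess

# Board name -> Fixed Virtual Platform (FVP) binary, mirroring the dict above.
FVP_BINARIES = {
    "corstone-300": "FVP_Corstone_SSE-300_Ethos-U55",
    "corstone-320": "FVP_Corstone_SSE-320",
}


def run_on_fvp(target_board: str, elf_path: str, timeout_s: int = 120) -> str:
    """Run an ELF on the simulator selected by target_board (hypothetical helper)."""
    if target_board not in FVP_BINARIES:
        raise RuntimeError(f"Unknown target board: {target_board}")
    cmd = [FVP_BINARIES[target_board], "-a", elf_path, "--timelimit", str(timeout_s)]
    result = subprocess.run(cmd, capture_output=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"Failed to run {cmd}\nError: {result.stderr.decode()}")
    return result.stdout.decode()
```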

backends/arm/test/tester/arm_tester.py

Lines changed: 8 additions & 2 deletions
@@ -98,7 +98,7 @@ def __init__(self, runner_util: RunnerUtil, timeout: int = 1):
         self.runner.set_timeout(timeout)
 
     def run_artifact(self, inputs):
-        return self.runner.run_corstone300(inputs)
+        return self.runner.run_corstone(inputs)
 
     def dump_artifact(self, path_to_dump: Optional[str]):
         if not path_to_dump:
@@ -226,6 +226,7 @@ def run_method_and_compare_outputs(
         self,
         inputs: Optional[Tuple[torch.Tensor]] = None,
         stage: Optional[str] = None,
+        target_board: Optional[str] = "corstone-300",
         num_runs=1,
         atol=1e-03,
         rtol=1e-03,
@@ -260,7 +261,12 @@ def run_method_and_compare_outputs(
         edge_program = self.stages[
             self.stage_name(tester.ToEdge)
         ].artifact.exported_program()
-        self.runner_util.init_run(exported_program, edge_program, is_quantized)
+        self.runner_util.init_run(
+            exported_program,
+            edge_program,
+            is_quantized,
+            target_board,
+        )
 
         if is_quantized:
             reference_stage = self.stages[self.stage_name(tester.Quantize)]
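Because `target_board` defaults to `"corstone-300"`, existing callers keep their behavior; only the Ethos-U85 tests opt into the new FVP. A hedged call-site sketch, where `tester` and `test_data` are assumed to be built as in the test files above:

```python
# `tester` is an ArmTester pipeline ending in .serialize(), and `test_data`
# is a tuple of torch.Tensors, both constructed as in the tests above.
tester.run_method_and_compare_outputs(
    qtol=1,
    inputs=test_data,
    target_board="corstone-320",  # default is "corstone-300"
)
```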

backends/qualcomm/TARGETS

Lines changed: 13 additions & 0 deletions
@@ -1,5 +1,18 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load(":targets.bzl", "define_common_targets")
 
 oncall("executorch")
 
 define_common_targets()
+
+runtime.python_library(
+    name = "preprocess",
+    srcs = ["qnn_preprocess.py"],
+    visibility = [
+        "//executorch/backends/qualcomm/...",
+        "@EXECUTORCH_CLIENTS",
+    ],
+    deps = [
+        "//executorch/backends/qualcomm/passes:passes",
+    ],
+)
