Skip to content

Commit eb6d388

Browse files
committed
Update on "[ET-VK] Integrate axis mapping into staging <-> buffer transfer shaders"
## Context Building on the previous diff, this diff integrates axis mapping into staging <-> buffer transfer shaders. Alternative versions of indexing utility functions are introduced to account for axis mapping. The impact on shader latency of using axis mapping in transfer shaders is examined in the next diff. Differential Revision: [D62210117](https://our.internmc.facebook.com/intern/diff/D62210117/) [ghstack-poisoned]
2 parents 7c1ff3b + b2ab1d7 commit eb6d388

File tree

176 files changed

+2608
-773
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

176 files changed

+2608
-773
lines changed

.ci/scripts/test.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,10 @@ test_model_with_qnn() {
175175
EXPORTED_MODEL_NAME=vit_qnn.pte
176176
fi
177177

178-
"${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m SM8550 --compile_only
178+
# Use SM8450 for S22, SM8550 for S23, and SM8650 for S24
179+
QNN_CHIPSET=SM8450
180+
181+
"${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only
179182
EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME}
180183
}
181184

.github/workflows/android-perf.yml

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ on:
1515
description: Target devices to run benchmark
1616
required: false
1717
type: string
18-
default: samsung_galaxy_s2x
18+
default: samsung_galaxy_s22
1919
delegates:
2020
description: Backend delegates
2121
required: false
@@ -45,7 +45,7 @@ on:
4545
description: Target devices to run benchmark
4646
required: false
4747
type: string
48-
default: samsung_galaxy_s2x
48+
default: samsung_galaxy_s22
4949
delegates:
5050
description: Backend delegates
5151
required: false
@@ -85,7 +85,7 @@ jobs:
8585
# during scheduled runs and to provide flexibility for different defaults between
8686
# on-demand and periodic benchmarking.
8787
CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit"
88-
CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x"
88+
CRON_DEFAULT_DEVICES: "samsung_galaxy_s22"
8989
CRON_DEFAULT_DELEGATES: "xnnpack,qnn"
9090
run: |
9191
set -ex
@@ -104,7 +104,7 @@ jobs:
104104
105105
# Mapping devices to their corresponding device-pool-arn
106106
declare -A DEVICE_POOL_ARNS
107-
DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
107+
DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
108108
109109
# Resolve device names with their corresponding ARNs
110110
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -206,6 +206,10 @@ jobs:
206206
name: build-llm-demo
207207
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
208208
needs: set-parameters
209+
strategy:
210+
matrix:
211+
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
212+
fail-fast: false
209213
with:
210214
runner: linux.2xlarge
211215
docker-image: executorch-ubuntu-22.04-clang12-android
@@ -222,8 +226,14 @@ jobs:
222226
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
223227
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
224228
229+
if [[ ${{ matrix.delegate }} == "qnn" ]]; then
230+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
231+
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
232+
fi
233+
225234
# TODO: This needs to be replaced with a generic loader .apk
226235
# Build LLM Demo for Android
236+
export ANDROID_ABIS="arm64-v8a"
227237
bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
228238
229239
# Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat

.github/workflows/lint.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ jobs:
6565
script: |
6666
FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \
6767
examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \
68-
examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java)
68+
examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/*.java \
69+
extension/android/benchmark/app/src/main/java/org/pytorch/minibench/*.java)
6970
if [ -n "$FILES_NEEDS_FORMAT" ]; then
7071
echo "Warning: The following files need formatting. Please use google-java-format."
7172
echo "$FILES_NEEDS_FORMAT"

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ cmake_dependent_option(
228228
)
229229

230230
if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
231+
set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
231232
set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON)
232233
endif()
233234

backends/apple/coreml/runtime/include/coreml_backend/delegate.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class BackendDelegate;
2020
namespace torch {
2121
namespace executor {
2222

23-
class CoreMLBackendDelegate final : public PyTorchBackendInterface {
23+
class CoreMLBackendDelegate final : public ::executorch::runtime::BackendInterface {
2424
public:
2525
CoreMLBackendDelegate() noexcept;
2626
~CoreMLBackendDelegate() = default;

backends/apple/mps/runtime/MPSBackend.mm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
namespace torch {
2020
namespace executor {
2121

22-
class MPSBackend final : public PyTorchBackendInterface {
22+
class MPSBackend final : public ::executorch::runtime::BackendInterface {
2323
public:
2424
~MPSBackend() = default;
2525

backends/apple/mps/test/test_mps_utils.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -239,9 +239,7 @@ def lower_module_and_test_output(
239239
)
240240

241241
executorch_program = delegated_program.to_executorch(
242-
config=ExecutorchBackendConfig(
243-
extract_delegate_segments=False, extract_constant_segment=False
244-
)
242+
config=ExecutorchBackendConfig(extract_delegate_segments=False)
245243
)
246244
else:
247245
delegated_program = to_backend(
@@ -258,9 +256,7 @@ def lower_module_and_test_output(
258256
_skip_dim_order=True, # TODO(T182928844): Delegate dim order op to backend.
259257
),
260258
).to_executorch(
261-
config=ExecutorchBackendConfig(
262-
extract_delegate_segments=False, extract_constant_segment=False
263-
)
259+
config=ExecutorchBackendConfig(extract_delegate_segments=False)
264260
)
265261

266262
if bundled_program:

backends/arm/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Quantization:
3333
- `arm_quantizer_utils.py` - Utilities for quantization
3434

3535
Runtime:
36-
- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (PyTorchBackendInterface) for Ethos-U
36+
- `runtime/ArmBackendEthosU.cpp` - The Arm backend implementation of the ExecuTorch runtime backend (BackendInterface) for Ethos-U
3737

3838
Other:
3939
- `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U

backends/arm/TARGETS

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
2+
3+
python_library(
4+
name = "arm_partitioner",
5+
srcs = [
6+
"arm_partitioner.py",
7+
],
8+
typing = True,
9+
deps = [
10+
":arm_backend",
11+
"//executorch/backends/arm/passes:passes",
12+
"//executorch/exir:lib",
13+
],
14+
)
15+
16+
python_library(
17+
name = "arm_backend",
18+
srcs = [
19+
"arm_backend.py",
20+
],
21+
typing = True,
22+
deps = [
23+
"fbsource//third-party/pypi/flatbuffers:flatbuffers",
24+
"fbsource//third-party/pypi/ml-dtypes:ml-dtypes",
25+
"fbsource//third-party/serialization_lib/python/serializer:serializer",
26+
"fbsource//third-party/serialization_lib/python/tosa:tosa",
27+
":arm_vela",
28+
"//executorch/backends/arm/operators:lib",
29+
"//executorch/backends/arm/operators:node_visitor",
30+
"//executorch/backends/arm/passes:passes",
31+
],
32+
)
33+
34+
python_library(
35+
name = "arm_vela",
36+
srcs = [
37+
"arm_vela.py",
38+
],
39+
typing = True,
40+
deps = [
41+
"fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela",
42+
],
43+
)
44+
45+
python_library(
46+
name = "tosa_mapping",
47+
srcs = [
48+
"tosa_mapping.py",
49+
],
50+
typing = True,
51+
deps = [
52+
"fbsource//third-party/serialization_lib/python/serializer:serializer",
53+
"//caffe2:torch",
54+
],
55+
)
56+
57+
python_library(
58+
name = "tosa_quant_utils",
59+
srcs = [
60+
"tosa_quant_utils.py",
61+
],
62+
typing = True,
63+
deps = [
64+
"fbsource//third-party/pypi/numpy:numpy",
65+
"fbsource//third-party/serialization_lib/python/serializer:serializer",
66+
"fbsource//third-party/serialization_lib/python/tosa:tosa",
67+
":tosa_mapping",
68+
"//executorch/exir/dialects:lib",
69+
],
70+
)
71+
72+
python_library(
73+
name = "tosa_utils",
74+
srcs = [
75+
"tosa_utils.py",
76+
],
77+
typing = True,
78+
deps = [
79+
"fbsource//third-party/serialization_lib/python/serializer:serializer",
80+
":tosa_quant_utils",
81+
"//executorch/backends/arm/operators:node_visitor",
82+
],
83+
)

backends/arm/arm_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool:
159159
return False
160160

161161

162-
def get_intermediate_path(compile_spec: List[CompileSpec]) -> str:
162+
def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]:
163163
for spec in compile_spec:
164164
if spec.key == "debug_artifact_path":
165165
return spec.value.decode()

backends/arm/arm_vela.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55

66
import os
77
import struct
8-
import subprocess
98
import tempfile
109

1110
from typing import List
1211

1312
import numpy as np
13+
from ethosu.vela import vela
1414

1515

1616
# Pack either input or output tensor block, compose the related arrays into
@@ -38,21 +38,17 @@ def vela_compile(tosa_graph, args: List[str]):
3838
with tempfile.TemporaryDirectory() as tmpdir:
3939
tosaname = "out.tosa"
4040
flatbuffer = tosa_graph.serialize()
41-
with open(os.path.join(tmpdir, tosaname), "wb") as f:
41+
tosa_path = os.path.join(tmpdir, tosaname)
42+
with open(tosa_path, "wb") as f:
4243
f.write(flatbuffer)
4344

4445
# invoke vela
45-
vela_command = f"cd {tmpdir}; vela {' '.join(args)} {tosaname}"
46-
try:
47-
subprocess.run([vela_command], shell=True, check=True, capture_output=True)
48-
except subprocess.CalledProcessError as process_error:
49-
raise RuntimeError(
50-
f"Vela compiler ('{vela_command}') failed with error:\n \
51-
{process_error.stderr.decode()}\n \
52-
Stdout:\n{process_error.stdout.decode()}"
53-
)
54-
55-
np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
46+
output_dir = os.path.join(tmpdir, "output")
47+
args.append(f"--output-dir={output_dir}")
48+
args.append(tosa_path)
49+
vela.main(" ".join(args).split(" "))
50+
51+
np_path = os.path.join(output_dir, "out_sg0_vela.npz")
5652
blocks = b""
5753

5854
with np.load(np_path, allow_pickle=False) as data:

backends/arm/operators/TARGETS

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
2+
3+
python_library(
4+
name = "node_visitor",
5+
srcs = ["node_visitor.py"],
6+
typing = True,
7+
deps = [
8+
"//executorch/backends/arm:tosa_mapping",
9+
],
10+
)
11+
12+
python_library(
13+
name = "ops",
14+
srcs = glob(["op_*.py"]),
15+
typing = True,
16+
deps = [
17+
"fbsource//third-party/serialization_lib/python/tosa:tosa",
18+
":node_visitor",
19+
"//executorch/backends/arm:tosa_mapping",
20+
"//executorch/backends/arm:tosa_quant_utils",
21+
"//executorch/backends/arm:tosa_utils",
22+
"//executorch/exir:lib",
23+
],
24+
)
25+
26+
python_library(
27+
name = "lib",
28+
srcs = ["__init__.py"],
29+
typing = True,
30+
deps = [
31+
":node_visitor",
32+
":ops",
33+
],
34+
)

backends/arm/operators/op_bmm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def define_node(
7272
build_rescale(
7373
tosa_fb=tosa_graph,
7474
scale=final_output_scale,
75+
# pyre-ignore[61]: Uninitialized local [61]: Local variable `bmm_result` is undefined, or not always defined.
7576
input_node=bmm_result,
7677
output_name=output.name,
7778
output_type=ts.DType.INT8,

backends/arm/operators/op_conv2d.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
5-
from typing import List
5+
from typing import cast, List
66

77
import serializer.tosa_serializer as ts
88
import torch
@@ -156,11 +156,12 @@ def define_node(
156156
# integer value domain of the next op. Otherwise return float32 output.
157157
if is_quant_node:
158158
# Get scale_factor from input, weight, and output.
159-
_, input_scale, _, _, _, _ = getNodeArgs(node.args[0])
160-
_, weight_scale, _, _, _, _ = getNodeArgs(node.args[1])
159+
_, input_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[0]))
160+
_, weight_scale, _, _, _, _ = getNodeArgs(cast(torch.fx.Node, node.args[1]))
161161
_, output_scale, output_zp, _, _, _ = getNodeArgs(list(node.users)[0])
162162
build_rescale_conv_output(
163163
tosa_graph,
164+
# pyre-fixme[61]: Uninitialized local [61]: Local variable `conv2d_res` is undefined, or not always defined.
164165
conv2d_res,
165166
output.name,
166167
actual_out_type,

backends/arm/operators/op_mm.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def define_node(
9696
build_rescale(
9797
tosa_fb=tosa_graph,
9898
scale=final_output_scale,
99+
# pyre-ignore[61]: Uninitialized local [61]: Local variable `reshape_intermediate` is undefined, or not always defined.
99100
input_node=reshape_intermediate,
100101
output_name=output.name,
101102
output_type=ts.DType.INT8,

backends/arm/operators/op_mul.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6-
from typing import List
6+
from typing import cast, List
77

88
import executorch.backends.arm.tosa_quant_utils as tqutils
99
import executorch.backends.arm.tosa_utils as tutils
@@ -35,8 +35,12 @@ def define_node(
3535
if is_quant_node:
3636
input_A = inputs[0]
3737
input_B = inputs[1]
38-
input_A_qargs = tqutils.get_quant_node_args(node.args[0])
39-
input_B_qargs = tqutils.get_quant_node_args(node.args[1])
38+
input_A_qargs = tqutils.get_quant_node_args(
39+
cast(torch.fx.Node, node.args[0])
40+
)
41+
input_B_qargs = tqutils.get_quant_node_args(
42+
cast(torch.fx.Node, node.args[1])
43+
)
4044

4145
input_A.shape = tutils.tosa_shape(input_A.shape, input_A.dim_order)
4246
input_B.shape = tutils.tosa_shape(input_B.shape, input_B.dim_order)

backends/arm/operators/op_output.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
from typing import cast
7+
68
import serializer.tosa_serializer as ts
79
import torch
810

@@ -11,7 +13,7 @@ def process_output(
1113
node: torch.fx.Node,
1214
tosa_graph: ts.TosaSerializer,
1315
):
14-
for output in node.args[0]:
16+
for output in cast(tuple[torch.fx.Node, ...], node.args[0]):
1517
tosa_graph.addOutputTensor(
1618
tosa_graph.currRegion.currBasicBlock.tensors[output.name]
1719
)

0 commit comments

Comments
 (0)