Commit 0e992c6

Update on "Take advantage of C++17 in scalar_type_util.h"
I generated a big ugly table because we couldn't make promoteTypes constexpr before we had C++17. Now we have C++17. Differential Revision: [D66181946](https://our.internmc.facebook.com/intern/diff/D66181946/) [ghstack-poisoned]
2 parents: f6c586c + d4cd2b2
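For context, here is a minimal sketch of the C++17 technique the commit message alludes to, using a toy ScalarType enum and promotion rule (the names and the rule below are illustrative, not the actual contents of scalar_type_util.h): the promotion logic becomes a constexpr function, and the full lookup table is generated at compile time instead of being spelled out by hand.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Toy scalar-type lattice for illustration; the real ScalarType in
// ExecuTorch has many more entries and a richer promotion rule.
enum class ScalarType : int8_t { Byte, Char, Int, Long, Float, Double, NumTypes };

constexpr std::size_t kNumTypes = static_cast<std::size_t>(ScalarType::NumTypes);

// Illustrative rule: promote to the "larger" enumerator.
constexpr ScalarType promote(ScalarType a, ScalarType b) {
  return static_cast<int8_t>(a) >= static_cast<int8_t>(b) ? a : b;
}

// Build the 2-D promotion table at compile time. Loops in constexpr
// functions are C++14, but writing through std::array's non-const
// operator[] inside a constant expression requires C++17.
constexpr std::array<std::array<ScalarType, kNumTypes>, kNumTypes>
makePromoteTable() {
  std::array<std::array<ScalarType, kNumTypes>, kNumTypes> table{};
  for (std::size_t i = 0; i < kNumTypes; ++i) {
    for (std::size_t j = 0; j < kNumTypes; ++j) {
      table[i][j] =
          promote(static_cast<ScalarType>(i), static_cast<ScalarType>(j));
    }
  }
  return table;
}

// C++17 inline variable: a single compile-time table shared across
// translation units, replacing the hand-generated one.
inline constexpr auto kPromoteTable = makePromoteTable();

static_assert(
    kPromoteTable[static_cast<std::size_t>(ScalarType::Byte)]
                 [static_cast<std::size_t>(ScalarType::Float)] ==
        ScalarType::Float,
    "Byte promotes with Float to Float");
```

The generated table costs nothing at runtime: lookups index a constexpr array, and a mistake in the rule surfaces as a static_assert failure at build time.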

File tree: 50 files changed (+7852, -619 lines)


.ci/scripts/test_llama.sh

Lines changed: 36 additions & 6 deletions

@@ -9,11 +9,41 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
-MODEL_NAME=$1 # stories110M
-BUILD_TOOL=$2 # buck2 or cmake
-DTYPE=$3 # fp16, bf16, or fp32
-MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
-UPLOAD_DIR=${5:-}
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -model)
+      MODEL_NAME="$2" # stories110M
+      shift 2
+      ;;
+    -build_tool)
+      BUILD_TOOL="$2" # buck2 or cmake
+      shift 2
+      ;;
+    -dtype)
+      DTYPE="$2" # fp16, bf16, or fp32
+      shift 2
+      ;;
+    -mode)
+      MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
+      shift 2
+      ;;
+    -upload)
+      UPLOAD_DIR="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      usage
+      ;;
+  esac
+done
+
+# Default mode to xnnpack+custom if not set
+MODE=${MODE:-"xnnpack+custom"}
+
+# Default UPLOAD_DIR to empty string if not set
+UPLOAD_DIR="${UPLOAD_DIR:-}"
+
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"

@@ -150,7 +180,7 @@ cleanup_files() {
 }
 
 prepare_artifacts_upload() {
-  if [ -n "$UPLOAD_DIR" ]; then
+  if [ -n "${UPLOAD_DIR}" ]; then
    echo "Preparing for uploading generated artifacs"
    zip -j model.zip "${EXPORTED_MODEL_NAME}" tokenizer.bin
    mkdir -p "${UPLOAD_DIR}"

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions

@@ -117,7 +117,7 @@ jobs:
       # Install requirements for export_llama
       PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
-      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" "${ARTIFACTS_DIR_NAME}"
+      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}" -upload "${ARTIFACTS_DIR_NAME}"
 
   test-llama-runner-linux-android:
     name: test-llama-runner-linux-android

@@ -393,7 +393,7 @@ jobs:
       # Install requirements for export_llama
       PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
-      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"
+      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
 
   test-phi-3-mini-runner-linux:
     name: test-phi-3-mini-runner-linux

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion

@@ -261,7 +261,7 @@ jobs:
       # Install requirements for export_llama
       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
       # Test llama2
-      PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}"
+      PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}"
 
   # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
   # test-llava-runner-macos:

.gitmodules

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 [submodule "backends/arm/third-party/ethos-u-core-driver"]
 	path = backends/arm/third-party/ethos-u-core-driver
-	url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/
+	url = https://github.com/pytorch-labs/ethos-u-core-driver-mirror
 [submodule "backends/arm/third-party/serialization_lib"]
 	path = backends/arm/third-party/serialization_lib
-	url = https://git.mlplatform.org/tosa/serialization_lib.git/
+	url = https://github.com/pytorch-labs/tosa_serialization_lib-mirror
 [submodule "backends/vulkan/third-party/Vulkan-Headers"]
 	path = backends/vulkan/third-party/Vulkan-Headers
 	url = https://github.com/KhronosGroup/Vulkan-Headers

backends/cadence/README.md

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 
 ## Supported DSPs (in progress)
 - HiFi Audio
-- ...
+- Fusion G3
 
 ## Tutorial
 

backends/cadence/aot/TARGETS

Lines changed: 159 additions & 1 deletion

@@ -39,6 +39,7 @@ python_library(
         ":passes",
         ":utils",
         ":ops_registrations",
+        ":replace_ops",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot/quantizer:fusion_pass",
         "//executorch/backends/cadence/aot/quantizer:quantizer",

@@ -74,12 +75,14 @@ python_library(
        ":utils",
        ":fuse_ops",
        ":simplify_ops",
+        ":replace_ops",
+        ":reorder_ops",
+        ":remove_ops",
        "//caffe2:torch",
        "//executorch/exir:pass_base",
        "//executorch/exir/dialects:lib",
        "//executorch/exir/passes:lib",
        "//executorch/exir/passes:spec_prop_pass",
-        "//executorch/backends/transforms:remove_clone_ops"
     ],
 )

@@ -180,6 +183,63 @@ python_library(
     ],
 )
 
+python_library(
+    name = "remove_ops",
+    srcs = [
+        "remove_ops.py",
+    ],
+    typing = True,
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:simplify_ops",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+        "//executorch/exir/dialects/edge:lib",
+        "//executorch/exir/passes:spec_prop_pass",
+        "//executorch/backends/transforms:remove_clone_ops"
+    ],
+)
+
+python_library(
+    name = "reorder_ops",
+    srcs = [
+        "reorder_ops.py",
+    ],
+    typing = True,
+    deps = [
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler_utils",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:utils",
+        "//executorch/exir:pass_base",
+        "//executorch/exir:tensor",
+        "//executorch/exir/dialects:lib",
+        "//executorch/exir/dialects/edge:lib",
+    ],
+)
+
+python_library(
+    name = "replace_ops",
+    srcs = [
+        "replace_ops.py",
+    ],
+    typing = True,
+    deps = [
+        ":pass_utils",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler_utils",
+        "//executorch/backends/cadence/aot:fuse_ops",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:remove_ops",
+        "//executorch/backends/cadence/aot:utils",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+        "//executorch/exir/dialects/edge:lib",
+        "//executorch/exir/passes:spec_prop_pass",
+    ],
+)
+
 python_unittest(
     name = "test_graph_builder",
     srcs = [

@@ -196,3 +256,101 @@ python_unittest(
         ":ops_registrations"
     ],
 )
+
+python_unittest(
+    name = "test_replace_ops_passes",
+    srcs = [
+        "tests/test_replace_ops_passes.py",
+    ],
+    supports_static_listing = False,
+    typing = True,
+    deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
+        ":compiler",
+        ":replace_ops",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler",
+        "//executorch/backends/cadence/aot:graph_builder",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+        "//executorch/exir/passes:lib",
+    ],
+)
+
+python_unittest(
+    name = "test_fusion_ops_passes",
+    srcs = [
+        "tests/test_fusion_ops_passes.py",
+    ],
+    typing = True,
+    deps = [
+        ":compiler",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler",
+        "//executorch/backends/cadence/aot:fuse_ops",
+        "//executorch/backends/cadence/aot:graph_builder",
+        "//executorch/backends/cadence/aot:ops_registrations",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/exir/dialects:lib",
+        "//executorch/exir/dialects/edge:lib",
+    ],
+)
+
+python_unittest(
+    name = "test_remove_ops_passes",
+    srcs = [
+        "tests/test_remove_ops_passes.py",
+    ],
+    supports_static_listing = False,
+    typing = True,
+    deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
+        "fbsource//third-party/pypi/pyre-extensions:pyre-extensions",
+        ":compiler",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler",
+        "//executorch/backends/cadence/aot:ops_registrations",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:remove_ops",
+        "//executorch/backends/cadence/aot/quantizer:quantizer",
+        "//executorch/exir/dialects:lib",
+    ],
+)
+
+python_unittest(
+    name = "test_simplify_ops_passes",
+    srcs = [
+        "tests/test_simplify_ops_passes.py",
+    ],
+    supports_static_listing = False,
+    typing = True,
+    deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler",
+        "//executorch/backends/cadence/aot:ops_registrations",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:simplify_ops",
+        "//executorch/exir/dialects:lib",
+    ],
+)
+
+python_unittest(
+    name = "test_reorder_ops_passes",
+    srcs = [
+        "tests/test_reorder_ops_passes.py",
+    ],
+    typing = True,
+    deps = [
+        ":compiler",
+        ":pass_utils",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot:compiler",
+        "//executorch/backends/cadence/aot:fuse_ops",
+        "//executorch/backends/cadence/aot:ops_registrations",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/backends/cadence/aot:reorder_ops",
+        "//executorch/exir/dialects:lib",
+    ],
+)

backends/cadence/aot/compiler.py

Lines changed: 21 additions & 5 deletions

@@ -12,10 +12,10 @@
 
 import executorch.backends.cadence.aot.ops_registrations # noqa
 import torch
-
-from executorch.backends.cadence.aot.passes import ReplaceSafeSoftmaxWithSoftmax
 from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
+
+from executorch.backends.cadence.aot.replace_ops import ReplaceSafeSoftmaxWithSoftmax
 from executorch.backends.cadence.aot.utils import model_gm_has_SDPA, model_is_quantized
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,

@@ -194,9 +194,6 @@ def export_to_edge(
     return edge_prog_manager
 
 
-# Export the model and lower it to an EdgeProgramManager (in edge IR), and
-# apply passes specific to Cadence DSP execution. Return both to print the
-# differences.
 def export_to_cadence(
     model: torch.nn.Module,
     inputs: tuple[object, ...],

@@ -216,6 +213,25 @@ def export_to_cadence(
     return cadence_prog_manager
 
 
+def quantize_and_export_to_cadence(
+    model: torch.nn.Module,
+    inputs: tuple[object, ...],
+    dump_graphs: bool = False,
+    opt_level: int = 1,
+) -> EdgeProgramManager:
+    quantized_model = quantize_pt2(model, inputs)
+
+    return export_to_cadence(
+        quantized_model,
+        inputs,
+        opt_level=opt_level,
+        dump_graphs=dump_graphs,
+    )
+
+
+# Export the model and lower it to an EdgeProgramManager (in edge IR), and
+# apply passes specific to Cadence DSP execution. Return both to print the
+# differences.
 def export_to_executorch_gen_etrecord(
     model: torch.nn.Module,
     inputs: tuple[object, ...],

backends/cadence/aot/functions.yaml

Lines changed: 10 additions & 0 deletions

@@ -77,6 +77,16 @@
     - arg_meta: null
       kernel_name: torch::executor::gelu_out
 
+- op: hardtanh.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::hardtanh_out
+
+- op: max_pool2d_with_indices.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::max_pool2d_with_indices_out
+
 - op: mean.out
   kernels:
     - arg_meta: null

backends/cadence/aot/functions_hifi.yaml

Lines changed: 17 additions & 2 deletions

@@ -62,11 +62,26 @@
     - arg_meta: null
       kernel_name: torch::executor::full_out
 
+- op: gelu.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::gelu_out
+
+- op: hardtanh.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::hardtanh_out
+
+- op: max_pool2d_with_indices.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::max_pool2d_with_indices_out
+
 - op: mean.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::mean_dim_out
-
+      kernel_name: cadence::impl::HiFi::mean_dim_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
