pytorch
diff --git a/‎.ci/scripts/test_model.sh
Lines changed: 27 additions & 1 deletion b/‎.ci/scripts/test_model.sh
Lines changed: 27 additions & 1 deletion
diff --git a/‎.ci/scripts/utils.sh
Lines changed: 3 additions & 2 deletions b/‎.ci/scripts/utils.sh
Lines changed: 3 additions & 2 deletions
diff --git a/‎.github/workflows/android-perf.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/android-perf.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/apple-perf.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/apple-perf.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/lint.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/lint.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 28 additions & 3 deletions b/‎.github/workflows/trunk.yml
Lines changed: 28 additions & 3 deletions
diff --git a/‎.gitignore
Lines changed: 3 additions & 0 deletions b/‎.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/__init__.py b/‎backends/__init__.py
diff --git a/‎backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/annotate_channels_last_dim_order_pass.py
Lines changed: 2 additions & 24 deletions b/‎backends/arm/_passes/annotate_channels_last_dim_order_pass.py
Lines changed: 2 additions & 24 deletions
diff --git a/‎backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 9 additions & 14 deletions b/‎backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 9 additions & 14 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 6 additions & 3 deletions b/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 6 additions & 3 deletions
diff --git a/‎backends/arm/_passes/cast_int64_pass.py
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/cast_int64_pass.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_linalg_vector_norm_pass.py
Lines changed: 5 additions & 3 deletions b/‎backends/arm/_passes/decompose_linalg_vector_norm_pass.py
Lines changed: 5 additions & 3 deletions
@@ -188,6 +188,14 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
     pip install piq
+  elif [[ "${MODEL_NAME}" == "albert" ]]; then
+    EXPORT_SCRIPT=albert
+  elif [[ "${MODEL_NAME}" == "bert" ]]; then
+    EXPORT_SCRIPT=bert
+  elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
+    EXPORT_SCRIPT=distilbert
+  elif [[ "${MODEL_NAME}" == "eurobert" ]]; then
+    EXPORT_SCRIPT=eurobert
   else
     echo "Unsupported model $MODEL_NAME"
     exit 1
@@ -197,7 +205,25 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450
 
-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+  SCRIPT_FOLDER=""  # sub-package of examples.qualcomm that contains ${EXPORT_SCRIPT}
+  case "${MODEL_NAME}" in
+    "dl3"|"mv3"|"mv2"|"ic4"|"ic3"|"vit"|"mb"|"w2l"|"edsr")
+        SCRIPT_FOLDER=scripts
+        ;;
+    "albert"|"bert"|"distilbert"|"eurobert")
+        pip install evaluate
+        SCRIPT_FOLDER=oss_scripts
+        # BERT models running in 16-bit fail op validation on some operations unless CHIPSET >= SM8550.
+        # NOTE(review): eurobert is assumed to live in oss_scripts like its siblings - confirm.
+        QNN_CHIPSET=SM8550
+        ;;
+    *)
+        echo "Unsupported model $MODEL_NAME"
+        exit 1
+        ;;
+  esac
+
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
 
 
@@ -156,13 +156,14 @@ build_executorch_runner() {
 }
 
 cmake_install_executorch_lib() {
+  local build_type="${1:-Release}"  # optional $1: CMake build type, defaults to Release
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
   retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_BUILD_TYPE="${build_type}" \
           -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
           -Bcmake-out .
-  cmake --build cmake-out -j9 --target install --config Release
+  cmake --build cmake-out -j9 --target install --config "${build_type}"
 }
 
 download_stories_model_artifacts() {
 
@@ -342,8 +342,8 @@ jobs:
               git clone https://github.com/huggingface/optimum-executorch
               pushd optimum-executorch
               # There is no release yet, for CI stability, always test from the same commit on main
-              git checkout 1c653dc49812fc431a22312c7295d97005d22e12
-              python install_dev.py
+              git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+              python install_dev.py --skip_override_torch
               pip list
 
               ARGS=(
 
@@ -347,8 +347,8 @@ jobs:
             git clone https://github.com/huggingface/optimum-executorch
             pushd optimum-executorch
             # There is no release yet, for CI stability, always test from the same commit on main
-            git checkout 1c653dc49812fc431a22312c7295d97005d22e12
-            ${CONDA_RUN} python install_dev.py
+            git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+            ${CONDA_RUN} python install_dev.py --skip_override_torch
             pip list
 
             ARGS=(
 
@@ -46,7 +46,7 @@ jobs:
         fi
 
         # This has already been cached in the docker image
-        lintrunner init 2> /dev/null
+        lintrunner init
 
         RC=0
         # Run lintrunner on all files
 
@@ -480,6 +480,32 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
 
+  test-qnn-optimum-model:
+    name: test-qnn-optimum-model
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        dtype: [fp32]
+        model: [albert, bert, distilbert] # eurobert requires transformers >= 4.48.0, skip for now
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh ${{ matrix.model }} "cmake" "qnn"
+
   test-apple-model:
     name: test-apple-model
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -571,9 +597,8 @@ jobs:
         git clone https://github.com/huggingface/optimum-executorch
         pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 1c653dc49812fc431a22312c7295d97005d22e12
-        pip install .[tests]
-        pip install transformers==4.52.4
+        git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+        python install_dev.py --skip_override_torch
         popd
         pip list
         echo "::endgroup::"
 
@@ -42,6 +42,9 @@ xcuserdata/
 *.xcworkspace/
 *.xcframework/
 
+# clangd
+.cache/
+
 # misc
 /.vscode/
 *.so
 
@@ -29,6 +29,7 @@
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
+from .decompose_maxpool2d_with_dilation import DecomposeMaxPool2DPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
 from .decompose_select import DecomposeSelectPass  # noqa
 
@@ -5,15 +5,12 @@
 
 # pyre-unsafe
 
-from typing import cast
 
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import (
     create_node,
     get_first_fake_tensor,
-    insert_q_dq_pair,
 )
-from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
 from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
@@ -59,20 +56,10 @@ class AnnotateChannelsLastDimOrder(ExportPass):
 
     def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
         """
-        returns True for dq and w in the following sequences;
+        returns True for w in the following sequence;
         w -> depthwise_conv2d -> ...
-        w -> dq -> depthwise_conv2d -> ...
         """
-        if node.op == "call_function":
-            if node.target != dq_op:
-                return False
-            prev_node = node.args[0]
-            if cast(torch.fx.Node, prev_node).op != "placeholder":
-                return False
-            if is_consumer_node_depthwise_conv2d(node):
-                consumer_node = list(node.users)[0]
-                return consumer_node.args[1] == node
-        elif node.op == "placeholder":
+        if node.op == "placeholder":
             # node is an input, weight or bias node
             consumer_node = list(node.users)[0]
             if self.is_weight_node_for_depthwise_conv2d(consumer_node):
@@ -129,8 +116,6 @@ def is_channel_reshape(input_shape, output_shape):
 
     @staticmethod
     def insert_input_transpose(node, input_node, graph_module):
-        quantize = input_node.target == dq_op
-        q_params = input_node.args[1:] if quantize else None
         with graph_module.graph.inserting_before(node):
             permute_node = create_node(
                 graph_module.graph,
@@ -143,8 +128,6 @@ def insert_input_transpose(node, input_node, graph_module):
                         else AnnotateChannelsLastDimOrder.NHWC_inverse_order
                     ),
                 ),
-                quantize=quantize,
-                q_params=q_params,
             )
             node.replace_input_with(input_node, permute_node)
 
@@ -185,11 +168,6 @@ def insert_output_transpose(node, graph_module):
             for user in users:
                 user.replace_input_with(node, permute_node)
 
-            quantize = node.args[0] == q_op
-            if quantize:
-                q_params = node.args[0].args[1:]
-                insert_q_dq_pair(graph_module.graph, node, q_params)
-
     @staticmethod
     def _insert_view_transpose(
         input_shape, output_shape, node, input_node, graph_module
 
@@ -7,13 +7,14 @@
 
 import itertools
 import operator
-from typing import List
+from typing import cast, List
 
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
 
-from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, QuantArgs
+from executorch.backends.arm.tosa_quant_utils import dq_ops, q_ops
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule
 from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
@@ -61,7 +62,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
         }
         for partition in matmul_partitions:
             quantized_input = all(
-                input_node.target == dq_op for input_node in partition.input_nodes
+                input_node.target in dq_ops for input_node in partition.input_nodes
             )
             matmul_node = [
                 node for node in partition.nodes if node.target in matmul_targets
@@ -74,17 +75,14 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     input_node = self._match_partition_to_node(
                         node, partition.input_nodes
                     )
-                    input_node_qargs = QuantArgs.from_operator(
-                        input_node.target, input_node.args
-                    )
                     # Insert new dq-node just before the mm/bmm with input_node's qparams
                     with graph_module.graph.inserting_before(matmul_node):
                         # Create new dq-node before matmul
                         dq_node = create_node(
                             graph=graph_module.graph,
-                            op_target=dq_op,
+                            op_target=cast(EdgeOpOverload, input_node.target),  # type: ignore[arg-type]
                         )
-                        dq_node.args = (node, *input_node_qargs)
+                        dq_node.args = (node, *input_node.args[1:])
                         matmul_node.replace_input_with(node, dq_node)
 
                 for partition_input in partition.input_nodes:
@@ -95,19 +93,16 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     graph_module.graph.erase_node(partition_input)
 
             partition_output = list(partition.output_nodes[0].users)[0]
-            quantized_output = partition_output.target == q_op
+            quantized_output = partition_output.target in q_ops
             if quantized_output:
-                output_node_qargs = QuantArgs.from_operator(
-                    partition_output.target, partition_output.args
-                )
                 with graph_module.graph.inserting_after(matmul_node):
                     # Create q-node after matmul
                     q_node = create_node(
                         graph=graph_module.graph,
-                        op_target=q_op,
+                        op_target=cast(EdgeOpOverload, partition_output.target),  # type: ignore[arg-type]
                     )
                     matmul_node.replace_all_uses_with(q_node)
-                    q_node.args = (matmul_node, *output_node_qargs)
+                    q_node.args = (matmul_node, *partition_output.args[1:])
                 # Remove partition output q-node
                 partition_output.replace_all_uses_with(
                     partition_output.all_input_nodes[0]
 
@@ -32,6 +32,7 @@
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
     DecomposeLinearVectorNormPass,
+    DecomposeMaxPool2DPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
     DecomposeSelectPass,
@@ -92,7 +93,6 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
-        self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeLinearVectorNormPass())
         self.add_pass(
             DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
@@ -108,12 +108,13 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
-        self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
+        self.add_pass(FoldAndAnnotateQParamsPass(exported_program))  # type: ignore[call-arg]
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
         if self.tosa_spec.is_U55_subset:
             self.add_pass(BroadcastArgsPass())
+        self.add_pass(DecomposeLinearPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
@@ -123,6 +124,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
         self.add_pass(Conv1dUnsqueezePass())
+        self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
 
@@ -166,7 +168,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
 
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
-        self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
+        self.add_pass(FoldAndAnnotateQParamsPass(exported_program))  # type: ignore[call-arg]
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
@@ -179,6 +181,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
         self.add_pass(Conv1dUnsqueezePass())
+        self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
 
 
@@ -35,6 +35,8 @@ def _assert_within_int32(self, tensor: torch.Tensor, node: torch.fx.Node):
 
     def _to_int32(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
+            if len(node.users) == 0:
+                continue
             fake_tensor = node.meta["val"]
             if not isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
                 continue
 
@@ -51,10 +51,12 @@ def call_operator(self, op, args, kwargs, meta):
                 f"is not supported for linalg_vector_norm operator"
             )
 
+        # Sum over all dimensions if dim is None
         if norm_dim is None:
-            raise ValueError("The norm_dim for linalg_vector_norm is None.")
-
-        dims = [norm_dim] if isinstance(norm_dim, int) else list(norm_dim)
+            rank = input_tensor.data.dim()
+            dims = list(range(rank))
+        else:
+            dims = [norm_dim] if isinstance(norm_dim, int) else list(norm_dim)
 
         # Decomposition based on norm order.
         if norm_order == 1: