
Commit 891fb8e (2 parents: 7560880 + ea92179)

Update

[ghstack-poisoned]

98 files changed: +4,176 additions, −1,526 deletions

.ci/scripts/test_ane_static_llama.sh

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama
+
+# Download stories llama110m artifacts
+download_stories_model_artifacts
+
+python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w
+
+popd
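For anyone reproducing this check outside CI, the invocation below is a minimal sketch; it assumes an ExecuTorch checkout with the repo, Core ML backend, and llama example requirements already installed (the new trunk.yml job later in this commit runs those install steps first):

sh .ci/scripts/test_ane_static_llama.sh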

.ci/scripts/test_model.sh

Lines changed: 8 additions & 0 deletions
@@ -100,6 +100,14 @@ test_model() {
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
   fi
+  if [[ "${MODEL_NAME}" == "phi4_mini" ]]; then
+    # Install requirements for export_llama
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi-4-mini/config.json
+    run_portable_executor_runner
+    rm "./${MODEL_NAME}.pte"
+  fi
 
   # Export a basic .pte and run the model.
   "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}"
.github/workflows/trunk.yml

Lines changed: 22 additions & 0 deletions
@@ -229,6 +229,28 @@ jobs:
         # see if we can import the module successfully
         ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
 
+  test-static-llama-ane:
+    name: test-static-llama-ane
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install requirements
+        sh install_requirements.sh
+        sh backends/apple/coreml/scripts/install_requirements.sh
+        python install_executorch.py --pybind coreml
+        sh examples/models/llama/install_requirements.sh
+
+        # Test ANE llama
+        sh .ci/scripts/test_ane_static_llama.sh
+
   test-llama-runner-macos:
     name: test-llama-runner-mac
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -749,9 +749,9 @@ endif()
 
 if(EXECUTORCH_BUILD_PTHREADPOOL
    AND EXECUTORCH_BUILD_CPUINFO
-   AND CMAKE_CXX_STANDARD GREATER_EQUAL 14
 )
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel)
 endif()
 
 if(EXECUTORCH_BUILD_PYBIND)

backends/arm/scripts/build_executorch_runner.sh

Lines changed: 20 additions & 5 deletions
@@ -14,8 +14,9 @@ toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmak
 pte_file=""
 target="ethos-u55-128"
 build_type="Release"
-system_config=""
 bundleio=false
+system_config=""
+memory_mode=""
 build_with_etdump=false
 extra_build_flags=""
 output_folder_set=false
@@ -32,9 +33,12 @@ help() {
     echo " --pte=<PTE_FILE> pte file (genrated by the aot_arm_compier from the model to include in the elf"
     echo " --target=<TARGET> Target to build and run for Default: ${target}"
     echo " --build_type=<TYPE> Build with Release, Debug or RelWithDebInfo, default is ${build_type}"
-    echo " --system_config=<CONFIG> System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets."
-    echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
     echo " --bundleio Support both pte and Bundle IO bpte using Devtools BundelIO with Input/RefOutput included"
+    echo " --system_config=<CONFIG> System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets."
+    echo " NOTE: If given, this option must match the given target. This option along with the memory_mode sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
+    echo " --memory_mode=<CONFIG> Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms."
+    echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)."
+    echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85"
     echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
     echo " --extra_build_flags=<FLAGS> Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo " --output=<FOLDER> Output folder Default: <MODEL>/<MODEL>_<TARGET INFO>.pte"
@@ -49,8 +53,9 @@ for arg in "$@"; do
       --pte=*) pte_file="${arg#*=}";;
       --target=*) target="${arg#*=}";;
      --build_type=*) build_type="${arg#*=}";;
-      --system_config=*) system_config="${arg#*=}";;
       --bundleio) bundleio=true ;;
+      --system_config=*) system_config="${arg#*=}";;
+      --memory_mode=*) memory_mode="${arg#*=}";;
       --etdump) build_with_etdump=true ;;
       --extra_build_flags=*) extra_build_flags="${arg#*=}";;
       --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;;
@@ -83,6 +88,15 @@ then
     fi
 fi
 
+if [[ ${memory_mode} == "" ]]
+then
+    memory_mode="Shared_Sram"
+    if [[ ${target} =~ "ethos-u85" ]]
+    then
+        memory_mode="Sram_Only"
+    fi
+fi
+
 output_folder=$(realpath ${output_folder})
 
 if [[ ${target} == *"ethos-u55"* ]]; then
@@ -91,7 +105,7 @@ else
     target_cpu=cortex-m85
 fi
 echo "--------------------------------------------------------------------------------"
-echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} ${extra_build_flags} to '${output_folder}/cmake-out'"
+echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} ${memory_mode} ${extra_build_flags} to '${output_folder}/cmake-out'"
 echo "--------------------------------------------------------------------------------"
 
 cd ${et_root_dir}/examples/arm/executor_runner
@@ -120,6 +134,7 @@ cmake \
     ${build_with_etdump_flags} \
     -DPYTHON_EXECUTABLE=$(which python3) \
     -DSYSTEM_CONFIG=${system_config} \
+    -DMEMORY_MODE=${memory_mode} \
     ${extra_build_flags} \
     -B ${output_folder}/cmake-out
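As a usage note, a hypothetical invocation with the new flag might look as follows; the .pte path and target string are placeholders, and the system config / memory mode values are the Ethos-U85 defaults named in the help text above:

backends/arm/scripts/build_executorch_runner.sh \
  --pte=<path/to/model.pte> \
  --target=ethos-u85-128 \
  --system_config=Ethos_U85_SYS_DRAM_Mid \
  --memory_mode=Sram_Only

If --memory_mode is omitted, the script now falls back to Shared_Sram, or to Sram_Only when the target matches ethos-u85.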

backends/cadence/aot/functions_hifi.yaml

Lines changed: 13 additions & 3 deletions
@@ -20,7 +20,7 @@
 - op: _softmax.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::softmax_out
+      kernel_name: cadence::impl::HiFi::_softmax_out
 
 - op: atan2.out
   kernels:
@@ -100,7 +100,7 @@
 - op: mean.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::mean_dim_out
+      kernel_name: cadence::impl::HiFi::mean_out
 
 - op: minimum.out
   kernels:
@@ -175,7 +175,7 @@
 - op: where.self_out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::where_out
+      kernel_name: cadence::impl::HiFi::where_self_out
 
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
@@ -189,6 +189,11 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out
+
+- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_conv_out
 
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
@@ -209,6 +214,11 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
 
+- func: cadence::quantized_relu_per_tensor.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out
+
 - func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/aot/pass_utils.py

Lines changed: 10 additions & 0 deletions
@@ -104,6 +104,16 @@ def count_node(graph_module: torch.fx.GraphModule, target: torch.fx.node.Target)
     return total
 
 
+def op_counts_match(
+    graph_module: torch.fx.GraphModule,
+    expected_op_counts: dict[EdgeOpOverload, int],
+) -> bool:
+    for op, count in expected_op_counts.items():
+        if count_node(graph_module, op) != count:
+            return False
+    return True
+
+
 # Testing utils
 # Return the compute/function nodes in the graph
 def get_compute_nodes_in_gm(graph_module: torch.fx.GraphModule) -> List[torch.fx.Node]:

backends/cadence/aot/remove_ops.py

Lines changed: 64 additions & 1 deletion
@@ -33,7 +33,7 @@
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
 from executorch.exir.pass_manager import PassManager, PassType
 from executorch.exir.passes import dead_code_elimination_pass
@@ -745,6 +745,68 @@ def permute_shape(
     return [shape[p] for p in permute_dims]
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=1))
+class RemoveBranchedQuantDequant(ExportPass):
+    """
+    This pass looks for adjacent quant and dequant nodes with identical
+    parameters, where the quant node has other users in addition to the
+    dequant. The quant and dequant pair would be removed by the
+    FuseQuantDequantToRequantizePass if not for the multiple users. This pass
+    removes just the dequant node by connecting it to the quant's parent node
+    """
+
+    quantize_op_packets: set[EdgeOpOverloadPacket] = {
+        exir_ops.edge.cadence.quantize_per_tensor,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor,
+    }
+    dequantize_op_packets: set[EdgeOpOverloadPacket] = {
+        exir_ops.edge.cadence.dequantize_per_tensor,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor,
+    }
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        self.remove_branched(
+            graph_module, self.quantize_op_packets, self.dequantize_op_packets
+        )
+        self.remove_branched(
+            graph_module, self.dequantize_op_packets, self.quantize_op_packets
+        )
+
+        graph_module.graph.eliminate_dead_code()
+        result = super().call(graph_module)
+        return result
+
+    def remove_branched(
+        self,
+        graph_module: torch.fx.GraphModule,
+        producer_pkts: set[EdgeOpOverloadPacket],
+        consumer_pkts: set[EdgeOpOverloadPacket],
+    ) -> None:
+        for node in graph_module.graph.nodes:
+            if (
+                node.op != "call_function"
+                or not isinstance(node.target, EdgeOpOverload)
+                or get_edge_overload_packet(node.target) not in producer_pkts
+            ):
+                continue
+
+            if len(node.users) < 2:
+                continue
+
+            for user in node.users:
+                if (
+                    not isinstance(user.target, EdgeOpOverload)
+                    or get_edge_overload_packet(user.target) not in consumer_pkts
+                ):
+                    continue
+
+                # check qparams match
+                if node.args[1:] != user.args[1:]:
+                    continue
+
+                user.replace_all_uses_with(node.args[0])
+
+
 # The following class consolidates functions to remove ops that are redundant
 # in Jarvis. Currently, each function in this class iterates over each node of
 # the graph module once. In future, we could consolidate them into a monolithic
@@ -765,4 +827,5 @@ class CadenceRemoveNops:
         RemoveNopMulOpPass,
         RemoveNopAddOpPass,
         RemoveNopLinalgVectorNormOpPass,
+        RemoveBranchedQuantDequant,
     ]

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 2 additions & 3 deletions
@@ -20,7 +20,7 @@
     FuseTransposeOpPairsPass,
 )
 from executorch.backends.cadence.aot.graph_builder import GraphBuilder
-from executorch.backends.cadence.aot.pass_utils import count_node
+from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from torch import nn
@@ -32,8 +32,7 @@ def check_op_counts(
         graph_module: torch.fx.GraphModule,
         expected_op_counts: dict[EdgeOpOverload, int],
     ) -> None:
-        for op, count in expected_op_counts.items():
-            self.assertEqual(count_node(graph_module, op), count)
+        self.assertTrue(op_counts_match(graph_module, expected_op_counts))
 
 
 class TestFusionPasses(TestFusionPassesBase):

backends/cadence/aot/tests/test_remove_ops_passes.py

Lines changed: 33 additions & 1 deletion
@@ -17,10 +17,11 @@
 from executorch.backends.cadence.aot import compiler
 from executorch.backends.cadence.aot.compiler import export_to_edge
 
-from executorch.backends.cadence.aot.pass_utils import count_node
+from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
 from executorch.backends.cadence.aot.remove_ops import (
     RemoveAliasCopyOpPass,
+    RemoveBranchedQuantDequant,
     RemoveCloneOpPass,
     RemoveContiguousOpPass,
     RemoveDetachCopyPass,
@@ -709,3 +710,34 @@ def forward(self, x):
         self.assertEqual(
             count_node(graph_module, exir_ops.edge.aten.permute_copy.default), 2
         )
+
+    def test_remove_dequant_on_branch(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                x = torch.abs(x)
+                x0 = torch.ops.quantized_decomposed.quantize_per_tensor(
+                    x, 1.2, 3, 0, 127, torch.int8
+                )
+                x1 = torch.abs(x0)
+                y0 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+                    x0, 1.2, 3, 0, 127, torch.int8
+                )
+                y1 = y0.view(-1)
+                return x1, y1
+
+        inputs = torch.rand(1, 8, 4, 6)
+        model = M()
+        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
+
+        graph_module = RemoveBranchedQuantDequant()(graph_module).graph_module
+        self.assertTrue(
+            op_counts_match(
+                graph_module,
+                expected_op_counts={
+                    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
+                    # we expect the pass to remove the dequantize node
+                    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
+                    exir_ops.edge.aten.abs.default: 2,
+                },
+            )
+        )

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -78,7 +78,8 @@ target_include_directories(
 # Custom ops that are needed to run the test model.
 add_library(
   custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp"
-  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" "op_quantized_fully_connected_out"
+  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp"
+  "op_quantized_conv_out.cpp" "op_quantized_fully_connected_out"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
