
Commit 891fb8e (2 parents: 7560880 + ea92179)

Update

[ghstack-poisoned]

98 files changed: +4,176 additions, −1,526 deletions

.ci/scripts/test_ane_static_llama.sh

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+pushd $EXECUTORCH_ROOT/examples/apple/coreml/llama
+
+# Download stories llama110m artifacts
+download_stories_model_artifacts
+
+python export.py -n model.pte -p params.json -c stories110M.pt --seq_length 32 --max_seq_length 64 --dtype fp16 --coreml-quantize c4w
+
+popd
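For anyone reproducing this check outside CI, the invocation below is a minimal sketch; it assumes an ExecuTorch checkout with the repo, Core ML backend, and llama example requirements already installed (the new trunk.yml job later in this commit runs those install steps first):

sh .ci/scripts/test_ane_static_llama.sh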

.ci/scripts/test_model.sh

Lines changed: 8 additions & 0 deletions
@@ -100,6 +100,14 @@ test_model() {
     rm "./${MODEL_NAME}.pte"
     return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
   fi
+  if [[ "${MODEL_NAME}" == "phi4_mini" ]]; then
+    # Install requirements for export_llama
+    bash examples/models/llama/install_requirements.sh
+    # Test export_llama script: python3 -m examples.models.llama.export_llama.
+    "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi-4-mini/config.json
+    run_portable_executor_runner
+    rm "./${MODEL_NAME}.pte"
+  fi
 
   # Export a basic .pte and run the model.
   "${PYTHON_EXECUTABLE}" -m examples.portable.scripts.export --model_name="${MODEL_NAME}" "${STRICT}"
.github/workflows/trunk.yml

Lines changed: 22 additions & 0 deletions
@@ -229,6 +229,28 @@ jobs:
         # see if we can import the module successfully
         ${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
 
+  test-static-llama-ane:
+    name: test-static-llama-ane
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install requirements
+        sh install_requirements.sh
+        sh backends/apple/coreml/scripts/install_requirements.sh
+        python install_executorch.py --pybind coreml
+        sh examples/models/llama/install_requirements.sh
+
+        # Test ANE llama
+        sh .ci/scripts/test_ane_static_llama.sh
+
   test-llama-runner-macos:
     name: test-llama-runner-mac
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -749,9 +749,9 @@ endif()
 
 if(EXECUTORCH_BUILD_PTHREADPOOL
    AND EXECUTORCH_BUILD_CPUINFO
-   AND CMAKE_CXX_STANDARD GREATER_EQUAL 14
 )
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/parallel)
 endif()
 
 if(EXECUTORCH_BUILD_PYBIND)

backends/arm/scripts/build_executorch_runner.sh

Lines changed: 20 additions & 5 deletions
@@ -14,8 +14,9 @@ toolchain_cmake=${et_root_dir}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmak
 pte_file=""
 target="ethos-u55-128"
 build_type="Release"
-system_config=""
 bundleio=false
+system_config=""
+memory_mode=""
 build_with_etdump=false
 extra_build_flags=""
 output_folder_set=false
@@ -32,9 +33,12 @@ help() {
     echo " --pte=<PTE_FILE> pte file (genrated by the aot_arm_compier from the model to include in the elf"
     echo " --target=<TARGET> Target to build and run for Default: ${target}"
     echo " --build_type=<TYPE> Build with Release, Debug or RelWithDebInfo, default is ${build_type}"
-    echo " --system_config=<CONFIG> System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets."
-    echo " NOTE: If given, this option must match the given target. This option also sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
     echo " --bundleio Support both pte and Bundle IO bpte using Devtools BundelIO with Input/RefOutput included"
+    echo " --system_config=<CONFIG> System configuration to select from the Vela configuration file (see vela.ini). Default: Ethos_U55_High_End_Embedded for EthosU55 targets, Ethos_U85_SYS_DRAM_Mid for EthosU85 targets."
+    echo " NOTE: If given, this option must match the given target. This option along with the memory_mode sets timing adapter values customized for specific hardware, see ./executor_runner/CMakeLists.txt."
+    echo " --memory_mode=<CONFIG> Vela memory mode, used for setting the Timing Adapter parameters of the Corstone platforms."
+    echo " Valid values are Shared_Sram(for Ethos-U55, Ethos-U65, Ethos-85), Sram_Only(for Ethos-U55, Ethos-U65, Ethos-U85) or Dedicated_Sram(for Ethos-U65, Ethos-U85)."
+    echo " Default: Shared_Sram for the Ethos-U55 and Sram_Only for the Ethos-U85"
     echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log"
     echo " --extra_build_flags=<FLAGS> Extra flags to pass to cmake like -DET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE=60000 Default: none "
     echo " --output=<FOLDER> Output folder Default: <MODEL>/<MODEL>_<TARGET INFO>.pte"
@@ -49,8 +53,9 @@ for arg in "$@"; do
       --pte=*) pte_file="${arg#*=}";;
       --target=*) target="${arg#*=}";;
      --build_type=*) build_type="${arg#*=}";;
-      --system_config=*) system_config="${arg#*=}";;
       --bundleio) bundleio=true ;;
+      --system_config=*) system_config="${arg#*=}";;
+      --memory_mode=*) memory_mode="${arg#*=}";;
       --etdump) build_with_etdump=true ;;
       --extra_build_flags=*) extra_build_flags="${arg#*=}";;
       --output=*) output_folder="${arg#*=}" ; output_folder_set=true ;;
@@ -83,6 +88,15 @@ then
     fi
 fi
 
+if [[ ${memory_mode} == "" ]]
+then
+    memory_mode="Shared_Sram"
+    if [[ ${target} =~ "ethos-u85" ]]
+    then
+        memory_mode="Sram_Only"
+    fi
+fi
+
 output_folder=$(realpath ${output_folder})
 
 if [[ ${target} == *"ethos-u55"* ]]; then
@@ -91,7 +105,7 @@ else
     target_cpu=cortex-m85
 fi
 echo "--------------------------------------------------------------------------------"
-echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} ${extra_build_flags} to '${output_folder}/cmake-out'"
+echo "Build Arm Baremetal executor_runner for ${target} with ${pte_file} using ${system_config} ${memory_mode} ${extra_build_flags} to '${output_folder}/cmake-out'"
 echo "--------------------------------------------------------------------------------"
 
 cd ${et_root_dir}/examples/arm/executor_runner
@@ -120,6 +134,7 @@ cmake \
     ${build_with_etdump_flags} \
     -DPYTHON_EXECUTABLE=$(which python3) \
     -DSYSTEM_CONFIG=${system_config} \
+    -DMEMORY_MODE=${memory_mode} \
     ${extra_build_flags} \
     -B ${output_folder}/cmake-out
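As a usage note, a hypothetical invocation with the new flag might look as follows; the .pte path and target string are placeholders, and the system config / memory mode values are the Ethos-U85 defaults named in the help text above:

backends/arm/scripts/build_executorch_runner.sh \
  --pte=<path/to/model.pte> \
  --target=ethos-u85-128 \
  --system_config=Ethos_U85_SYS_DRAM_Mid \
  --memory_mode=Sram_Only

If --memory_mode is omitted, the script now falls back to Shared_Sram, or to Sram_Only when the target matches ethos-u85.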

backends/cadence/aot/functions_hifi.yaml

Lines changed: 13 additions & 3 deletions
@@ -20,7 +20,7 @@
 - op: _softmax.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::softmax_out
+      kernel_name: cadence::impl::HiFi::_softmax_out
 
 - op: atan2.out
   kernels:
@@ -100,7 +100,7 @@
 - op: mean.out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::mean_dim_out
+      kernel_name: cadence::impl::HiFi::mean_out
 
 - op: minimum.out
   kernels:
@@ -175,7 +175,7 @@
 - op: where.self_out
   kernels:
     - arg_meta: null
-      kernel_name: cadence::impl::HiFi::where_out
+      kernel_name: cadence::impl::HiFi::where_self_out
 
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
@@ -189,6 +189,11 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out
+
+- func: cadence::quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_conv_out
 
 - func: cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
@@ -209,6 +214,11 @@
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out
 
+- func: cadence::quantized_relu_per_tensor.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out
+
 - func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null

backends/cadence/aot/pass_utils.py

Lines changed: 10 additions & 0 deletions
@@ -104,6 +104,16 @@ def count_node(graph_module: torch.fx.GraphModule, target: torch.fx.node.Target)
     return total
 
 
+def op_counts_match(
+    graph_module: torch.fx.GraphModule,
+    expected_op_counts: dict[EdgeOpOverload, int],
+) -> bool:
+    for op, count in expected_op_counts.items():
+        if count_node(graph_module, op) != count:
+            return False
+    return True
+
+
 # Testing utils
 # Return the compute/function nodes in the graph
 def get_compute_nodes_in_gm(graph_module: torch.fx.GraphModule) -> List[torch.fx.Node]:

backends/cadence/aot/remove_ops.py

Lines changed: 64 additions & 1 deletion
@@ -33,7 +33,7 @@
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
 from executorch.exir.dialects._ops import ops as exir_ops
-from executorch.exir.dialects.edge._ops import EdgeOpOverload
+from executorch.exir.dialects.edge._ops import EdgeOpOverload, EdgeOpOverloadPacket
 from executorch.exir.pass_base import ExportPass, NodeMetadata, PassResult, ProxyValue
 from executorch.exir.pass_manager import PassManager, PassType
 from executorch.exir.passes import dead_code_elimination_pass
@@ -745,6 +745,68 @@ def permute_shape(
     return [shape[p] for p in permute_dims]
 
 
+@register_cadence_pass(CadencePassAttribute(opt_level=1))
+class RemoveBranchedQuantDequant(ExportPass):
+    """
+    This pass looks for adjacent quant and dequant nodes with identical
+    parameters, where the quant node has other users in addition to the
+    dequant. The quant and dequant pair would be removed by the
+    FuseQuantDequantToRequantizePass if not for the multiple users. This pass
+    removes just the dequant node by connecting it to the quant's parent node
+    """
+
+    quantize_op_packets: set[EdgeOpOverloadPacket] = {
+        exir_ops.edge.cadence.quantize_per_tensor,
+        exir_ops.edge.quantized_decomposed.quantize_per_tensor,
+    }
+    dequantize_op_packets: set[EdgeOpOverloadPacket] = {
+        exir_ops.edge.cadence.dequantize_per_tensor,
+        exir_ops.edge.quantized_decomposed.dequantize_per_tensor,
+    }
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        self.remove_branched(
+            graph_module, self.quantize_op_packets, self.dequantize_op_packets
+        )
+        self.remove_branched(
+            graph_module, self.dequantize_op_packets, self.quantize_op_packets
+        )
+
+        graph_module.graph.eliminate_dead_code()
+        result = super().call(graph_module)
+        return result
+
+    def remove_branched(
+        self,
+        graph_module: torch.fx.GraphModule,
+        producer_pkts: set[EdgeOpOverloadPacket],
+        consumer_pkts: set[EdgeOpOverloadPacket],
+    ) -> None:
+        for node in graph_module.graph.nodes:
+            if (
+                node.op != "call_function"
+                or not isinstance(node.target, EdgeOpOverload)
+                or get_edge_overload_packet(node.target) not in producer_pkts
+            ):
+                continue
+
+            if len(node.users) < 2:
+                continue
+
+            for user in node.users:
+                if (
+                    not isinstance(user.target, EdgeOpOverload)
+                    or get_edge_overload_packet(user.target) not in consumer_pkts
+                ):
+                    continue
+
+                # check qparams match
+                if node.args[1:] != user.args[1:]:
+                    continue
+
+                user.replace_all_uses_with(node.args[0])
+
+
 # The following class consolidates functions to remove ops that are redundant
 # in Jarvis. Currently, each function in this class iterates over each node of
 # the graph module once. In future, we could consolidate them into a monolithic
@@ -765,4 +827,5 @@ class CadenceRemoveNops:
         RemoveNopMulOpPass,
         RemoveNopAddOpPass,
         RemoveNopLinalgVectorNormOpPass,
+        RemoveBranchedQuantDequant,
     ]

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 2 additions & 3 deletions
@@ -20,7 +20,7 @@
     FuseTransposeOpPairsPass,
 )
 from executorch.backends.cadence.aot.graph_builder import GraphBuilder
-from executorch.backends.cadence.aot.pass_utils import count_node
+from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.dialects.edge._ops import EdgeOpOverload
 from torch import nn
@@ -32,8 +32,7 @@ def check_op_counts(
         graph_module: torch.fx.GraphModule,
         expected_op_counts: dict[EdgeOpOverload, int],
     ) -> None:
-        for op, count in expected_op_counts.items():
-            self.assertEqual(count_node(graph_module, op), count)
+        self.assertTrue(op_counts_match(graph_module, expected_op_counts))
 
 
 class TestFusionPasses(TestFusionPassesBase):

backends/cadence/aot/tests/test_remove_ops_passes.py

Lines changed: 33 additions & 1 deletion
@@ -17,10 +17,11 @@
 from executorch.backends.cadence.aot import compiler
 from executorch.backends.cadence.aot.compiler import export_to_edge
 
-from executorch.backends.cadence.aot.pass_utils import count_node
+from executorch.backends.cadence.aot.pass_utils import count_node, op_counts_match
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
 from executorch.backends.cadence.aot.remove_ops import (
     RemoveAliasCopyOpPass,
+    RemoveBranchedQuantDequant,
     RemoveCloneOpPass,
     RemoveContiguousOpPass,
     RemoveDetachCopyPass,
@@ -709,3 +710,34 @@ def forward(self, x):
         self.assertEqual(
             count_node(graph_module, exir_ops.edge.aten.permute_copy.default), 2
         )
+
+    def test_remove_dequant_on_branch(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                x = torch.abs(x)
+                x0 = torch.ops.quantized_decomposed.quantize_per_tensor(
+                    x, 1.2, 3, 0, 127, torch.int8
+                )
+                x1 = torch.abs(x0)
+                y0 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+                    x0, 1.2, 3, 0, 127, torch.int8
+                )
+                y1 = y0.view(-1)
+                return x1, y1
+
+        inputs = torch.rand(1, 8, 4, 6)
+        model = M()
+        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
+
+        graph_module = RemoveBranchedQuantDequant()(graph_module).graph_module
+        self.assertTrue(
+            op_counts_match(
+                graph_module,
+                expected_op_counts={
+                    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default: 1,
+                    # we expect the pass to remove the dequantize node
+                    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default: 0,
+                    exir_ops.edge.aten.abs.default: 2,
+                },
+            )
+        )

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -78,7 +78,8 @@ target_include_directories(
 # Custom ops that are needed to run the test model.
 add_library(
   custom_ops "op_quantized_linear_out.cpp" "op_quantized_layer_norm.cpp"
-  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp" "op_quantized_fully_connected_out"
+  "op_quantize_per_tensor.cpp" "op_quantized_relu_out.cpp" "op_dequantize_per_tensor.cpp"
+  "op_quantized_conv_out.cpp" "op_quantized_fully_connected_out"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
