Skip to content

Commit 8178226

Browse files
committed
Update on "[ET-VK][14/n] Add operators to Partitioner"
1. Register aten operators in the vulkan partitioner. 2. Fix some minor operator name issues caused by mismatches between the torch API and the actual aten names. Note: Permute is not yet registered due to tensor movement issues with the "Partial" model, where the `Linear` operator is decomposed into `permute` and `addmm`. This will be fixed in later diffs. Differential Revision: [D56695929](https://our.internmc.facebook.com/intern/diff/D56695929/) [ghstack-poisoned]
2 parents 567cd8c + 16b47fc commit 8178226

File tree

17 files changed

+642
-505
lines changed

17 files changed

+642
-505
lines changed

.github/workflows/doc-build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,11 @@ jobs:
9494
# Get github.ref for the output doc folder. By default "main"
9595
# If matches a tag like refs/tags/v1.12.0-rc3 or
9696
# refs/tags/v1.12.0 convert to 1.12
97-
GITHUB_REF=${{ github.ref }}
97+
export GITHUB_REF=${{ github.ref }}
9898
9999
# Convert refs/tags/v1.12.0rc3 into 1.12.
100100
# Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13
101-
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\\.[0-9]+)\\. ]]; then
101+
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+) ]]; then
102102
TARGET_FOLDER="${BASH_REMATCH[1]}"
103103
else
104104
TARGET_FOLDER="main"

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ endif()
120120
# disables exceptions and runtime type.
121121
set(CMAKE_CXX_FLAGS_RELEASE
122122
"-ffunction-sections -fdata-sections -fno-exceptions -fno-rtti")
123-
if(NOT APPLE)
123+
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
124124
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
125125
endif()
126126

backends/xnnpack/runtime/XNNExecutor.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ __ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
8787
if (i < input_ids_.size()) {
8888
size_t num_dims = tensor->dim();
8989
size_t dims[XNN_MAX_TENSOR_DIMS];
90+
ET_CHECK_OR_RETURN_ERROR(
91+
num_dims <= XNN_MAX_TENSOR_DIMS,
92+
InvalidArgument,
93+
"XNNPACK backend accepts tensors with at most %d dims, but got %zu",
94+
XNN_MAX_TENSOR_DIMS,
95+
num_dims);
9096
for (int d = 0; d < num_dims; ++d) {
9197
dims[d] = tensor->size(d);
9298
}

backends/xnnpack/targets.bzl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,11 @@ def define_common_targets():
3838
preprocessor_flags = [
3939
# "-DENABLE_XNNPACK_PROFILING",
4040
],
41+
exported_deps = [
42+
"//executorch/runtime/backend:interface",
43+
],
4144
deps = [
4245
third_party_dep("XNNPACK"),
43-
"//executorch/runtime/backend:interface",
4446
"//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header",
4547
"//executorch/backends/xnnpack/threadpool:threadpool",
4648
"//executorch/runtime/core/exec_aten/util:tensor_util",
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
10+
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
11+
#include <gtest/gtest.h>
12+
#include <xnnpack/subgraph.h>
13+
14+
using torch::executor::Error;
15+
using torch::executor::EValue;
16+
using torch::executor::testing::TensorFactory;
17+
using torch::executor::xnnpack::delegate::XNNExecutor;
18+
19+
TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
20+
XNNExecutor executor;
21+
xnn_subgraph_t subgraph = nullptr;
22+
xnn_runtime_t rt = nullptr;
23+
et_pal_init();
24+
ASSERT_EQ(xnn_initialize(nullptr), xnn_status_success);
25+
ASSERT_EQ(xnn_create_subgraph(2, 0, &subgraph), xnn_status_success);
26+
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
27+
subgraph, xnn_delete_subgraph);
28+
29+
auto input_id = XNN_INVALID_NODE_ID;
30+
std::vector<size_t> dims = {
31+
1,
32+
};
33+
ASSERT_EQ(
34+
xnn_status_success,
35+
xnn_define_quantized_tensor_value(
36+
subgraph,
37+
xnn_datatype_qint8,
38+
0,
39+
1,
40+
dims.size(),
41+
dims.data(),
42+
nullptr,
43+
/*external_id=*/0,
44+
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
45+
&input_id));
46+
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
47+
48+
auto output_id = XNN_INVALID_NODE_ID;
49+
ASSERT_EQ(
50+
xnn_status_success,
51+
xnn_define_quantized_tensor_value(
52+
subgraph,
53+
xnn_datatype_qint8,
54+
0,
55+
1,
56+
dims.size(),
57+
dims.data(),
58+
nullptr,
59+
/*external_id=*/0,
60+
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
61+
&output_id));
62+
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
63+
64+
ASSERT_EQ(
65+
xnn_status_success,
66+
xnn_define_clamp(subgraph, 1, 2, input_id, output_id, 0));
67+
68+
ASSERT_EQ(xnn_create_runtime(subgraph, &rt), xnn_status_success);
69+
EXPECT_EQ(
70+
executor.initialize(
71+
rt,
72+
{
73+
0,
74+
},
75+
{
76+
1,
77+
}),
78+
Error::Ok);
79+
TensorFactory<exec_aten::ScalarType::Int> tf;
80+
auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});
81+
ASSERT_EQ(input_tensor.dim(), 9);
82+
auto output_tensor = tf.make(
83+
{
84+
1,
85+
},
86+
{
87+
1,
88+
});
89+
EValue input_ev(input_tensor);
90+
EValue output_ev(output_tensor);
91+
std::array<EValue*, 2> args = {&input_ev, &output_ev};
92+
// Check for invalid number of dimensions should fail without stack overflow.
93+
EXPECT_EQ(executor.prepare_args(args.data()), Error::InvalidArgument);
94+
}

backends/xnnpack/test/targets.bzl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
12
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
23

34
def define_common_targets():
@@ -17,3 +18,14 @@ def define_common_targets():
1718
"//executorch/backends/xnnpack:dynamic_quant_utils",
1819
],
1920
)
21+
22+
runtime.cxx_test(
23+
name = "xnnexecutor_test",
24+
srcs = ["runtime/test_xnnexecutor.cpp"],
25+
deps = [
26+
third_party_dep("XNNPACK"),
27+
"//executorch/runtime/core/exec_aten/testing_util:tensor_util",
28+
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
29+
"//executorch/backends/xnnpack:xnnpack_backend",
30+
],
31+
)

docs/source/debug-backend-delegate.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@ Number of non-delegated nodes: 430
3939
From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. Users can use information like this to debug.
4040

4141
## Visualize delegated graph
42-
To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph:
42+
To see a more detailed view, use the `format_delegated_graph()` method to get a string printout of the whole graph, or use `print_delegated_graph()` to print it directly:
4343

4444
```python
45-
from executorch.exir.backend.utils import print_delegated_graph
45+
from executorch.exir.backend.utils import format_delegated_graph
4646
graph_module = edge_manager.exported_program().graph_module
47-
print(print_delegated_graph(graph_module))
47+
print(format_delegated_graph(graph_module)) # or call print_delegated_graph(graph_module)
4848
```
4949
It will print the whole model as well as the subgraph consumed by the backend. The generic debug functions provided by fx, like `print_tabular()` or `print_readable()`, will only show `call_delegate` but hide the subgraph consumed by the backend, while this function exposes the contents inside the subgraph.
5050

docs/source/llm/getting-started.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -721,12 +721,12 @@ Number of non-delegated nodes: 430
721721
| 26 | Total | 473 | 430 |
722722

723723
From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs.
724-
To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph.
724+
# To see a more detailed view, use the `format_delegated_graph()` method to get a formatted string printout of the whole graph, or use `print_delegated_graph()` to print it directly:
725725

726726
```python
727-
from executorch.exir.backend.utils import print_delegated_graph
727+
from executorch.exir.backend.utils import format_delegated_graph
728728
graph_module = edge_manager.exported_program().graph_module
729-
print(print_delegated_graph(graph_module))
729+
print(format_delegated_graph(graph_module))
730730
```
731731
This may generate a large amount of output for large models. Consider using "Control+F" or "Command+F" to locate the operator you’re interested in
732732
(e.g. “aten_view_copy_default”). Observe which instances are not under lowered graphs.

docs/source/tutorials_source/sdk-integration-tutorial.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,24 @@ def forward(self, x):
172172
# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``::
173173
#
174174
# cd executorch
175-
# rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 ..
176-
# cd ..
177-
# cmake --build cmake-out -j8 -t sdk_example_runner
178-
# ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path <bundled_program>
175+
# rm -rf cmake-out
176+
# cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
177+
# -DCMAKE_BUILD_TYPE=Release \
178+
# -DEXECUTORCH_BUILD_SDK=ON \
179+
# -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
180+
# -Bcmake-out .
181+
# cmake --build cmake-out -j9 --target install --config Release
182+
#
183+
# local example_dir=examples/sdk
184+
# local build_dir=cmake-out/${example_dir}
185+
# CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
186+
# rm -rf ${build_dir}
187+
# cmake -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
188+
# -DCMAKE_BUILD_TYPE=Release \
189+
# -B${build_dir} \
190+
# ${example_dir}
191+
# cmake --build ${build_dir} -j9 --config Release
192+
# ${build_dir}/sdk_example_runner --bundled_program_path="bundled_program.bp"
179193

180194
######################################################################
181195
# Creating an Inspector

examples/models/llama2/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Note that groupsize less than 128 was not enabled, since such model were still t
3737

3838
We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12.
3939

40-
For Llama 3 8B, we have verified so far on iPhone 15 Pro Max and OnePlus 12 (with 16GB RAM).
40+
For Llama 3 8B, we have verified so far on iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM).
4141

4242
## Performance
4343

examples/models/llama2/builder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from executorch.exir import EdgeProgramManager
2222
from executorch.exir.backend.partitioner import Partitioner
2323

24-
from executorch.exir.backend.utils import print_delegated_graph
24+
from executorch.exir.backend.utils import format_delegated_graph
2525
from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig
2626

2727
from executorch.exir.passes import MemoryPlanningPass
@@ -283,7 +283,7 @@ def export_to_edge(
283283
dynamic_shapes=dynamic_shape,
284284
edge_constant_methods=metadata,
285285
edge_compile_config=edge_config,
286-
verbose=True,
286+
verbose=self.verbose,
287287
)
288288
return self
289289

@@ -308,7 +308,7 @@ def to_backend(
308308
self.edge_manager = self.edge_manager.to_backend(partitioner)
309309
if self.verbose:
310310
logging.info(
311-
print_delegated_graph(
311+
format_delegated_graph(
312312
self.edge_manager.exported_program().graph_module
313313
)
314314
)

exir/backend/test/test_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo
1717
from executorch.exir.backend.utils import (
1818
DelegationBreakdown,
19+
format_delegated_graph,
1920
get_delegates,
2021
get_delegation_info,
2122
get_non_lowered_nodes,
2223
is_identical_graph,
23-
print_delegated_graph,
2424
)
2525

2626
from executorch.exir.dialects._ops import bind_pattern_to_op, ops as exir_ops
@@ -266,7 +266,7 @@ def forward(self, a, x, b):
266266

267267
edge = to_edge(export(m, inputs)).to_backend(AddMulPartitionerDemo())
268268

269-
graph_str = print_delegated_graph(edge.exported_program().graph_module)
269+
graph_str = format_delegated_graph(edge.exported_program().graph_module)
270270
self.assertIn(
271271
"BackendWithCompilerDemo",
272272
graph_str,

exir/backend/utils.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -448,9 +448,16 @@ def _insert_op_occurrences_dict(node_name: str, delegated: bool) -> None:
448448
)
449449

450450

451-
def print_delegated_graph(graph_module: torch.fx.GraphModule) -> str:
451+
def print_delegated_graph(graph_module: torch.fx.GraphModule) -> None:
452452
"""
453-
Print the graph of including lowered_module (both backend id and original graph) together with the graph module. Example output:
453+
Print the formatted graph string.
454+
"""
455+
print(format_delegated_graph(graph_module))
456+
457+
458+
def format_delegated_graph(graph_module: torch.fx.GraphModule) -> str:
459+
"""
460+
Return the formatted graph string, including lowered_module (both backend id and original graph) together with the graph module. Example output:
454461
graph():
455462
%arg0_1 : [num_users=2] = placeholder[target=arg0_1]
456463
%arg1_1 : [num_users=2] = placeholder[target=arg1_1]

0 commit comments

Comments
 (0)