Skip to content

Commit 8178226

Browse files
committed
Update on "[ET-VK][14/n] Add operators to Partitioner"
1. Register aten operators in the vulkan partitioner. 2. Fix some minor operator name issues caused by mismatches between the torch API and the actual aten names. Note: Permute is not yet registered due to tensor movement issues with the "Partial" model, where the `Linear` operator is decomposed into `permute` and `addmm`. This will be fixed in later diffs. Differential Revision: [D56695929](https://our.internmc.facebook.com/intern/diff/D56695929/) [ghstack-poisoned]
2 parents 567cd8c + 16b47fc commit 8178226

File tree

17 files changed

+642
-505
lines changed

17 files changed

+642
-505
lines changed

.github/workflows/doc-build.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,11 @@ jobs:
9494
# Get github.ref for the output doc folder. By default "main"
9595
# If matches a tag like refs/tags/v1.12.0-rc3 or
9696
# refs/tags/v1.12.0 convert to 1.12
97-
GITHUB_REF=${{ github.ref }}
97+
export GITHUB_REF=${{ github.ref }}
9898
9999
# Convert refs/tags/v1.12.0rc3 into 1.12.
100100
# Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13
101-
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\\.[0-9]+)\\. ]]; then
101+
if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+) ]]; then
102102
TARGET_FOLDER="${BASH_REMATCH[1]}"
103103
else
104104
TARGET_FOLDER="main"

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ endif()
120120
# disables exceptions and runtime type.
121121
set(CMAKE_CXX_FLAGS_RELEASE
122122
"-ffunction-sections -fdata-sections -fno-exceptions -fno-rtti")
123-
if(NOT APPLE)
123+
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
124124
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
125125
endif()
126126

backends/xnnpack/runtime/XNNExecutor.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ __ET_NODISCARD Error XNNExecutor::prepare_args(EValue** args) {
8787
if (i < input_ids_.size()) {
8888
size_t num_dims = tensor->dim();
8989
size_t dims[XNN_MAX_TENSOR_DIMS];
90+
ET_CHECK_OR_RETURN_ERROR(
91+
num_dims <= XNN_MAX_TENSOR_DIMS,
92+
InvalidArgument,
93+
"XNNPACK backend accepts tensors with at most %d dims, but got %zu",
94+
XNN_MAX_TENSOR_DIMS,
95+
num_dims);
9096
for (int d = 0; d < num_dims; ++d) {
9197
dims[d] = tensor->size(d);
9298
}

backends/xnnpack/targets.bzl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,11 @@ def define_common_targets():
3838
preprocessor_flags = [
3939
# "-DENABLE_XNNPACK_PROFILING",
4040
],
41+
exported_deps = [
42+
"//executorch/runtime/backend:interface",
43+
],
4144
deps = [
4245
third_party_dep("XNNPACK"),
43-
"//executorch/runtime/backend:interface",
4446
"//executorch/backends/xnnpack/serialization:xnnpack_flatbuffer_header",
4547
"//executorch/backends/xnnpack/threadpool:threadpool",
4648
"//executorch/runtime/core/exec_aten/util:tensor_util",
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
10+
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
11+
#include <gtest/gtest.h>
12+
#include <xnnpack/subgraph.h>
13+
14+
using torch::executor::Error;
15+
using torch::executor::EValue;
16+
using torch::executor::testing::TensorFactory;
17+
using torch::executor::xnnpack::delegate::XNNExecutor;
18+
19+
TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
20+
XNNExecutor executor;
21+
xnn_subgraph_t subgraph = nullptr;
22+
xnn_runtime_t rt = nullptr;
23+
et_pal_init();
24+
ASSERT_EQ(xnn_initialize(nullptr), xnn_status_success);
25+
ASSERT_EQ(xnn_create_subgraph(2, 0, &subgraph), xnn_status_success);
26+
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
27+
subgraph, xnn_delete_subgraph);
28+
29+
auto input_id = XNN_INVALID_NODE_ID;
30+
std::vector<size_t> dims = {
31+
1,
32+
};
33+
ASSERT_EQ(
34+
xnn_status_success,
35+
xnn_define_quantized_tensor_value(
36+
subgraph,
37+
xnn_datatype_qint8,
38+
0,
39+
1,
40+
dims.size(),
41+
dims.data(),
42+
nullptr,
43+
/*external_id=*/0,
44+
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
45+
&input_id));
46+
ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
47+
48+
auto output_id = XNN_INVALID_NODE_ID;
49+
ASSERT_EQ(
50+
xnn_status_success,
51+
xnn_define_quantized_tensor_value(
52+
subgraph,
53+
xnn_datatype_qint8,
54+
0,
55+
1,
56+
dims.size(),
57+
dims.data(),
58+
nullptr,
59+
/*external_id=*/0,
60+
/*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
61+
&output_id));
62+
ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
63+
64+
ASSERT_EQ(
65+
xnn_status_success,
66+
xnn_define_clamp(subgraph, 1, 2, input_id, output_id, 0));
67+
68+
ASSERT_EQ(xnn_create_runtime(subgraph, &rt), xnn_status_success);
69+
EXPECT_EQ(
70+
executor.initialize(
71+
rt,
72+
{
73+
0,
74+
},
75+
{
76+
1,
77+
}),
78+
Error::Ok);
79+
TensorFactory<exec_aten::ScalarType::Int> tf;
80+
auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});
81+
ASSERT_EQ(input_tensor.dim(), 9);
82+
auto output_tensor = tf.make(
83+
{
84+
1,
85+
},
86+
{
87+
1,
88+
});
89+
EValue input_ev(input_tensor);
90+
EValue output_ev(output_tensor);
91+
std::array<EValue*, 2> args = {&input_ev, &output_ev};
92+
// Check for invalid number of dimensions should fail without stack overflow.
93+
EXPECT_EQ(executor.prepare_args(args.data()), Error::InvalidArgument);
94+
}

backends/xnnpack/test/targets.bzl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
load("@fbsource//xplat/executorch/backends/xnnpack/third-party:third_party_libs.bzl", "third_party_dep")
12
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
23

34
def define_common_targets():
@@ -17,3 +18,14 @@ def define_common_targets():
1718
"//executorch/backends/xnnpack:dynamic_quant_utils",
1819
],
1920
)
21+
22+
runtime.cxx_test(
23+
name = "xnnexecutor_test",
24+
srcs = ["runtime/test_xnnexecutor.cpp"],
25+
deps = [
26+
third_party_dep("XNNPACK"),
27+
"//executorch/runtime/core/exec_aten/testing_util:tensor_util",
28+
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
29+
"//executorch/backends/xnnpack:xnnpack_backend",
30+
],
31+
)

docs/source/debug-backend-delegate.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@ Number of non-delegated nodes: 430
3939
From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs. Users can use information like this to debug.
4040

4141
## Visualize delegated graph
42-
To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph:
42+
To see a more detailed view, use the `format_delegated_graph()` method to get a string printout of the whole graph, or use `print_delegated_graph()` to print it directly:
4343

4444
```python
45-
from executorch.exir.backend.utils import print_delegated_graph
45+
from executorch.exir.backend.utils import format_delegated_graph
4646
graph_module = edge_manager.exported_program().graph_module
47-
print(print_delegated_graph(graph_module))
47+
print(format_delegated_graph(graph_module)) # or call print_delegated_graph(graph_module)
4848
```
4949
It will print the whole model as well as the subgraph consumed by the backend. The generic debug functions provided by fx, like `print_tabular()` or `print_readable()`, will only show `call_delegate` but hide the subgraph consumed by the backend, while this function exposes the contents inside the subgraph.
5050

docs/source/llm/getting-started.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -721,12 +721,12 @@ Number of non-delegated nodes: 430
721721
| 26 | Total | 473 | 430 |
722722

723723
From the table, the operator `aten_view_copy_default` appears 170 times in delegate graphs and 48 times in non-delegated graphs.
724-
To see a more detailed view, use the `print_delegated_graph()` method to display a printout of the whole graph.
724+
# To see a more detailed view, use the `format_delegated_graph()` method to get a formatted string printout of the whole graph, or use `print_delegated_graph()` to print it directly:
725725

726726
```python
727-
from executorch.exir.backend.utils import print_delegated_graph
727+
from executorch.exir.backend.utils import format_delegated_graph
728728
graph_module = edge_manager.exported_program().graph_module
729-
print(print_delegated_graph(graph_module))
729+
print(format_delegated_graph(graph_module))
730730
```
731731
This may generate a large amount of output for large models. Consider using "Control+F" or "Command+F" to locate the operator you’re interested in
732732
(e.g. “aten_view_copy_default”). Observe which instances are not under lowered graphs.

docs/source/tutorials_source/sdk-integration-tutorial.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,10 +172,24 @@ def forward(self, x):
172172
# Use CMake (follow `these instructions <../runtime-build-and-cross-compilation.html#configure-the-cmake-build>`__ to set up cmake) to execute the Bundled Program to generate the ``ETDump``::
173173
#
174174
# cd executorch
175-
# rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DEXECUTORCH_BUILD_SDK=1 -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=1 ..
176-
# cd ..
177-
# cmake --build cmake-out -j8 -t sdk_example_runner
178-
# ./cmake-out/examples/sdk/sdk_example_runner --bundled_program_path <bundled_program>
175+
# rm -rf cmake-out
176+
# cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
177+
# -DCMAKE_BUILD_TYPE=Release \
178+
# -DEXECUTORCH_BUILD_SDK=ON \
179+
# -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
180+
# -Bcmake-out .
181+
# cmake --build cmake-out -j9 --target install --config Release
182+
#
183+
# local example_dir=examples/sdk
184+
# local build_dir=cmake-out/${example_dir}
185+
# CMAKE_PREFIX_PATH="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
186+
# rm -rf ${build_dir}
187+
# cmake -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
188+
# -DCMAKE_BUILD_TYPE=Release \
189+
# -B${build_dir} \
190+
# ${example_dir}
191+
# cmake --build ${build_dir} -j9 --config Release
192+
# ${build_dir}/sdk_example_runner --bundled_program_path="bundled_program.bp"
179193

180194
######################################################################
181195
# Creating an Inspector

examples/models/llama2/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Note that groupsize less than 128 was not enabled, since such model were still t
3737

3838
We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12.
3939

40-
For Llama 3 8B, we have verified so far on iPhone 15 Pro Max and OnePlus 12 (with 16GB RAM).
40+
For Llama 3 8B, we have verified so far on iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM).
4141

4242
## Performance
4343

examples/models/llama2/builder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from executorch.exir import EdgeProgramManager
2222
from executorch.exir.backend.partitioner import Partitioner
2323

24-
from executorch.exir.backend.utils import print_delegated_graph
24+
from executorch.exir.backend.utils import format_delegated_graph
2525
from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig
2626

2727
from executorch.exir.passes import MemoryPlanningPass
@@ -283,7 +283,7 @@ def export_to_edge(
283283
dynamic_shapes=dynamic_shape,
284284
edge_constant_methods=metadata,
285285
edge_compile_config=edge_config,
286-
verbose=True,
286+
verbose=self.verbose,
287287
)
288288
return self
289289

@@ -308,7 +308,7 @@ def to_backend(
308308
self.edge_manager = self.edge_manager.to_backend(partitioner)
309309
if self.verbose:
310310
logging.info(
311-
print_delegated_graph(
311+
format_delegated_graph(
312312
self.edge_manager.exported_program().graph_module
313313
)
314314
)

exir/backend/test/test_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@
1616
from executorch.exir.backend.test.op_partitioner_demo import AddMulPartitionerDemo
1717
from executorch.exir.backend.utils import (
1818
DelegationBreakdown,
19+
format_delegated_graph,
1920
get_delegates,
2021
get_delegation_info,
2122
get_non_lowered_nodes,
2223
is_identical_graph,
23-
print_delegated_graph,
2424
)
2525

2626
from executorch.exir.dialects._ops import bind_pattern_to_op, ops as exir_ops
@@ -266,7 +266,7 @@ def forward(self, a, x, b):
266266

267267
edge = to_edge(export(m, inputs)).to_backend(AddMulPartitionerDemo())
268268

269-
graph_str = print_delegated_graph(edge.exported_program().graph_module)
269+
graph_str = format_delegated_graph(edge.exported_program().graph_module)
270270
self.assertIn(
271271
"BackendWithCompilerDemo",
272272
graph_str,

exir/backend/utils.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -448,9 +448,16 @@ def _insert_op_occurrences_dict(node_name: str, delegated: bool) -> None:
448448
)
449449

450450

451-
def print_delegated_graph(graph_module: torch.fx.GraphModule) -> str:
451+
def print_delegated_graph(graph_module: torch.fx.GraphModule) -> None:
452452
"""
453-
Print the graph of including lowered_module (both backend id and original graph) together with the graph module. Example output:
453+
Print the formatted graph string.
454+
"""
455+
print(format_delegated_graph(graph_module))
456+
457+
458+
def format_delegated_graph(graph_module: torch.fx.GraphModule) -> str:
459+
"""
460+
Return the formatted graph string, including lowered_module (both backend id and original graph) together with the graph module. Example output:
454461
graph():
455462
%arg0_1 : [num_users=2] = placeholder[target=arg0_1]
456463
%arg1_1 : [num_users=2] = placeholder[target=arg1_1]

0 commit comments

Comments
 (0)