Commit ae345a3 (2 parents: 4d8d85d + 60a46f5)

Update on "[ET-VK] Use push constants for image and buffer to nchw prepack nodes."

Differential Revision: [D75702273](https://our.internmc.facebook.com/intern/diff/D75702273/) [ghstack-poisoned]

92 files changed: +1431 -753 lines

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ cmake --build cmake-out -j16 --target install --config Release
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \

.ci/scripts/test_model.sh

Lines changed: 19 additions & 23 deletions
@@ -49,14 +49,24 @@ prepare_artifacts_upload() {
 }
 
 build_cmake_executor_runner() {
+  local backend_string_select="${1:-}"
   echo "Building executor_runner"
   rm -rf ${CMAKE_OUTPUT_DIR}
-  cmake -DCMAKE_BUILD_TYPE=Debug \
-      -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-      -B${CMAKE_OUTPUT_DIR} .
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  mkdir ${CMAKE_OUTPUT_DIR}
+  if [[ "$backend_string_select" == "XNNPACK" ]]; then
+    echo "Backend $backend_string_select selected"
+    (cd ${CMAKE_OUTPUT_DIR} \
+      && cmake -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4
+  else
+    cmake -DCMAKE_BUILD_TYPE=Debug \
+      -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -B${CMAKE_OUTPUT_DIR} .
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  fi
 }
 
 run_portable_executor_runner() {

@@ -111,19 +121,6 @@ test_model() {
   run_portable_executor_runner
 }
 
-build_cmake_xnn_executor_runner() {
-  echo "Building xnn_executor_runner"
-
-  (rm -rf ${CMAKE_OUTPUT_DIR} \
-    && mkdir ${CMAKE_OUTPUT_DIR} \
-    && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DCMAKE_BUILD_TYPE=Release \
-      -DEXECUTORCH_BUILD_XNNPACK=ON \
-      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4
-}
-
 test_model_with_xnnpack() {
   WITH_QUANTIZATION=$1
   WITH_DELEGATION=$2

@@ -148,12 +145,11 @@ test_model_with_xnnpack() {
 
   # Run test model
   if [[ "${BUILD_TOOL}" == "buck2" ]]; then
+    # TODO eventually buck should also use consolidated executor runners
    buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}"
   elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-    if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then
-      build_cmake_xnn_executor_runner
-    fi
-    ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}"
+    build_cmake_executor_runner "XNNPACK"
+    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "${OUTPUT_MODEL_PATH}"
   else
     echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
     exit 1

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ build_executorch_runner() {
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
-  retry cmake -DBUCK2="$BUCK" \
-      -DCMAKE_INSTALL_PREFIX=cmake-out \
+  retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
       -DCMAKE_BUILD_TYPE=Release \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
       -Bcmake-out .

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
 
 #################
 ## linear_qcnw ##

@@ -224,6 +225,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             )
 
         graph_module.recompile()
-        graph_module = super().call(graph_module).graph_module
+        dead_code_elimination_pass(graph_module)
 
+        # Re-trace the graph since new nodes were (potentially) inserted
+        graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)

backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 import torch
 import torch.nn.functional as F
 
-from torchao.quantization.GPTQ import _check_linear_int4_k
+from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k
 from torchao.quantization.unified import Quantizer
 from torchao.quantization.utils import groupwise_affine_quantize_tensor
 

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from copy import deepcopy
 from typing import Any, Optional, Set
 
 import executorch.backends.vulkan.utils as utils

@@ -22,6 +21,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.tensor import TensorSpec
 
 logger: logging.Logger = logging.getLogger("")
 logger.setLevel(logging.INFO)

@@ -52,7 +52,7 @@ def insert_transition_node(
         (arg,),
     )
     clone_node.meta["val"] = arg.meta["val"]
-    clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
+    clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"])
     clone_node.meta["spec"].const = False
     set_memory_metadata(clone_node, storage, layout)
     arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)

backends/vulkan/op_registry.py

Lines changed: 25 additions & 8 deletions
@@ -230,6 +230,14 @@ def update_features_impl(op: OpKey):
         exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         # Symbolic integer ops
         torch.ops.aten.sym_size.int,
+        operator.add,
+        operator.lt,
+        operator.gt,
+        operator.ge,
+        operator.le,
+        # Guard and assert ops
+        torch.ops.aten._assert_scalar.default,
+        torch.ops.aten.sym_constrain_range_for_size.default,
     ]
 )
 def register_ephemeral_op(features: OpFeatures):

@@ -500,7 +508,12 @@ def register_sdpa_with_kv_cache_op(features: OpFeatures):
     return features
 
 
-@update_features(["llama::update_cache", "llama::custom_sdpa"])
+@update_features(
+    [
+        "llama::update_cache",
+        "llama::custom_sdpa",
+    ]
+)
 def register_sdpa_ops(features: OpFeatures):
     features.resize_fn = False
     features.buffer_impl = False

@@ -520,8 +533,17 @@ def register_rotary_emb_op(features: OpFeatures):
     return features
 
 
-@update_features(exir_ops.edge.aten.view_copy.default)
-def register_view_op(features: OpFeatures):
+@update_features(
+    [
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.permute.default,
+        exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.select_copy.int,
+        exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.view_copy.default,
+    ]
+)
+def register_view_ops(features: OpFeatures):
     features.texture_impl = TextureImplFeatures(
         valid_packed_dims=all_packed_dims,
     )

@@ -538,10 +560,8 @@ def register_view_op(features: OpFeatures):
         # Indexing and lookup
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
-        exir_ops.edge.aten.select_copy.int,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
-        exir_ops.edge.aten.clone.default,
         exir_ops.edge.aten.constant_pad_nd.default,
         exir_ops.edge.aten.full.default,
         exir_ops.edge.aten.full_like.default,

@@ -564,12 +584,9 @@ def register_ported_op(features: OpFeatures):
 # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry becasue they support all packed dimensions
 @update_features(
     [
-        # Indexing and lookup
-        exir_ops.edge.aten.slice_copy.Tensor,
         # Shape Manipulation
         exir_ops.edge.aten.squeeze_copy.dims,
         exir_ops.edge.aten.unsqueeze_copy.default,
-        exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.repeat.default,

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 3 deletions
@@ -146,10 +146,11 @@ def op_node_is_compatible( # noqa: C901: Function is too complex
     def node_is_compatible(
         self, node: torch.fx.Node, features: Optional[OpFeatures] = None
     ) -> Tuple[bool, str]:
-        if utils.is_symint_node(node):
-            return node.target in vulkan_supported_ops, "Op is compatible"
-        elif utils.is_tensor_node(node):
+        if utils.is_tensor_node(node):
             return self.op_node_is_compatible(node, features=features)
+        # For non-tensor nodes, just check if the op is registered
+        elif hasattr(node, "target"):
+            return node.target in vulkan_supported_ops, "Op is compatible"
 
         return False, f"Unsupported node type: {node.format_node()}"
 

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 9 additions & 0 deletions
@@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
   return idx;
 }
 
+ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
+  for (int i = 0; i < values_.size(); ++i) {
+    if (values_.at(i).isInt() && values_.at(i).toInt() == val) {
+      return i;
+    }
+  }
+  return add_scalar(val);
+}
+
 ValueRef ComputeGraph::set_input_tensor(
     const ValueRef idx,
     const bool use_staging) {

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 7 additions & 0 deletions
@@ -604,6 +604,13 @@ class ComputeGraph final {
 
   ValueRef add_symint(const int32_t val);
 
+  /*
+   * Searches the graph's value list for a Int value with the specified value.
+   * If one is found, returns the index of the value. Otherwise, add a new value
+   * and return the index of the new value.
+   */
+  ValueRef get_or_add_value_for_int(const int64_t val);
+
   ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
   ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
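Note: the lookup-or-append behavior of get_or_add_value_for_int is easy to see in isolation. Below is a minimal standalone C++ sketch of the same pattern, specialized to a plain list of ints; the real method scans the graph's heterogeneous values_ list via isInt()/toInt() and falls back to add_scalar, as in the ComputeGraph.cpp hunk above. The names here are illustrative, not the ExecuTorch API.

#include <cstdint>
#include <vector>

using ValueRef = int32_t;

// Reuse an existing int value when one matches; otherwise append a new
// value and return its index (mirrors the linear scan shown above).
ValueRef get_or_add_int(std::vector<int64_t>& values, int64_t val) {
  for (size_t i = 0; i < values.size(); ++i) {
    if (values[i] == val) {
      return static_cast<ValueRef>(i);
    }
  }
  values.push_back(val);
  return static_cast<ValueRef>(values.size() - 1);
}

Repeated calls with the same constant thus hand back one shared ValueRef instead of accumulating duplicate scalar entries, at the cost of an O(n) scan over the value list.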

backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp

Lines changed: 2 additions & 3 deletions
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
     const ResizeFunction& resize_fn)
     : DispatchNode(
           graph,
-          vkapi::ShaderInfo(),
-          {1u, 1u, 1u},
+          pick_shader_fn(&graph, args, resize_args),
           {1u, 1u, 1u},
+          {8u, 8u, 1u},
           args,
           params,
           push_constants,

@@ -37,7 +37,6 @@ DynamicDispatchNode::DynamicDispatchNode(
       pick_shader_fn_(pick_shader_fn),
       pick_global_wg_fn_(pick_global_wg_fn),
       pick_local_wg_fn_(pick_local_wg_fn) {
-  shader_ = pick_shader_fn(&graph, args, resize_args);
   global_workgroup_size_ =
       pick_global_wg_fn(&graph, shader_, args, resize_args);
   local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(
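Note: the constructor change above follows a standard C++ pattern: compute a base-class constructor argument with a callback in the member initializer list instead of default-constructing a placeholder and overwriting it in the constructor body. A minimal standalone sketch, with simplified stand-in types rather than the real vkapi::ShaderInfo/DispatchNode interfaces:

#include <string>

struct ShaderInfo {
  std::string name;
};

struct DispatchNode {
  explicit DispatchNode(const ShaderInfo& shader) : shader_(shader) {}
  ShaderInfo shader_;
};

using PickShaderFn = ShaderInfo (*)(int arg);

struct DynamicDispatchNode : DispatchNode {
  // The picker runs once in the initializer list, so the base class never
  // holds a placeholder shader that must be patched afterwards.
  DynamicDispatchNode(PickShaderFn pick, int arg)
      : DispatchNode(pick(arg)), pick_shader_fn_(pick) {}

  PickShaderFn pick_shader_fn_;
};

The hunk also bumps the local workgroup size handed to the base class from {1u, 1u, 1u} to {8u, 8u, 1u}; the pick_local_wg_fn result computed in the body still overrides it.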
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type(STORAGE)}
+
+#include "indexing_utils.h"
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+$if STORAGE == "buffer":
+  ${layout_declare_ubo(2, "int", "numel")}
+$else:
+  ${layout_declare_ubo(2, "ivec3", "out_limits")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "activations.h"
+
+#ifdef USING_BUFFER
+
+void main() {
+  const int i = int(gl_GlobalInvocationID.x);
+  if (i >= numel) {
+    return;
+  }
+
+  float in_val = float(t_in[i]);
+  t_out[i] = T(tan(in_val));
+}
+
+#else
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  VEC4_T in_texel = texelFetch(t_in, pos, 0);
+  imageStore(t_out, pos, VEC4_T(tan(in_texel)));
+}
+
+#endif
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+tan:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+    STORAGE:
+      - VALUE: texture3d
+      - VALUE: buffer
+  shader_variants:
+    - NAME: tan
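Note: the YAML above expands the templated shader into one variant per (DTYPE, STORAGE) pair, i.e. four tan variants. Dispatch code then resolves a variant by suffixing the base kernel name, in the style of the add_storage_type_suffix/add_dtype_suffix calls visible in BinaryOp.cpp below. A hypothetical sketch of that naming scheme; the exact suffix order and spelling are assumptions, not taken from the codegen:

#include <iostream>
#include <string>

// Hypothetical helper: base name, then storage suffix, then dtype suffix.
std::string tan_variant_name(const std::string& storage, const std::string& dtype) {
  return "tan_" + storage + "_" + dtype;
}

int main() {
  // One name per generate_variant_forall combination in the YAML above.
  for (const std::string storage : {"texture3d", "buffer"}) {
    for (const std::string dtype : {"half", "float"}) {
      std::cout << tan_variant_name(storage, dtype) << "\n";
    }
  }
  return 0;
}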

backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp

Lines changed: 9 additions & 8 deletions
@@ -8,6 +8,7 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>

@@ -30,8 +31,8 @@ void check_binary_op_args(
 void resize_binary_op_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
 
   // TODO(T183442143): Verify tensors are broadcastable.

@@ -78,11 +79,11 @@ void add_binary_op_texture_node(
   add_storage_type_suffix(kernel_name, *t_out);
   add_dtype_suffix(kernel_name, *t_out);
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{arg1, arg2}, vkapi::kRead}},
       // Shader params buffers

@@ -122,11 +123,11 @@ void add_binary_op_buffer_node(
   add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{in1, in2}, vkapi::kRead}},
       // Shader params buffers
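Note: replacing DispatchNode (with workgroup sizes baked in via create_global_wg_size(out)/create_local_wg_size(out)) by DynamicDispatchNode (with the default_pick_global_wg_size/default_pick_local_wg_size callbacks from impl/Common.h) defers launch-size computation to a callback that can be re-run when tensor shapes change. A standalone sketch of the idea, using simplified types rather than the real ExecuTorch interfaces:

#include <array>
#include <cstdint>
#include <functional>

using WgSize = std::array<uint32_t, 3>;

struct Node {
  // Stored picker: recomputes the global workgroup size from the current
  // element count instead of freezing it at construction time.
  std::function<WgSize(uint32_t)> pick_global_wg;
  WgSize global_wg{1u, 1u, 1u};

  Node(std::function<WgSize(uint32_t)> picker, uint32_t numel)
      : pick_global_wg(std::move(picker)), global_wg(pick_global_wg(numel)) {}

  void resize(uint32_t new_numel) {
    // Re-run the picker so the launch size follows the new shape.
    global_wg = pick_global_wg(new_numel);
  }
};

int main() {
  Node n([](uint32_t numel) { return WgSize{(numel + 63u) / 64u, 1u, 1u}; }, 1024u);
  n.resize(4096u);  // global_wg becomes {64, 1, 1}
  return 0;
}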
