Commit 5cd6447

Update on "[ET-VK] Adding batch processing in x axis to conv2d dw shader by caching input texel for reuse."

This diff adds batch processing along the x axis to the conv2d dw shader by reusing the input texels that overlap between consecutive tiles. The changes modify the GLSL code for the conv2d dw output tile shader, add a new parameter to its yaml file, and update Convolution.cpp to use the new parameter.

Differential Revision: [D67868671](https://our.internmc.facebook.com/intern/diff/D67868671/)

[ghstack-poisoned]
2 parents: b6d7a76 + 7260da1
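
The reuse idea behind the shader change is easiest to see in standalone form. Below is a minimal C++ sketch, not the actual GLSL: TILE_SIZE and BATCH_SIZE_X mirror the shader's parameters, and a flat 1-D float row stands in for the texel-based layout the real shader reads.

#include <array>
#include <vector>

constexpr int TILE_SIZE = 3;     // kernel footprint along x
constexpr int BATCH_SIZE_X = 2;  // output positions computed per invocation

// Computes BATCH_SIZE_X consecutive outputs starting at out_x for one row.
// The input window covering the whole batch is TILE_SIZE + BATCH_SIZE_X - 1
// elements wide, so each input element is fetched exactly once into a local
// cache instead of once per output position.
std::array<float, BATCH_SIZE_X> dw_conv_row_batch(
    const std::vector<float>& input,
    const std::array<float, TILE_SIZE>& kernel,
    int out_x) {
  std::array<float, TILE_SIZE + BATCH_SIZE_X - 1> cache;
  for (int i = 0; i < static_cast<int>(cache.size()); ++i) {
    cache[i] = input[out_x + i];  // single load per input element
  }
  std::array<float, BATCH_SIZE_X> sums{};  // zero-initialized accumulators
  for (int j = 0; j < TILE_SIZE; ++j) {
    for (int s = 0; s < BATCH_SIZE_X; ++s) {
      sums[s] += cache[j + s] * kernel[j];  // overlapping reads hit the cache
    }
  }
  return sums;
}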

18 files changed: +133 additions, −57 deletions

.lintrunner.toml

Lines changed: 2 additions & 2 deletions

@@ -302,8 +302,8 @@ include_patterns = [
     'profiler/**/*.py',
     'runtime/**/*.py',
     'scripts/**/*.py',
-    # 'test/**/*.py',
-    # 'util/**/*.py',
+    'test/**/*.py',
+    'util/**/*.py',
     '*.py',
 ]
 exclude_patterns = [

.mypy.ini

Lines changed: 11 additions & 1 deletion

@@ -21,10 +21,14 @@ files =
     profiler,
     runtime,
     scripts,
+    test,
     util

 mypy_path = executorch

+[mypy-executorch.backends.*]
+follow_untyped_imports = True
+
 [mypy-executorch.codegen.*]
 follow_untyped_imports = True

@@ -46,6 +50,12 @@ follow_untyped_imports = True
 [mypy-executorch.runtime.*]
 follow_untyped_imports = True

+[mypy-executorch.test.*]
+follow_untyped_imports = True
+
+[mypy-functorch.*]
+follow_untyped_imports = True
+
 [mypy-requests.*]
 follow_untyped_imports = True

@@ -80,4 +90,4 @@ ignore_missing_imports = True
 ignore_missing_imports = True

 [mypy-zstd]
-ignore_missing_imports = True
+ignore_missing_imports = True

backends/cadence/fusion_g3/operators/op_add.cpp

Lines changed: 1 addition & 9 deletions

@@ -10,6 +10,7 @@

 #include <xa_nnlib_kernels_api.h>

+#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>

@@ -28,15 +29,6 @@ namespace impl {
 namespace G3 {
 namespace native {

-#define XT_KERNEL_CHECK(ctx, out, kernel, ...) \
-  const auto ret = kernel(__VA_ARGS__);        \
-  ET_KERNEL_CHECK_MSG(                         \
-      ctx,                                     \
-      ret == 0,                                \
-      InvalidArgument,                         \
-      out,                                     \
-      "Failed to run kernel: " #kernel "(" #__VA_ARGS__ ")");
-
 Tensor& add_out(
     KernelRuntimeContext& ctx,
     const Tensor& a,

backends/cadence/fusion_g3/operators/targets.bzl

Lines changed: 13 additions & 0 deletions

@@ -27,6 +27,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         deps = deps + common_deps,
         exported_deps = [
             ":operators_header",
+            ":xt_macros",
         ],
     )

@@ -61,5 +62,17 @@ def define_common_targets():
         ],
     )

+    runtime.cxx_library(
+        name = "xt_macros",
+        exported_headers = ["xt_macros.h"],
+        visibility = [
+            "//executorch/backends/cadence/...",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/kernel:kernel_runtime_context",
+        ],
+    )
+
     for op in OPERATORS:
         define_operator(op)
backends/cadence/fusion_g3/operators/xt_macros.h

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
+
+#define XT_KERNEL_CHECK(ctx, out, kernel, ...) \
+  const auto ret = kernel(__VA_ARGS__);        \
+  ET_KERNEL_CHECK_MSG(                         \
+      ctx,                                     \
+      ret == 0,                                \
+      InvalidArgument,                         \
+      out,                                     \
+      "Failed to run kernel: " #kernel "(" #__VA_ARGS__ ")");

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ void main() {
     }

     // accumulate dot product in 1st sum only until tile size
-    if (i < int(TILE_SIZE)) {
+    if (i < TILE_SIZE) {
      for (int j = 0; j < TILE_SIZE; j++, kx++) {
        prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0);
        for (int s = 0; s < BATCH_SIZE_X; s++) {

docs/TARGETS

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 load("@fbcode_macros//build_defs:native_rules.bzl", "buck_filegroup", "buck_sh_test")
 load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary")

-oncall("pytorch_r2p")
+oncall("executorch")

 python_binary(
     name = "sphinx",
extension/flat_tensor/serialize/TARGETS

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
+
+runtime.python_library(
+    name = "schema",
+    srcs = [
+        "flat_tensor_schema.py",
+    ],
+    visibility = [
+        "//executorch/...",
+    ],
+)

extension/flat_tensor/flat_tensor.fbs renamed to extension/flat_tensor/serialize/flat_tensor.fbs

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ table TensorMetadata {
   scalar_type: executorch_flatbuffer.ScalarType;

   // Size of each dimension.
-  dim_sizes: [int32];
+  sizes: [int32];

   // Specifies in what order the dimensions are laid out in memory (from outer
   // to inner).
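
As a hedged illustration of the rename's effect on generated code (not part of this commit): flatc names C++ accessors after schema fields, so the getter changes from dim_sizes() to sizes(). The namespace below is an assumption for illustration.

#include "flat_tensor_generated.h"

// Returns the rank of the described tensor; each entry of sizes() is the
// size of one dimension, outermost first.
int32_t num_dims(const flat_tensor_flatbuffer::TensorMetadata* metadata) {
  return static_cast<int32_t>(metadata->sizes()->size());
}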

extension/flat_tensor/flat_tensor_schema.py renamed to extension/flat_tensor/serialize/flat_tensor_schema.py

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@
 class TensorMetadata:
     fully_qualified_name: str
     scalar_type: ScalarType
-    dim_sizes: List[int]
+    sizes: List[int]
     dim_order: List[bytes]

     segment_index: int
extension/flat_tensor/serialize/targets.bzl

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    runtime.genrule(
+        name = "gen_schema",
+        srcs = [
+            "flat_tensor.fbs",
+            "scalar_type.fbs",
+        ],
+        outs = {
+            "schema_generated.h": ["flat_tensor_generated.h"],
+            "scalar_type_generated.h": ["scalar_type_generated.h"]
+        },
+        cmd = " ".join([
+            "$(exe {})".format(runtime.external_dep_location("flatc")),
+            "--cpp",
+            "--cpp-std c++11",
+            "--scoped-enums",
+            "-o ${OUT}",
+            "${SRCS}",
+        ]),
+        default_outs = ["."],
+    )
+
+    runtime.cxx_library(
+        name = "generated_headers",
+        srcs = [],
+        visibility = [
+            "//executorch/...",
+        ],
+        exported_headers = {
+            "schema_generated.h": ":gen_schema[schema_generated.h]",
+            "scalar_type_generated.h": ":gen_schema[scalar_type_generated.h]",
+        },
+        exported_external_deps = ["flatbuffers-api"],
+    )

test/end2end/exported_module.py

Lines changed: 6 additions & 14 deletions

@@ -126,9 +126,7 @@ def return_wrapper():
     trace_inputs_method = "get_upper_bound_inputs"
     get_trace_inputs = get_inputs_adapter(
         (
-            # pyre-fixme[6]: For 1st argument expected `(...) -> Any` but got
-            # `Union[Module, Tensor]`.
-            getattr(eager_module, trace_inputs_method)
+            getattr(eager_module, trace_inputs_method)  # type: ignore[arg-type]
             if hasattr(eager_module, trace_inputs_method)
             else eager_module.get_random_inputs
         ),

@@ -144,18 +142,14 @@ def return_wrapper():
     if hasattr(eager_module, "get_dynamic_shapes"):
         assert capture_config is not None
         assert capture_config.enable_aot is True
-        # pyre-fixme[29]: `Union[nn.modules.module.Module,
-        # torch._tensor.Tensor]` is not a function.
-        trace_dynamic_shapes = eager_module.get_dynamic_shapes()
+        trace_dynamic_shapes = eager_module.get_dynamic_shapes()  # type: ignore[operator]
         method_name_to_dynamic_shapes = {}
         for method in methods:
             method_name_to_dynamic_shapes[method] = trace_dynamic_shapes

     memory_planning_pass = MemoryPlanningPass()
     if hasattr(eager_module, "get_memory_planning_pass"):
-        # pyre-fixme[29]: `Union[nn.modules.module.Module,
-        # torch._tensor.Tensor]` is not a function.
-        memory_planning_pass = eager_module.get_memory_planning_pass()
+        memory_planning_pass = eager_module.get_memory_planning_pass()  # type: ignore[operator]

     class WrapperModule(nn.Module):
         def __init__(self, method):

@@ -172,7 +166,7 @@ def __init__(self, method):
             assert method_name == "forward"
             ep = _export(
                 eager_module,
-                method_input,
+                method_input,  # type: ignore[arg-type]
                 dynamic_shapes=(
                     method_name_to_dynamic_shapes[method_name]
                     if method_name_to_dynamic_shapes

@@ -184,7 +178,7 @@ def __init__(self, method):
         else:
             exported_methods[method_name] = export(
                 eager_module,
-                method_input,
+                method_input,  # type: ignore[arg-type]
                 dynamic_shapes=(
                     method_name_to_dynamic_shapes[method_name]
                     if method_name_to_dynamic_shapes

@@ -220,9 +214,7 @@ def __init__(self, method):

     # Get a function that creates random inputs appropriate for testing.
     get_random_inputs_fn = get_inputs_adapter(
-        # pyre-fixme[6]: For 1st argument expected `(...) -> Any` but got
-        # `Union[Module, Tensor]`.
-        eager_module.get_random_inputs,
+        eager_module.get_random_inputs,  # type: ignore[arg-type]
         # all exported methods must have the same signature so just pick the first one.
         methods[0],
     )

test/end2end/test_end2end.py

Lines changed: 1 addition & 5 deletions

@@ -52,9 +52,7 @@
 kernel_mode = None  # either aten mode or lean mode
 try:
     from executorch.extension.pybindings.portable_lib import (
-        _load_bundled_program_from_buffer,
         _load_for_executorch_from_buffer,
-        _load_for_executorch_from_bundled_program,
     )

     kernel_mode = "lean"

@@ -63,10 +61,8 @@
     pass

 try:
-    from executorch.extension.pybindings.aten_lib import (
-        _load_bundled_program_from_buffer,
+    from executorch.extension.pybindings.aten_lib import (  # type: ignore[import-not-found]
         _load_for_executorch_from_buffer,
-        _load_for_executorch_from_bundled_program,
     )

     assert kernel_mode is None

test/models/export_delegated_program.py

Lines changed: 2 additions & 4 deletions

@@ -118,9 +118,7 @@ def export_module_to_program(
     eager_module = module_class().eval()
     inputs = ()
     if hasattr(eager_module, "get_random_inputs"):
-        # pyre-fixme[29]: `Union[nn.modules.module.Module, torch._tensor.Tensor]` is
-        # not a function.
-        inputs = eager_module.get_random_inputs()
+        inputs = eager_module.get_random_inputs()  # type: ignore[operator]

     class WrapperModule(torch.nn.Module):
         def __init__(self, fn):

@@ -153,7 +151,7 @@ def forward(self, *args, **kwargs):
         ).to_executorch(config=et_config)
     else:
         edge: exir.EdgeProgramManager = to_edge(exported_program)
-        lowered_module = to_backend(
+        lowered_module = to_backend(  # type: ignore[call-arg]
             backend_id, edge.exported_program(), compile_specs=[]
         )

test/models/generate_linear_out_bundled_program.py

Lines changed: 3 additions & 1 deletion

@@ -27,7 +27,9 @@
 from executorch.exir.passes import MemoryPlanningPass, ToOutVarPass
 from executorch.exir.print_program import pretty_print

-from executorch.test.models.linear_model import LinearModel
+from executorch.test.models.linear_model import (  # type: ignore[import-not-found]
+    LinearModel,
+)
 from torch.export import export

util/activation_memory_profiler.py

Lines changed: 10 additions & 11 deletions

@@ -9,7 +9,7 @@
 import json
 import typing
 from dataclasses import dataclass, field
-from typing import List
+from typing import Any, Dict, List, Optional

 import executorch.exir.memory as memory
 import torch

@@ -52,7 +52,7 @@ def create_tensor_allocation_info(graph: torch.fx.Graph) -> List[MemoryTimeline]
     allocations at that timestep.
     """
     nodes = graph.nodes
-    memory_timeline = [None] * len(nodes)
+    memory_timeline: List[Optional[MemoryTimeline]] = [None for _ in range(len(nodes))]
     for _, node in enumerate(nodes):
         if node.op == "output":
             continue

@@ -72,11 +72,11 @@ def create_tensor_allocation_info(graph: torch.fx.Graph) -> List[MemoryTimeline]
         stack_trace = node.meta.get("stack_trace")
         fqn = _get_module_hierarchy(node)
         for j in range(start, end + 1):
-            if memory_timeline[j] is None:
-                # pyre-ignore
-                memory_timeline[j] = MemoryTimeline()
-            # pyre-ignore
-            memory_timeline[j].allocations.append(
+            memory_timeline_j = memory_timeline[j]
+            if memory_timeline_j is None:
+                memory_timeline_j = MemoryTimeline()
+            assert memory_timeline_j
+            memory_timeline_j.allocations.append(
                 Allocation(
                     node.name,
                     node.target,

@@ -87,8 +87,7 @@ def create_tensor_allocation_info(graph: torch.fx.Graph) -> List[MemoryTimeline]
                     stack_trace,
                 )
             )
-    # pyre-ignore
-    return memory_timeline
+    return memory_timeline  # type: ignore[return-value]


 def _validate_memory_planning_is_done(exported_program: ExportedProgram):

@@ -129,7 +128,7 @@ def generate_memory_trace(

     memory_timeline = create_tensor_allocation_info(exported_program.graph)
     root = {}
-    trace_events = []
+    trace_events: List[Dict[str, Any]] = []
     root["traceEvents"] = trace_events

     tid = 0

@@ -138,7 +137,7 @@ def generate_memory_trace(
         if memory_timeline_event is None:
             continue
         for allocation in memory_timeline_event.allocations:
-            e = {}
+            e: Dict[str, Any] = {}
            e["name"] = allocation.name
            e["cat"] = "memory_allocation"
            e["ph"] = "X"
