
Commit 96adfcf

Update on "[kernel] Add template based unboxing"

Adds a new feature that allows users to bypass codegen and register their kernels directly. This is particularly useful for custom kernels implementing custom ops. Example usage:

```
Tensor& my_op(RuntimeContext& ctx, const Tensor& self, const Tensor& other, Tensor& out) {
  // ...
  return out;
}

Kernel my_kernel = Kernel::make_boxed_kernel("my_ns::my_op", EXECUTORCH_FN(my_op));
register_kernels({my_kernel});
```

Differential Revision: [D51553099](https://our.internmc.facebook.com/intern/diff/D51553099)

[ghstack-poisoned]
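To make the intended flow concrete, here is a slightly fuller sketch of how the pieces might fit together in one translation unit. `Tensor`, `RuntimeContext`, `Kernel`, `EXECUTORCH_FN`, and `register_kernels` are taken from the example above; the include paths, namespace qualifications, and the `register_my_kernels()` wrapper are illustrative assumptions, not part of this diff:

```
// Sketch only: the include paths below are assumptions and may differ
// from the actual tree layout.
#include <executorch/runtime/core/exec_aten/exec_aten.h>   // Tensor (assumed)
#include <executorch/runtime/kernel/operator_registry.h>   // Kernel, register_kernels (assumed)

using exec_aten::Tensor;                 // assumed alias location
using torch::executor::Kernel;           // assumed namespace
using torch::executor::RuntimeContext;   // assumed namespace
using torch::executor::register_kernels; // assumed namespace

// An unboxed kernel: a plain typed C++ function. Template-based unboxing
// generates the boxed-to-unboxed glue at compile time, so no codegen step
// is needed to make it callable by the runtime.
Tensor& my_op(RuntimeContext& ctx, const Tensor& self, const Tensor& other, Tensor& out) {
  (void)ctx;  // ctx would be used for error reporting / temp allocation
  // ... compute `out` from `self` and `other` ...
  return out;
}

// Hypothetical init helper: wrap the typed function into a boxed Kernel
// and register it under its fully qualified operator name, once at startup.
void register_my_kernels() {
  Kernel my_kernel =
      Kernel::make_boxed_kernel("my_ns::my_op", EXECUTORCH_FN(my_op));
  register_kernels({my_kernel});
}
```

The name string passed to `make_boxed_kernel` is what the runtime matches against the operators referenced by an exported program, so it has to line up with the schema the model was exported against.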
2 parents: e927f11 + 570c0aa

324 files changed: +11141 additions, -4510 deletions


.github/workflows/update-viablestrict.yml

Lines changed: 1 addition & 1 deletion

@@ -20,6 +20,6 @@ jobs:
       with:
         repository: pytorch/executorch
         stable-branch: viable/strict
-        requires: '[\"pull\", \"lint\", \"Build documentation\"]'
+        requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\"]'
        secret-bot-token: ${{ secrets.UPDATEBOT_TOKEN }}
        rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}

CMakeLists.txt

Lines changed: 37 additions & 22 deletions

@@ -98,7 +98,7 @@ endif()
 # data into sections so they can be properly gc'd. -s: strip symbol.
 # -fno-exceptions -fno-rtti: disables exceptions and runtime type.
 set(CMAKE_CXX_FLAGS_RELEASE
-    "-O2 -ffunction-sections -fdata-sections -fno-exceptions -fno-rtti")
+    "-ffunction-sections -fdata-sections -fno-exceptions -fno-rtti")
 if(NOT APPLE)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
 endif()
@@ -125,6 +125,10 @@ option(EXECUTORCH_BUILD_SIZE_TEST "Whether to build size test" OFF)
 option(EXECUTORCH_BUILD_XNNPACK
        "Build xnn_executor_runner which depends on XNNPACK" OFF)

+# Build the vulkan delegate along with the vulkan executor_runner
+option(EXECUTORCH_BUILD_VULKAN
+       "Build the Vulkan delegate and the Vulkan executor_runner" OFF)
+
 option(EXECUTORCH_BUILD_SDK
        "Build the ExecuTorch SDK library and the SDK example runner.")

@@ -144,6 +148,10 @@ option(EXECUTORCH_BUILD_EXTENSION_MODULE
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL
        "Build the extension/runner_util directory" OFF)

+# Build test binaries that rely on googletest
+option(EXECUTORCH_BUILD_GTESTS
+       "Build googletest based test binaries" OFF)
+
 if(NOT BUCK2)
   set(BUCK2 buck2)
 endif()
@@ -315,7 +323,12 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   target_compile_options(executor_runner PUBLIC ${_common_compile_options})
 endif()

-# Add Android JNI subdirectory
+# Add googletest if any test targets should be built
+if(EXECUTORCH_BUILD_GTESTS)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest)
+endif()
+
+option(EXECUTORCH_BUILD_ANDROID_JNI "Build Android JNI" OFF)
 if(EXECUTORCH_BUILD_ANDROID_JNI)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android)
 endif()
@@ -327,6 +340,10 @@ if(EXECUTORCH_BUILD_SDK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
 endif()

+if(EXECUTORCH_BUILD_EXTENSION_APPLE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
 endif()
@@ -344,56 +361,53 @@ if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()

+if(EXECUTORCH_BUILD_VULKAN)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan)
+endif()
+
 option(EXECUTORCH_BUILD_QNN "Build the backends/qualcomm directory" OFF)
 if(EXECUTORCH_BUILD_QNN)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm)
 endif()

-# Build Arm Baremetal backend
 option(EXECUTORCH_BUILD_ARM_BAREMETAL
        "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF)
 if(EXECUTORCH_BUILD_ARM_BAREMETAL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
 endif()

+option(EXECUTORCH_BUILD_MPS "Build the MPS Backend" OFF)
 if(EXECUTORCH_BUILD_MPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
 endif()

-# Build CoreML backend
-option(EXECUTORCH_BUILD_COREML "Build the backends/apple/coreml directory" OFF)
+option(EXECUTORCH_BUILD_COREML "Build the Core ML Backend" OFF)
 if(EXECUTORCH_BUILD_COREML)
-  # CoreML delegate library can only be built with iOS toolchain
-  if(CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS\.)|(ios\.toolchain\.)cmake$")
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
-  else()
-    message(
-      FATAL_ERROR "executorch: Building CoreML delegate requires iOS toolchain")
-  endif()
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
 endif()

-# Build pybind
-option(EXECUTORCH_BUILD_PYBIND "Build pybindings" OFF)
+option(EXECUTORCH_BUILD_PYBIND "Build the Python Bindings" OFF)
 if(EXECUTORCH_BUILD_PYBIND)
+
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)
+
   if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
-    # This has already been added if above flag is on
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
   endif()
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)

-  if(PYBIND_LINK_COREML)
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
+  if(NOT EXECUTORCH_BUILD_SDK)
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
+  endif()
+
+  if(EXECUTORCH_BUILD_COREML)
     set(PYBIND_LINK_COREML "coremldelegate")
   endif()

-  if(PYBIND_LINK_MPS)
-    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
+  if(EXECUTORCH_BUILD_MPS)
     set(PYBIND_LINK_MPS "mpsdelegate")
   endif()

   if(EXECUTORCH_BUILD_XNNPACK)
-    # set PYBIND_LINK_XNNPACK variable to link with portable lib library
     set(PYBIND_LINK_XNNPACK "xnnpack_backend")
   endif()
@@ -443,5 +457,6 @@ if(EXECUTORCH_BUILD_PYBIND)
   install(TARGETS portable_lib
           LIBRARY DESTINATION executorch/extension/pybindings)
 endif()
+
 # Print all summary
 executorch_print_configuration_summary()

backends/apple/mps/operators/node_visitor.py

Lines changed: 0 additions & 1 deletion

@@ -235,7 +235,6 @@ def get_serialized_data(
     def get_serialized_id(
         self, node: Union[torch.fx.Node, float, int], mps_graph: MPSGraph
     ) -> int:
-
        """
        Map a tensor to a unique id. If the tensor was already mapped, return
        the existent id.

backends/arm/arm_backend.py

Lines changed: 5 additions & 3 deletions

@@ -128,9 +128,11 @@ def preprocess(  # noqa: C901
         # Add output to TOSA graph
         tosa_graph.currRegion.currBasicBlock.addTensor(
             output.name,
-            inputs[0].shape
-            if is_permute_node_before_addmm(node)
-            else output.shape,
+            (
+                inputs[0].shape
+                if is_permute_node_before_addmm(node)
+                else output.shape
+            ),
            ts.DType.INT8 if is_quant_node(node) else output.dtype,
        )

backends/arm/arm_vela.py

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@

 import numpy as np

+
 # Pack either input or output tensor block, compose the related arrays into
 # per-io structs to simplify runtime use.
 def vela_bin_pack_io(prefix, data):

backends/arm/test/ops/test_add.py

Lines changed: 1 addition & 3 deletions

@@ -88,7 +88,7 @@ def _test_add_tosa_BI_pipeline(
             .to_executorch()
         )
         if TOSA_REF_MODEL_INSTALLED:
-            tester.run_method().compare_outputs()
+            tester.run_method().compare_outputs(qtol=1)
         else:
             logger.warning(
                 "TOSA ref model tool not installed, skip numerical correctness tests"
@@ -118,8 +118,6 @@ def test_add_tosa_MI(self):
         test_data = (torch.randn(4, 4, 4),)
         self._test_add_tosa_MI_pipeline(self.Add(), test_data)

-    # TODO: Will this type of parametrization be supported? pytest seem
-    # have issue with it.
     @parameterized.expand(
         [
             (torch.ones(5),),  # test_data

backends/arm/test/tester/arm_tester.py

Lines changed: 111 additions & 41 deletions

@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.

 from enum import Enum
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple, Union

 import torch
 from executorch.backends.arm.arm_backend import (
@@ -15,6 +15,7 @@
 from executorch.backends.arm.arm_partitioner import ArmPartitioner

 from executorch.backends.arm.test.tosautil.tosa_test_utils import (
+    QuantizationParams,
     TosaProfile,
     TosaTestUtils,
 )
@@ -32,6 +33,7 @@
     get_symmetric_quantization_config,
     XNNPACKQuantizer,
 )
+from torch.export import ExportedProgram


 class ArmBackendSelector(Enum):
@@ -61,6 +63,7 @@ def __init__(
             TosaProfile.BI or TosaProfile.MI
         """
         self.tosa_test_util = None
+        self.is_quantized = profile == TosaProfile.BI
         if backend == ArmBackendSelector.TOSA:
             self.tosa_test_util = TosaTestUtils(profile=profile)
             # The spec below tiggers arm_backend.py to output two files:
@@ -119,54 +122,121 @@ def run_method(
         ), "self.tosa_test_util is not initialized, cannot use run_method()"
         inputs_to_run = inputs or self.inputs

-        # TODO: we can't possible need to use all these stages??
-        export_stage = self.stages[
-            self.stage_name(Export)
-        ]  # this is what XNNpack use to get quant params
-        toedge_stage = self.stages[
-            self.stage_name(ToEdge)
-        ]  # this is what get_input_quantization_params use to get quant params
-        partition_stage = self.stages[
-            self.stage_name(Partition)
-        ]  # this is what tosa_ref_dump_inputs use....
-
-        # TODO: I'd prefer to use this TOSA buffer instead of output.tosa,
-        # generated by arm_backend.py. The issue is that we're still depending
-        # on desc.json, which is created from TosaSerializer class, not from
-        # the serialized TOSA buffer. Leave this here for review purposes.
-        # ts_serialized = self._get_serialized_tosa_buffer(  # unused
-        #     partition_stage.artifact
-        # )
-
-        # This is where the torch reference output is calculated and set
-        # TODO: This sets self.quantization_scale, which is duplicates
-        # self.tosa_test_util.quantization.output.scales (?). Fixme.
-        (
-            self.reference_output,
-            self.quantization_scale,
-        ) = self._calculate_reference_output(export_stage.artifact, inputs_to_run)
-
-        # Convert the torch inputs to something TOSA ref model can use
-        tensor_names_and_inputs_np = self.tosa_test_util.convert_inputs_to_tosa(
-            partition_stage.artifact, toedge_stage.artifact, inputs_to_run
+        export_stage = self.stages[self.stage_name(Export)]
+
+        (input_names, qp_input) = self._get_input_params(export_stage.artifact)
+        (output_name, qp_output) = self._get_output_param(export_stage.artifact)
+
+        # Calculate the reference output using the original module or the quant
+        # module. self.quantization_scale is used by compare_outputs() to
+        # calculate the tolerance
+        self.quantization_scale = None if qp_output is None else qp_output.scale
+        if self.is_quantized:
+            module_for_ref = self.stages[self.stage_name(Quantize)].artifact
+        else:
+            module_for_ref = self.original_module
+        self.reference_output = self._calculate_reference_output(
+            module_for_ref, inputs_to_run
         )

         # Run the TOSA ref model to get the output tensor, which will be
         # compared to the torch output in compare_outputs()
         self.stage_output = self.tosa_test_util.run_tosa_ref_model(
-            tensor_names_and_inputs_np
+            params_input=(input_names, qp_input),
+            param_output=(output_name, qp_output),
+            inputs=inputs_to_run,
         )

         return self

-    def _get_serialized_tosa_buffer(self, partition_stage: Partition) -> bytes:
+    def _get_input_params(
+        self, program: ExportedProgram
+    ) -> Tuple[str, Union[List[QuantizationParams], List[None]]]:
         """
-        This is just a prototype...
-        Todo:
-        * The "_0" indicates that there are many lowered modules. Loop it!
-        * There's probably a better way to get this buffer. An API? Yes,
-          it seems the serialize stage does this for you...
+        Get names and optionally quantization parameters for the inputs to
+        this model.
+
+        Args:
+            program (ExportedProgram): The program to get input parameters from
+        Returns:
+            Tuple[str, Optional[QuantizationParams]]: A tuple containing the
+            input node names and their quantization parameters.
+        """
+        input_names = []
+        # E.g. bias and weights are 'placeholders' as well. This is used to
+        # get only the user inputs.
+        usr_inputs = program.graph_signature.user_inputs
+        for node in program.graph.nodes:
+            if node.op == "placeholder" and node.name in usr_inputs:
+                input_names.append(node.name)
+                continue
+
+        if self.is_quantized:
+            quant_params = []
+            for node in program.graph.nodes:
+                if (
+                    node.target
+                    == torch.ops.quantized_decomposed.quantize_per_tensor.default
+                    and node.args[0].name in input_names
+                ):
+                    qp = QuantizationParams(
+                        node_name=node.args[0].name, scale=node.args[1], zp=node.args[2]
+                    )
+                    quant_params.append(qp)
+                    if len(quant_params) == len(
+                        input_names
+                    ):  # break early if we have all the inputs' quantization parameters
+                        break
+            assert len(quant_params) != 0, "Quantization parameters not found"
+            return (input_names, quant_params)
+        else:
+            return (input_names, len(input_names) * [None])  # return a list of None's
+
+    def _get_output_param(
+        self, program: ExportedProgram
+    ) -> Tuple[str, Union[QuantizationParams, None]]:
         """
-        return partition_stage._edge_programs[
-            "forward"
-        ]._graph_module.lowered_module_0.processed_bytes
+        Get the name and optionally quantization parameters for the output of
+        this model.
+
+        Args:
+            program (ExportedProgram): The program to get output parameters from.
+        Returns:
+            Tuple[str, Optional[QuantizationParams]]: A tuple containing the
+            output node name and its quantization parameters.
+        """
+        output_node = None
+        for node in program.graph.nodes:
+            if node.op == "output":
+                output_node = node
+                break
+
+        if self.is_quantized:
+            quant_params = None
+            for node in program.graph.nodes:
+                if (
+                    node.target
+                    == torch.ops.quantized_decomposed.dequantize_per_tensor.default
+                    and node == output_node.args[0][0]
+                ):
+                    quant_params = QuantizationParams(
+                        node_name=node.args[0].name, scale=node.args[1], zp=node.args[2]
+                    )
+                    break  # break early, there's only one output node
+            assert quant_params is not None, "Quantization parameters not found"
+            return (output_node.name, quant_params)
+        else:
+            return (output_node.name, None)
+
+    @staticmethod
+    def _calculate_reference_output(
+        module: Union[torch.fx.GraphModule, torch.nn.Module], inputs
+    ) -> torch.Tensor:
+        """
+        Note: I'd prefer to use the base class method here, but since it uses
+        the exported program, I can't. The partitioner stage clears the
+        state_dict of the exported program, which causes an issue when
+        evaluating the module.
+        """
+
+        return module.forward(*inputs)