Commit f934bc0

Update

[ghstack-poisoned]

2 parents 44ee51a + b34f04f commit f934bc0

11 files changed: +176 -334 lines changed

.lintrunner.toml

Lines changed: 0 additions & 2 deletions

@@ -264,8 +264,6 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
-    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
-    'kernels/portable/cpu/util/elementwise_util.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.

backends/apple/coreml/CMakeLists.txt

Lines changed: 17 additions & 0 deletions

@@ -25,6 +25,8 @@ endif()
 
 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)
 
+set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
+
 # inmemoryfs sources
 set(INMEMORYFS_SOURCES
   runtime/inmemoryfs/inmemory_filesystem.cpp

@@ -229,3 +231,18 @@ install(
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
+
+# We only care about building the pybinding when building for macOS wheels.
+if(EXECUTORCH_BUILD_COREML AND EXECUTORCH_BUILD_PYBIND)
+  if(NOT TARGET pybind11::pybind11)
+    add_subdirectory(${EXECUTORCH_ROOT}/third-party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
+  endif()
+
+  pybind11_add_module(executorchcoreml SHARED runtime/inmemoryfs/inmemory_filesystem_py.cpp)
+
+  target_compile_options(executorchcoreml PRIVATE -mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET})
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+    target_compile_options(executorchcoreml PRIVATE -g)
+  endif()
+  target_link_libraries(executorchcoreml PRIVATE coreml_util coreml_inmemoryfs)
+endif()

extension/flat_tensor/serialize/serialize.py

Lines changed: 101 additions & 57 deletions

@@ -10,29 +10,33 @@
 import os
 import tempfile
 from dataclasses import dataclass
-from typing import ClassVar, Dict, List, Literal, Optional
+from typing import ClassVar, Dict, List, Literal, Optional, Sequence
 
 import pkg_resources
 from executorch.exir._serialize._cord import Cord
 from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass
 
 from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile
 from executorch.exir._serialize._program import _insert_flatbuffer_header
-from executorch.exir._serialize.data_serializer import DataPayload, DataSerializer
+from executorch.exir._serialize.data_serializer import (
+    DataPayload,
+    DataSerializer,
+    TensorEntry,
+)
 
 from executorch.exir._serialize.padding import aligned_size, pad_to, padding_required
 
-# Byte order of numbers written to flat tensor headers. Always little-endian
-# regardless of the host system, since all commonly-used modern CPUs are little
-# endian.
-_HEADER_BYTEORDER: Literal["little"] = "little"
-
 from executorch.extension.flat_tensor.serialize.flat_tensor_schema import (
     DataSegment,
     FlatTensor,
     TensorMetadata,
 )
 
+# Byte order of numbers written to flat tensor headers. Always little-endian
+# regardless of the host system, since all commonly-used modern CPUs are little
+# endian.
+_HEADER_BYTEORDER: Literal["little"] = "little"
+
 
 def _serialize_to_flatbuffer(flat_tensor: FlatTensor) -> Cord:
     """Serializes a FlatTensor to a flatbuffer and returns the serialized data."""

@@ -209,6 +213,62 @@ def _get_extended_header(flat_tensor_data: bytes) -> Optional[FlatTensorHeader]:
     return None
 
 
+def _extract_tensors(
+    fqn_to_tensor: Dict[str, TensorEntry],
+    buffers: Sequence[bytes],
+    segments: List[Cord],
+    tensor_alignment: int,
+) -> List[TensorMetadata]:
+    """Places tensors into a single segment, aligned to tensor_alignment within
+    the segment.
+
+    Args:
+        fqn_to_tensor: A map from fully qualified names to tensor entries.
+        buffers: A sequence of tensor buffers.
+        segments: A list of segments to append the tensor data to. Modified in-place.
+        tensor_alignment: The alignment of the tensor data.
+
+    Returns:
+        A list of TensorMetadata, which describes the tensors in the segment.
+    """
+    tensor_data: Cord = Cord()
+    tensors: List[TensorMetadata] = []
+    # {idx, offset}
+    saved_offsets: Dict[int, int] = {}
+    for fqn, tensor_entry in fqn_to_tensor.items():
+        assert tensor_entry.layout is not None
+        # Check index into the tensor buffers is valid.
+        assert tensor_entry.buffer_index < len(
+            buffers
+        ), f"Invalid index {tensor_entry.buffer_index} is greater than tensor buffer size {len(buffers)}."
+
+        # Check if the tensor has already been appended to the flat_tensor_data.
+        offset = saved_offsets.get(tensor_entry.buffer_index, -1)
+        if offset == -1:
+            if len(tensor_data) > 0:
+                # Add padding to round off the previous tensor offset.
+                pad_length = padding_required(len(tensor_data), tensor_alignment)
+                tensor_data.append(b"\x00" * pad_length)
+            # Add to saved offsets.
+            offset = len(tensor_data)
+            saved_offsets[tensor_entry.buffer_index] = offset
+            # Append to flat_tensor_data at the offset.
+            tensor_data.append(buffers[tensor_entry.buffer_index])
+
+        tensors.append(
+            TensorMetadata(
+                fully_qualified_name=fqn,
+                scalar_type=tensor_entry.layout.scalar_type,
+                sizes=tensor_entry.layout.sizes,
+                dim_order=tensor_entry.layout.dim_order,
+                segment_index=len(segments),
+                offset=offset,
+            )
+        )
+    segments.append(tensor_data)
+    return tensors
+
+
 class FlatTensorSerializer(DataSerializer):
     """A concrete implementation of the DataSerializer interface that
     serializes and deserializes data to/from the FlatTensor format.

@@ -227,61 +287,45 @@ def serialize(
         self,
         data: DataPayload,
     ) -> Cord:
-        """Serializes a list of tensor metadata and tensors into a blob."""
-
-        flat_tensor_metadata: List[TensorMetadata] = []
-        flat_tensor_data: Cord = Cord()
-
-        # {idx, offset}
-        saved_offsets: Dict[int, int] = {}
-
-        for fqn, tensor_entry in data.fqn_to_tensor.items():
-            assert tensor_entry.layout is not None
-            # Check index into the tensor buffers is valid.
-            assert tensor_entry.buffer_index < len(
-                data.buffers
-            ), f"Invalid index {tensor_entry.buffer_index} is greater than tensor buffer size {len(data.buffers)}."
-
-            # Check if the tensor has already been appended to the flat_tensor_data.
-            offset = saved_offsets.get(tensor_entry.buffer_index, -1)
-            if offset == -1:
-                if len(flat_tensor_data) > 0:
-                    # Add padding to round off the previous tensor offset.
-                    pad_length = padding_required(
-                        len(flat_tensor_data), self.config.tensor_alignment
-                    )
-                    flat_tensor_data.append(b"\x00" * pad_length)
-                # Add to saved offsets.
-                offset = len(flat_tensor_data)
-                saved_offsets[tensor_entry.buffer_index] = offset
-                # Append to flat_tensor_data at the offset.
-                flat_tensor_data.append(data.buffers[tensor_entry.buffer_index])
-
-            flat_tensor_metadata.append(
-                TensorMetadata(
-                    fully_qualified_name=fqn,
-                    scalar_type=tensor_entry.layout.scalar_type,
-                    sizes=tensor_entry.layout.sizes,
-                    dim_order=tensor_entry.layout.dim_order,
-                    segment_index=0,
-                    offset=offset,
+        """Serializes a list of tensors and named data into a blob."""
+
+        segments: List[Cord] = []
+        tensors = _extract_tensors(
+            data.fqn_to_tensor,
+            data.buffers,
+            segments,
+            self.config.tensor_alignment,
+        )
+
+        data_segments: List[DataSegment] = []
+        segment_data = Cord()
+        for segment in segments:
+            prev_end = (
+                (data_segments[-1].offset + data_segments[-1].size)
+                if data_segments
+                else 0
+            )
+            data_segments.append(
+                DataSegment(
+                    offset=aligned_size(prev_end, self.config.segment_alignment),
+                    size=len(segment),
                 )
             )
-
-        # Pad flat_tensor_data to segment alignment.
-        segment_pad_length = padding_required(
-            len(flat_tensor_data), self.config.segment_alignment
-        )
-        if segment_pad_length > 0:
-            flat_tensor_data.append(b"\x00" * segment_pad_length)
+            # Pad segment_data to segment alignment.
+            segment_pad_length = padding_required(
+                len(segment_data), self.config.segment_alignment
+            )
+            if segment_pad_length > 0:
+                segment_data.append(b"\x00" * segment_pad_length)
+            segment_data.append(segment)
 
         # Create FlatTensor, which describes of the contents of the file and
         # points to all the data segments. It will be serialized to flatbuffer.
         flat_tensor = FlatTensor(
            version=0,  # Keep in sync with c++ version number in serialize.h
             tensor_alignment=self.config.tensor_alignment,
-            tensors=flat_tensor_metadata,
-            segments=[DataSegment(offset=0, size=len(flat_tensor_data))],
+            tensors=tensors,
+            segments=data_segments,
             named_data=[],
         )

@@ -307,7 +351,7 @@ def serialize(
             flatbuffer_offset=padded_header_length,
             flatbuffer_size=len(flatbuffer_payload),
             segment_base_offset=segment_base_offset,
-            segment_data_size=len(flat_tensor_data),
+            segment_data_size=len(segment_data),
         ).to_bytes()
 
         # Pad header and payload to segment alignment.

@@ -327,15 +371,15 @@ def serialize(
         assert eh.flatbuffer_size == original_flatbuffer_payload_size
         assert eh.segment_base_offset == segment_base_offset
         assert eh.flatbuffer_offset == padded_header_length
-        assert eh.segment_data_size == len(flat_tensor_data)
+        assert eh.segment_data_size == len(segment_data)
 
         del header_data
         del flatbuffer_payload
 
         # Place everything into one segment.
         payload = Cord()
         payload.append(injected_flatbuffer_data)
-        payload.append(flat_tensor_data)
+        payload.append(segment_data)
 
         return payload
 
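For reference, the segment bookkeeping above works as follows: each DataSegment records its unpadded size, and its offset is the previous segment's end rounded up to segment_alignment. Below is a minimal, hypothetical Python sketch of that arithmetic; padding_required and aligned_size are re-implemented standalone for illustration and are not the executorch.exir._serialize.padding versions.

def padding_required(offset: int, alignment: int) -> int:
    # Zero bytes needed to bring `offset` up to a multiple of `alignment`.
    remainder = offset % alignment
    return (alignment - remainder) if remainder != 0 else 0


def aligned_size(size: int, alignment: int) -> int:
    # `size` rounded up to the next multiple of `alignment`.
    return size + padding_required(size, alignment)


# Hypothetical segment lengths, standing in for len() of each Cord in `segments`.
segment_alignment = 16
segment_lengths = [100, 40]

offsets, prev_end = [], 0
for length in segment_lengths:
    offset = aligned_size(prev_end, segment_alignment)  # DataSegment.offset
    offsets.append(offset)
    prev_end = offset + length  # end of this segment's unpadded data

print(offsets)  # [0, 112]: the second segment starts at 100 rounded up to 112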

kernels/optimized/cpu/binary_ops.h

Lines changed: 24 additions & 1 deletion

@@ -10,11 +10,34 @@
 
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
-#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
 namespace executor {
+namespace internal {
+// NOTE: we bake ArrayRef iterators being pointers into the return
+// type here because we assume that iterators are portable across
+// ArrayRef copies.
+inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> arr) {
+  return std::find_if(
+      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
+}
+
+inline bool sizes_match_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> lhs,
+    ArrayRef<Tensor::SizesType> rhs) {
+  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
+  auto lhs_end = lhs.end();
+
+  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
+  auto rhs_end = rhs.end();
+
+  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
+      std::equal(lhs_begin, lhs_end, rhs_begin);
+}
+} // namespace internal
+
 enum class ElementwiseOptimizedPath {
   kNone,
   kTreatAs1d,
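To make the relocated helper's behavior concrete, here is a hypothetical Python sketch of the same comparison (the real functions operate on ArrayRef<Tensor::SizesType> values): two size lists match if they are equal once leading 1s are dropped from each.

def sizes_match_ignoring_leading_1s(lhs, rhs):
    # Drop leading 1s from each shape, then compare the remainders element-wise.
    def strip(sizes):
        return next((sizes[i:] for i, s in enumerate(sizes) if s != 1), [])

    return strip(list(lhs)) == strip(list(rhs))


print(sizes_match_ignoring_leading_1s([1, 1, 3, 4], [3, 4]))  # True
print(sizes_match_ignoring_leading_1s([2, 3, 4], [3, 4]))     # False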

kernels/optimized/cpu/targets.bzl

Lines changed: 1 addition & 4 deletions

@@ -131,10 +131,7 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["op_add_sub_impl.h"],
         visibility = ["//executorch/kernels/optimized/cpu/..."],
-        exported_deps = [
-            "//executorch/runtime/core:core",
-            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
-        ],
+        exported_deps = ["//executorch/runtime/core:core"],
     )
 
     runtime.cxx_library(

kernels/portable/cpu/op_mul.cpp

Lines changed: 6 additions & 6 deletions

@@ -52,17 +52,17 @@ Tensor& mul_out(
       out);
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    utils::apply_bitensor_elementwise_fn<
-        CTYPE_COMPUTE,
-        op_name,
-        utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) { return val_a * val_b; },
+    utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          return val_a * val_b;
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
         b,
         utils::SupportedTensorDtypes::REALHBBF16,
-        out);
+        out,
+        utils::SupportedTensorDtypes::REALHBBF16);
   });
 
   return out;

kernels/portable/cpu/util/broadcast_indexes_range.h

Lines changed: 1 addition & 26 deletions

@@ -21,28 +21,6 @@
 namespace torch::executor {
 
 namespace internal {
-// NOTE: we bake ArrayRef iterators being pointers into the return
-// type here because we assume that iterators are portable across
-// ArrayRef copies.
-inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> arr) {
-  return std::find_if(
-      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
-}
-
-inline bool sizes_match_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> lhs,
-    ArrayRef<Tensor::SizesType> rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
-  auto lhs_end = lhs.end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
-  auto rhs_end = rhs.end();
-
-  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
-      std::equal(lhs_begin, lhs_end, rhs_begin);
-}
-
 template <std::size_t kNumInputs>
 class BroadcastIndexesIterator {
  public:

@@ -57,10 +35,7 @@ class BroadcastIndexesIterator {
   template <typename... Args>
   explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
       : output_dim_or_zero_if_no_broadcasting_(
-            (sizes_match_ignoring_leading_1s(args.sizes(), output.sizes()) &&
-             ...)
-                ? 0
-                : output.dim()),
+            ((args.sizes() == output.sizes()) && ...) ? 0 : output.dim()),
         output_shape_(output.sizes()) {
     static_assert(
         sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
