Commit f934bc0

Update

[ghstack-poisoned]

2 parents 44ee51a + b34f04f commit f934bc0

11 files changed: +176 -334 lines changed

.lintrunner.toml

Lines changed: 0 additions & 2 deletions

@@ -264,8 +264,6 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
-    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
-    'kernels/portable/cpu/util/elementwise_util.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.

backends/apple/coreml/CMakeLists.txt

Lines changed: 17 additions & 0 deletions

@@ -25,6 +25,8 @@ endif()
 
 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)
 
+set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
+
 # inmemoryfs sources
 set(INMEMORYFS_SOURCES
   runtime/inmemoryfs/inmemory_filesystem.cpp

@@ -229,3 +231,18 @@ install(
   INCLUDES
   DESTINATION ${_common_include_directories}
 )
+
+# We only care about building the pybinding when building for macOS wheels.
+if(EXECUTORCH_BUILD_COREML AND EXECUTORCH_BUILD_PYBIND)
+  if(NOT TARGET pybind11::pybind11)
+    add_subdirectory(${EXECUTORCH_ROOT}/third-party/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
+  endif()
+
+  pybind11_add_module(executorchcoreml SHARED runtime/inmemoryfs/inmemory_filesystem_py.cpp)
+
+  target_compile_options(executorchcoreml PRIVATE -mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET})
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
+    target_compile_options(executorchcoreml PRIVATE -g)
+  endif()
+  target_link_libraries(executorchcoreml PRIVATE coreml_util coreml_inmemoryfs)
+endif()

extension/flat_tensor/serialize/serialize.py

Lines changed: 101 additions & 57 deletions

@@ -10,29 +10,33 @@
 import os
 import tempfile
 from dataclasses import dataclass
-from typing import ClassVar, Dict, List, Literal, Optional
+from typing import ClassVar, Dict, List, Literal, Optional, Sequence
 
 import pkg_resources
 from executorch.exir._serialize._cord import Cord
 from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass
 
 from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile
 from executorch.exir._serialize._program import _insert_flatbuffer_header
-from executorch.exir._serialize.data_serializer import DataPayload, DataSerializer
+from executorch.exir._serialize.data_serializer import (
+    DataPayload,
+    DataSerializer,
+    TensorEntry,
+)
 
 from executorch.exir._serialize.padding import aligned_size, pad_to, padding_required
 
-# Byte order of numbers written to flat tensor headers. Always little-endian
-# regardless of the host system, since all commonly-used modern CPUs are little
-# endian.
-_HEADER_BYTEORDER: Literal["little"] = "little"
-
 from executorch.extension.flat_tensor.serialize.flat_tensor_schema import (
     DataSegment,
     FlatTensor,
     TensorMetadata,
 )
 
+# Byte order of numbers written to flat tensor headers. Always little-endian
+# regardless of the host system, since all commonly-used modern CPUs are little
+# endian.
+_HEADER_BYTEORDER: Literal["little"] = "little"
+
 
 def _serialize_to_flatbuffer(flat_tensor: FlatTensor) -> Cord:
     """Serializes a FlatTensor to a flatbuffer and returns the serialized data."""

@@ -209,6 +213,62 @@ def _get_extended_header(flat_tensor_data: bytes) -> Optional[FlatTensorHeader]:
     return None
 
 
+def _extract_tensors(
+    fqn_to_tensor: Dict[str, TensorEntry],
+    buffers: Sequence[bytes],
+    segments: List[Cord],
+    tensor_alignment: int,
+) -> List[TensorMetadata]:
+    """Places tensors into a single segment, aligned to tensor_alignment within
+    the segment.
+
+    Args:
+        fqn_to_tensor: A map from fully qualified names to tensor entries.
+        buffers: A sequence of tensor buffers.
+        segments: A list of segments to append the tensor data to. Modified in-place.
+        tensor_alignment: The alignment of the tensor data.
+
+    Returns:
+        A list of TensorMetadata, which describes the tensors in the segment.
+    """
+    tensor_data: Cord = Cord()
+    tensors: List[TensorMetadata] = []
+    # {idx, offset}
+    saved_offsets: Dict[int, int] = {}
+    for fqn, tensor_entry in fqn_to_tensor.items():
+        assert tensor_entry.layout is not None
+        # Check index into the tensor buffers is valid.
+        assert tensor_entry.buffer_index < len(
+            buffers
+        ), f"Invalid index {tensor_entry.buffer_index} is greater than tensor buffer size {len(buffers)}."
+
+        # Check if the tensor has already been appended to the flat_tensor_data.
+        offset = saved_offsets.get(tensor_entry.buffer_index, -1)
+        if offset == -1:
+            if len(tensor_data) > 0:
+                # Add padding to round off the previous tensor offset.
+                pad_length = padding_required(len(tensor_data), tensor_alignment)
+                tensor_data.append(b"\x00" * pad_length)
+            # Add to saved offsets.
+            offset = len(tensor_data)
+            saved_offsets[tensor_entry.buffer_index] = offset
+            # Append to flat_tensor_data at the offset.
+            tensor_data.append(buffers[tensor_entry.buffer_index])
+
+        tensors.append(
+            TensorMetadata(
+                fully_qualified_name=fqn,
+                scalar_type=tensor_entry.layout.scalar_type,
+                sizes=tensor_entry.layout.sizes,
+                dim_order=tensor_entry.layout.dim_order,
+                segment_index=len(segments),
+                offset=offset,
+            )
+        )
+    segments.append(tensor_data)
+    return tensors
+
+
 class FlatTensorSerializer(DataSerializer):
     """A concrete implementation of the DataSerializer interface that
     serializes and deserializes data to/from the FlatTensor format.

@@ -227,61 +287,45 @@ def serialize(
         self,
         data: DataPayload,
     ) -> Cord:
-        """Serializes a list of tensor metadata and tensors into a blob."""
-
-        flat_tensor_metadata: List[TensorMetadata] = []
-        flat_tensor_data: Cord = Cord()
-
-        # {idx, offset}
-        saved_offsets: Dict[int, int] = {}
-
-        for fqn, tensor_entry in data.fqn_to_tensor.items():
-            assert tensor_entry.layout is not None
-            # Check index into the tensor buffers is valid.
-            assert tensor_entry.buffer_index < len(
-                data.buffers
-            ), f"Invalid index {tensor_entry.buffer_index} is greater than tensor buffer size {len(data.buffers)}."
-
-            # Check if the tensor has already been appended to the flat_tensor_data.
-            offset = saved_offsets.get(tensor_entry.buffer_index, -1)
-            if offset == -1:
-                if len(flat_tensor_data) > 0:
-                    # Add padding to round off the previous tensor offset.
-                    pad_length = padding_required(
-                        len(flat_tensor_data), self.config.tensor_alignment
-                    )
-                    flat_tensor_data.append(b"\x00" * pad_length)
-                # Add to saved offsets.
-                offset = len(flat_tensor_data)
-                saved_offsets[tensor_entry.buffer_index] = offset
-                # Append to flat_tensor_data at the offset.
-                flat_tensor_data.append(data.buffers[tensor_entry.buffer_index])
-
-            flat_tensor_metadata.append(
-                TensorMetadata(
-                    fully_qualified_name=fqn,
-                    scalar_type=tensor_entry.layout.scalar_type,
-                    sizes=tensor_entry.layout.sizes,
-                    dim_order=tensor_entry.layout.dim_order,
-                    segment_index=0,
-                    offset=offset,
+        """Serializes a list of tensors and named data into a blob."""
+
+        segments: List[Cord] = []
+        tensors = _extract_tensors(
+            data.fqn_to_tensor,
+            data.buffers,
+            segments,
+            self.config.tensor_alignment,
+        )
+
+        data_segments: List[DataSegment] = []
+        segment_data = Cord()
+        for segment in segments:
+            prev_end = (
+                (data_segments[-1].offset + data_segments[-1].size)
+                if data_segments
+                else 0
+            )
+            data_segments.append(
+                DataSegment(
+                    offset=aligned_size(prev_end, self.config.segment_alignment),
+                    size=len(segment),
                 )
             )
-
-        # Pad flat_tensor_data to segment alignment.
-        segment_pad_length = padding_required(
-            len(flat_tensor_data), self.config.segment_alignment
-        )
-        if segment_pad_length > 0:
-            flat_tensor_data.append(b"\x00" * segment_pad_length)
+            # Pad segment_data to segment alignment.
+            segment_pad_length = padding_required(
+                len(segment_data), self.config.segment_alignment
+            )
+            if segment_pad_length > 0:
+                segment_data.append(b"\x00" * segment_pad_length)
+            segment_data.append(segment)
 
         # Create FlatTensor, which describes of the contents of the file and
         # points to all the data segments. It will be serialized to flatbuffer.
         flat_tensor = FlatTensor(
            version=0,  # Keep in sync with c++ version number in serialize.h
             tensor_alignment=self.config.tensor_alignment,
-            tensors=flat_tensor_metadata,
-            segments=[DataSegment(offset=0, size=len(flat_tensor_data))],
+            tensors=tensors,
+            segments=data_segments,
             named_data=[],
         )

@@ -307,7 +351,7 @@ def serialize(
             flatbuffer_offset=padded_header_length,
             flatbuffer_size=len(flatbuffer_payload),
             segment_base_offset=segment_base_offset,
-            segment_data_size=len(flat_tensor_data),
+            segment_data_size=len(segment_data),
         ).to_bytes()
 
         # Pad header and payload to segment alignment.

@@ -327,15 +371,15 @@ def serialize(
         assert eh.flatbuffer_size == original_flatbuffer_payload_size
         assert eh.segment_base_offset == segment_base_offset
         assert eh.flatbuffer_offset == padded_header_length
-        assert eh.segment_data_size == len(flat_tensor_data)
+        assert eh.segment_data_size == len(segment_data)
 
         del header_data
         del flatbuffer_payload
 
         # Place everything into one segment.
         payload = Cord()
         payload.append(injected_flatbuffer_data)
-        payload.append(flat_tensor_data)
+        payload.append(segment_data)
 
         return payload
 
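For reference, the segment bookkeeping above works as follows: each DataSegment records its unpadded size, and its offset is the previous segment's end rounded up to segment_alignment. Below is a minimal, hypothetical Python sketch of that arithmetic; padding_required and aligned_size are re-implemented standalone for illustration and are not the executorch.exir._serialize.padding versions.

def padding_required(offset: int, alignment: int) -> int:
    # Zero bytes needed to bring `offset` up to a multiple of `alignment`.
    remainder = offset % alignment
    return (alignment - remainder) if remainder != 0 else 0


def aligned_size(size: int, alignment: int) -> int:
    # `size` rounded up to the next multiple of `alignment`.
    return size + padding_required(size, alignment)


# Hypothetical segment lengths, standing in for len() of each Cord in `segments`.
segment_alignment = 16
segment_lengths = [100, 40]

offsets, prev_end = [], 0
for length in segment_lengths:
    offset = aligned_size(prev_end, segment_alignment)  # DataSegment.offset
    offsets.append(offset)
    prev_end = offset + length  # end of this segment's unpadded data

print(offsets)  # [0, 112]: the second segment starts at 100 rounded up to 112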

kernels/optimized/cpu/binary_ops.h

Lines changed: 24 additions & 1 deletion

@@ -10,11 +10,34 @@
 
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
-#include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
 namespace executor {
+namespace internal {
+// NOTE: we bake ArrayRef iterators being pointers into the return
+// type here because we assume that iterators are portable across
+// ArrayRef copies.
+inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> arr) {
+  return std::find_if(
+      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
+}
+
+inline bool sizes_match_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> lhs,
+    ArrayRef<Tensor::SizesType> rhs) {
+  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
+  auto lhs_end = lhs.end();
+
+  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
+  auto rhs_end = rhs.end();
+
+  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
+      std::equal(lhs_begin, lhs_end, rhs_begin);
+}
+} // namespace internal
+
 enum class ElementwiseOptimizedPath {
   kNone,
   kTreatAs1d,
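To make the relocated helper's behavior concrete, here is a hypothetical Python sketch of the same comparison (the real functions operate on ArrayRef<Tensor::SizesType> values): two size lists match if they are equal once leading 1s are dropped from each.

def sizes_match_ignoring_leading_1s(lhs, rhs):
    # Drop leading 1s from each shape, then compare the remainders element-wise.
    def strip(sizes):
        return next((sizes[i:] for i, s in enumerate(sizes) if s != 1), [])

    return strip(list(lhs)) == strip(list(rhs))


print(sizes_match_ignoring_leading_1s([1, 1, 3, 4], [3, 4]))  # True
print(sizes_match_ignoring_leading_1s([2, 3, 4], [3, 4]))     # False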

kernels/optimized/cpu/targets.bzl

Lines changed: 1 addition & 4 deletions

@@ -131,10 +131,7 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["op_add_sub_impl.h"],
         visibility = ["//executorch/kernels/optimized/cpu/..."],
-        exported_deps = [
-            "//executorch/runtime/core:core",
-            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
-        ],
+        exported_deps = ["//executorch/runtime/core:core"],
     )
 
     runtime.cxx_library(

kernels/portable/cpu/op_mul.cpp

Lines changed: 6 additions & 6 deletions

@@ -52,17 +52,17 @@ Tensor& mul_out(
       out);
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    utils::apply_bitensor_elementwise_fn<
-        CTYPE_COMPUTE,
-        op_name,
-        utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) { return val_a * val_b; },
+    utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          return val_a * val_b;
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
         b,
         utils::SupportedTensorDtypes::REALHBBF16,
-        out);
+        out,
+        utils::SupportedTensorDtypes::REALHBBF16);
   });
 
   return out;

kernels/portable/cpu/util/broadcast_indexes_range.h

Lines changed: 1 addition & 26 deletions

@@ -21,28 +21,6 @@
 namespace torch::executor {
 
 namespace internal {
-// NOTE: we bake ArrayRef iterators being pointers into the return
-// type here because we assume that iterators are portable across
-// ArrayRef copies.
-inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> arr) {
-  return std::find_if(
-      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
-}
-
-inline bool sizes_match_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> lhs,
-    ArrayRef<Tensor::SizesType> rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
-  auto lhs_end = lhs.end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
-  auto rhs_end = rhs.end();
-
-  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
-      std::equal(lhs_begin, lhs_end, rhs_begin);
-}
-
 template <std::size_t kNumInputs>
 class BroadcastIndexesIterator {
  public:

@@ -57,10 +35,7 @@ class BroadcastIndexesIterator {
   template <typename... Args>
   explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
       : output_dim_or_zero_if_no_broadcasting_(
-            (sizes_match_ignoring_leading_1s(args.sizes(), output.sizes()) &&
-             ...)
-                ? 0
-                : output.dim()),
+            ((args.sizes() == output.sizes()) && ...) ? 0 : output.dim()),
         output_shape_(output.sizes()) {
     static_assert(
         sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
