pytorch
diff --git a/‎.ci/docker/common/install_java.sh
Lines changed: 12 additions & 0 deletions b/‎.ci/docker/common/install_java.sh
Lines changed: 12 additions & 0 deletions
diff --git a/‎.ci/docker/ubuntu/Dockerfile
Lines changed: 4 additions & 0 deletions b/‎.ci/docker/ubuntu/Dockerfile
Lines changed: 4 additions & 0 deletions
diff --git a/‎.github/workflows/doc-build.yml
Lines changed: 6 additions & 0 deletions b/‎.github/workflows/doc-build.yml
Lines changed: 6 additions & 0 deletions
diff --git a/‎CODEOWNERS
Lines changed: 8 additions & 8 deletions b/‎CODEOWNERS
Lines changed: 8 additions & 8 deletions
diff --git a/‎backends/apple/coreml/test/test_coreml_partitioner.py
Lines changed: 1 addition & 1 deletion b/‎backends/apple/coreml/test/test_coreml_partitioner.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/apple/mps/runtime/MPSBackend.mm
Lines changed: 5 additions & 2 deletions b/‎backends/apple/mps/runtime/MPSBackend.mm
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/arm/runtime/EthosUBackend.cpp
Lines changed: 5 additions & 2 deletions b/‎backends/arm/runtime/EthosUBackend.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/cadence/aot/remove_ops.py
Lines changed: 66 additions & 0 deletions b/‎backends/cadence/aot/remove_ops.py
Lines changed: 66 additions & 0 deletions
diff --git a/‎backends/cadence/aot/tests/test_remove_ops_passes.py
Lines changed: 52 additions & 0 deletions b/‎backends/cadence/aot/tests/test_remove_ops_passes.py
Lines changed: 52 additions & 0 deletions
diff --git a/‎backends/mediatek/runtime/NeuronBackend.cpp
Lines changed: 6 additions & 2 deletions b/‎backends/mediatek/runtime/NeuronBackend.cpp
Lines changed: 6 additions & 2 deletions
diff --git a/‎backends/qualcomm/_passes/decompose_any.py
Lines changed: 1 addition & 1 deletion b/‎backends/qualcomm/_passes/decompose_any.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/qualcomm/_passes/decompose_linalg_vector_norm.py
Lines changed: 4 additions & 2 deletions b/‎backends/qualcomm/_passes/decompose_linalg_vector_norm.py
Lines changed: 4 additions & 2 deletions
diff --git a/‎backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
Lines changed: 4 additions & 2 deletions b/‎backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
Lines changed: 4 additions & 2 deletions
diff --git a/‎backends/qualcomm/tests/utils.py
Lines changed: 3 additions & 1 deletion b/‎backends/qualcomm/tests/utils.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎backends/qualcomm/utils/utils.py
Lines changed: 1 addition & 1 deletion b/‎backends/qualcomm/utils/utils.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/transforms/test/test_rank_0_to_rank_1.py
Lines changed: 1 addition & 1 deletion b/‎backends/transforms/test/test_rank_0_to_rank_1.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/op_registry.py
Lines changed: 13 additions & 1 deletion b/‎backends/vulkan/op_registry.py
Lines changed: 13 additions & 1 deletion
diff --git a/‎backends/vulkan/runtime/VulkanBackend.cpp
Lines changed: 5 additions & 2 deletions b/‎backends/vulkan/runtime/VulkanBackend.cpp
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/vulkan/runtime/gen_vulkan_spv.py
Lines changed: 6 additions & 1 deletion b/‎backends/vulkan/runtime/gen_vulkan_spv.py
Lines changed: 6 additions & 1 deletion
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Abort on the first failing command and echo every command as it runs.
+set -o errexit
+set -o xtrace
+
+# Refresh the package index before installing.
+apt-get update
+
+# Install the OpenJDK 17 JDK, skipping recommended-only packages.
+apt-get install --yes --no-install-recommends openjdk-17-jdk
@@ -30,6 +30,10 @@ ARG BUCK2_VERSION
 COPY ./common/install_buck.sh install_buck.sh
 RUN bash ./install_buck.sh && rm install_buck.sh
 
+# Install java
+COPY ./common/install_java.sh install_java.sh
+RUN bash ./install_java.sh && rm install_java.sh
+
 # Setup user
 COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh
 
@@ -68,6 +68,12 @@ jobs:
         make html
         cd ..
 
+        # Build javadoc:
+        cd extension/android
+        ./gradlew javadoc
+        cp -rf build/docs/javadoc "${RUNNER_DOCS_DIR}"
+        cd ../..
+
         # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing.
         echo "GitHub Ref: ${GITHUB_REF}"
         if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then
 
@@ -15,7 +15,7 @@
 /backends/vulkan @SS-JIA
 /backends/xnnpack @digantdesai @mcr229
 
-/build @GregoryComer @dbort @kirklandsign
+/build @GregoryComer @kirklandsign
 
 /codegen @larryliu0820 @lucylq
 
@@ -47,32 +47,32 @@
 /extension/apple @shoumikhin
 /extension/aten_util @JacobSzwejbka
 /extension/benchmark @tarun292
-/extension/data_loader @JacobSzwejbka @lucylq @dbort
-/extension/evalue_util @GregoryComer @dbort
+/extension/data_loader @JacobSzwejbka @lucylq
+/extension/evalue_util @GregoryComer
 /extension/export_util @kimishpatel
 /extension/flat_tensor @lucylq
 /extension/gguf_util @larryliu0820
 /extension/kernel_util @kimishpatel @manuelcandales
 /extension/llm @jackzhxng @iseeyuan @larryliu0820
-/extension/memory_allocator @JacobSzwejbka @dbort
+/extension/memory_allocator @JacobSzwejbka
 /extension/module @shoumikhin
 /extension/parallel @kimishpatel
 /extension/pybindings @JacobSzwejbka @larryliu0820
 /extension/pytree @JacobSzwejbka
-/extension/runner_util @dbort
+# /extension/runner_util @dbort
 /extension/tensor @shoumikhin
-/extension/testing_util @dbort
+# /extension/testing_util @dbort
 /extension/threadpool @kimishpatel
 /extension/training @JacobSzwejbka
 
 /kernels @manuelcandales
 
 /profiler @tarun292 @Gasoonjia
 
-/runtime @dbort @JacobSzwejbka @lucylq
+/runtime @JacobSzwejbka @lucylq
 /runtime/backend @cccclai
 
-/schema @dbort @JacobSzwejbka @lucylq
+/schema @JacobSzwejbka @lucylq
 
 /scripts @GregoryComer
 
 
@@ -117,7 +117,7 @@ def forward(self, q, k, v, mask):
         v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
         mask = torch.randn(seq_len, max_seq_length)
         example_inputs = (q, k, v, mask)
-        ep = torch.export.export(model, example_inputs)
+        ep = torch.export.export(model, example_inputs, strict=True)
         coreml_partitioner = CoreMLPartitioner()
 
         # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
 
@@ -43,8 +43,11 @@ bool is_available() const override {
       BackendInitContext& context,
       FreeableBuffer* processed,
       ArrayRef<CompileSpec> compile_specs) const override {
-    auto executor = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-        context.get_runtime_allocator(), mps::delegate::MPSExecutor);
+    auto executor = context.get_runtime_allocator()->allocateInstance<mps::delegate::MPSExecutor>();
+    if (executor == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
     // NOTE: Since we use placement new and since this type is not trivially
     // destructible, we must call the destructor manually in destroy().
     new (executor) mps::delegate::MPSExecutor;
 
@@ -120,8 +120,11 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     }
 
     MemoryAllocator* allocator = context.get_runtime_allocator();
-    ExecutionHandle* handle =
-        ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(allocator, ExecutionHandle);
+    ExecutionHandle* handle = allocator->allocateInstance<ExecutionHandle>();
+    if (handle == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+
     handle->processed = processed;
 
     // Return the same buffer we were passed - this data will be
 
@@ -807,6 +807,72 @@ def remove_branched(
                 user.replace_all_uses_with(node.args[0])
 
 
+class RemoveCatFromSliceCopyPass(ExportPass):
+    """Bypass a cat whose output is sliced entirely within one cat input.
+
+    For ``slice_copy(cat([a, b, ...], dim), dim, start, end)`` with step 1,
+    when the requested range lies completely inside a single cat input, the
+    slice is rewired to read from that input directly, with start/end rebased
+    into that input's own coordinates. A cat that is left without users is
+    then removed by dead-code elimination.
+    """
+
+    def _remove_unused_cat(self, graph_module: torch.fx.GraphModule) -> None:
+        """Rewire eligible slice_copy nodes in place; nothing is returned."""
+        slice_copy_nodes = [
+            node
+            for node in graph_module.graph.nodes
+            if node.target == exir_ops.edge.aten.slice_copy.Tensor
+        ]
+        for slice_copy_node in slice_copy_nodes:
+            # Only positional arguments are interpreted (and rewritten) below,
+            # so leave nodes that carry keyword arguments untouched.
+            if slice_copy_node.kwargs:
+                continue
+            input_node, *other_args = slice_copy_node.args
+            # Pad missing trailing arguments with the defaults used by this
+            # pass: dim=0, start=None (0), end=None (dim size), step=1.
+            positional = tuple(other_args[:4])
+            slice_dim, start_idx, end_idx, step = (
+                positional + (0, None, None, 1)[len(positional) :]
+            )
+            if step != 1:
+                continue
+            if input_node.target != exir_ops.edge.aten.cat.default:
+                continue
+            slice_copy_dtype = slice_copy_node.meta["val"].dtype
+            cat_val = input_node.meta["val"]
+            if slice_copy_dtype != cat_val.dtype:
+                continue
+
+            # The cat dim is the optional second positional arg (default 0).
+            # It must be read as a scalar: comparing the args[1:] tuple with
+            # the integer slice dim would never match.
+            cat_dim = input_node.args[1] if len(input_node.args) > 1 else 0
+            if not (isinstance(cat_dim, int) and isinstance(slice_dim, int)):
+                continue
+            rank = len(cat_val.shape)
+            # Compare normalized dims so that e.g. -1 and rank - 1 agree.
+            if rank == 0 or cat_dim % rank != slice_dim % rank:
+                continue
+            dim = cat_dim % rank
+
+            dim_size = cat_val.shape[dim]
+            if start_idx is None:
+                start_idx = 0
+            if end_idx is None:
+                end_idx = dim_size
+            # Only static integer bounds can be reasoned about here.
+            if not all(isinstance(v, int) for v in (dim_size, start_idx, end_idx)):
+                continue
+            # Resolve negative indices and clamp both bounds to [0, dim_size].
+            if start_idx < 0:
+                start_idx += dim_size
+            if end_idx < 0:
+                end_idx += dim_size
+            start_idx = min(max(start_idx, 0), dim_size)
+            end_idx = min(max(end_idx, 0), dim_size)
+
+            base_idx = 0
+            for cat_input_node in input_node.args[0]:
+                cat_input_val = cat_input_node.meta["val"]
+                next_base_idx = base_idx + cat_input_val.shape[dim]
+
+                # check if the slice range is fully contained in this cat input
+                if (
+                    cat_input_val.dtype == slice_copy_dtype
+                    and base_idx <= start_idx
+                    and end_idx <= next_base_idx
+                ):
+                    # Rebase the bounds: the slice now indexes the cat input,
+                    # not the concatenated tensor.
+                    slice_copy_node.args = (
+                        cat_input_node,
+                        slice_dim,
+                        start_idx - base_idx,
+                        end_idx - base_idx,
+                        step,
+                    )
+                    break
+                # Always advance the offset, even for inputs that were
+                # rejected, so later inputs keep their correct position.
+                base_idx = next_base_idx
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        """Run the rewrite, drop now-unused nodes, then defer to ExportPass."""
+        self._remove_unused_cat(graph_module)
+        # Eliminate dead nodes before recompiling so the regenerated code
+        # reflects the final graph.
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return super().call(graph_module)
+
+
 # The following class consolidates functions to remove ops that are redundant
 # in Jarvis. Currently, each function in this class iterates over each node of
 # the graph module once. In future, we could consolidate them into a monolithic
 
@@ -22,6 +22,7 @@
 from executorch.backends.cadence.aot.remove_ops import (
     RemoveAliasCopyOpPass,
     RemoveBranchedQuantDequant,
+    RemoveCatFromSliceCopyPass,
     RemoveCloneOpPass,
     RemoveContiguousOpPass,
     RemoveDetachCopyPass,
@@ -741,3 +742,54 @@ def forward(self, x):
                 },
             )
         )
+
+    def test_remove_cat_from_slice_copy_all_removal(self) -> None:
+        # The slice [0, 1) along dim 0 lies entirely inside the first cat
+        # input, so the pass is expected to bypass the cat and leave it dead.
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                x1 = torch.cat((x, y), 0)  # (4, 4)
+                return torch.slice_copy(x1, dim=0, start=0, end=1)
+
+        inputs = tuple(torch.randn(2, 4) for _ in range(2))
+        graph_module = export_to_edge(M(), inputs).exported_program().graph_module
+        p = RemoveCatFromSliceCopyPass()
+        graph_module = cast(PassResult, p(graph_module)).graph_module
+
+        # Ensure the cat node was removed
+        self.assertEqual(count_node(graph_module, exir_ops.edge.aten.cat.default), 0)
+
+    def test_remove_cat_from_slice_copy_no_removal(self) -> None:
+        # The slice [0, 3) along dim 0 spans both 2-row cat inputs, so no
+        # single input can replace the cat and it must stay in the graph.
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                x1 = torch.cat((x, y), 0)  # (4, 4)
+                return torch.slice_copy(x1, dim=0, start=0, end=3)
+
+        inputs = tuple(torch.randn(2, 4) for _ in range(2))
+        graph_module = export_to_edge(M(), inputs).exported_program().graph_module
+        p = RemoveCatFromSliceCopyPass()
+        graph_module = cast(PassResult, p(graph_module)).graph_module
+
+        # Ensure the cat node was kept
+        self.assertEqual(count_node(graph_module, exir_ops.edge.aten.cat.default), 1)
+
+    def test_remove_cat_from_slice_copy_zero_range(self) -> None:
+        # An empty slice [0, 0) along dim 0 still falls within the first cat
+        # input, so the pass is expected to bypass the cat here as well.
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, y):
+                x1 = torch.cat((x, y), 0)  # (4, 4)
+                return torch.slice_copy(x1, dim=0, start=0, end=0)
+
+        inputs = tuple(torch.randn(2, 4) for _ in range(2))
+        graph_module = export_to_edge(M(), inputs).exported_program().graph_module
+        p = RemoveCatFromSliceCopyPass()
+        graph_module = cast(PassResult, p(graph_module)).graph_module
+
+        # Ensure the cat node was removed
+        self.assertEqual(count_node(graph_module, exir_ops.edge.aten.cat.default), 0)
@@ -68,8 +68,12 @@ Result<DelegateHandle*> NeuronBackend::init(
       processed->size());
 
   MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
-  NeuronExecuTorchDelegate* delegate = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-      runtime_allocator, NeuronExecuTorchDelegate);
+  NeuronExecuTorchDelegate* delegate =
+      runtime_allocator->allocateInstance<NeuronExecuTorchDelegate>();
+  if (delegate == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
+
   new (delegate) NeuronExecuTorchDelegate();
 
   if (delegate == nullptr) {
 
@@ -41,7 +41,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                 keepdim = node.args[2] if len(node.args) > 2 else False
                 model = Any(dim, keepdim)
                 edge_mgr = to_edge(
-                    torch.export.export(model, (node.args[0].meta["val"],))
+                    torch.export.export(model, (node.args[0].meta["val"],), strict=True)
                 )
                 decomposed_module = edge_mgr.exported_program()
 
 
@@ -46,11 +46,13 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
                 model = LinalgVectorNorm(ord, dim, keepdim)
                 if self.aten_dialect_capture:
                     decomposed_module = torch.export.export(
-                        model, (node.args[0].meta["val"],)
+                        model, (node.args[0].meta["val"],), strict=True
                     ).module()
                 else:
                     edge_mgr = to_edge(
-                        torch.export.export(model, (node.args[0].meta["val"],))
+                        torch.export.export(
+                            model, (node.args[0].meta["val"],), strict=True
+                        )
                     )
                     decomposed_module = edge_mgr.exported_program()
 
 
@@ -66,8 +66,10 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
 
   // Create QnnManager
   MemoryAllocator* runtime_allocator = context.get_runtime_allocator();
-  QnnManager* qnn_manager =
-      ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(runtime_allocator, QnnManager);
+  QnnManager* qnn_manager = runtime_allocator->allocateInstance<QnnManager>();
+  if (qnn_manager == nullptr) {
+    return Error::MemoryAllocationFailed;
+  }
 
   // NOTE: Since we use placement new and since this type is not trivially
   // destructible, we must call the destructor manually in destroy().
 
@@ -526,7 +526,9 @@ def get_qdq_module(
         dynamic_shapes: Dict = None,
         bypass_check: bool = False,
     ) -> torch.fx.GraphModule:
-        m = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes).module()
+        m = torch.export.export(
+            module, inputs, dynamic_shapes=dynamic_shapes, strict=True
+        ).module()
 
         quantizer = QnnQuantizer()
         quantizer.add_custom_quant_annotations(custom_quant_annotations)
 
@@ -452,7 +452,7 @@ def capture_program(
     dynamic_shapes: Dict = None,
 ) -> exir.ExirExportedProgram:
     module = _preprocess_module(module, inputs)
-    ep = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes)
+    ep = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes, strict=True)
     decomposed_ep = ep.run_decompositions(get_decomp_table())
     core_ep = ExirExportedProgram(decomposed_ep, False)
     core_ep.transform(TensorI64toI32(edge_program=core_ep))
 
@@ -17,7 +17,7 @@ def forward(self, x, y):
         model.eval()
 
         example_inputs = (torch.tensor(1.0), torch.tensor(2.0))
-        aten = torch.export.export(model, example_inputs)
+        aten = torch.export.export(model, example_inputs, strict=True)
 
         # Check that the input rank is 0
         for node in aten.graph.nodes:
 
@@ -530,7 +530,6 @@ def register_view_op(features: OpFeatures):
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
         exir_ops.edge.aten.select_copy.int,
-        exir_ops.edge.aten.slice_copy.Tensor,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.split_with_sizes_copy.default,
@@ -557,6 +556,19 @@ def register_ported_op(features: OpFeatures):
     return features
 
 
+# Ops registered here get a texture implementation whose valid packed dims
+# are `all_packed_dims`.
+@update_features([exir_ops.edge.aten.slice_copy.Tensor])  # Indexing and lookup
+def register_ported_op_all_packed_dims(features: OpFeatures):
+    """Attach texture features built from `all_packed_dims` and return them."""
+    texture_features = TextureImplFeatures(valid_packed_dims=all_packed_dims)
+    features.texture_impl = texture_features
+    return features
+
+
 # Ported ops that support their own prepacking.
 @update_features(
     [
 
@@ -510,8 +510,11 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
       BackendInitContext& context,
       FreeableBuffer* processed,
       ArrayRef<CompileSpec> compile_specs) const override {
-    ComputeGraph* compute_graph = ET_ALLOCATE_INSTANCE_OR_RETURN_ERROR(
-        context.get_runtime_allocator(), ComputeGraph);
+    ComputeGraph* compute_graph =
+        context.get_runtime_allocator()->allocateInstance<ComputeGraph>();
+    if (compute_graph == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
 
     new (compute_graph) ComputeGraph(get_graph_config(compile_specs));
 
 
@@ -769,7 +769,12 @@ def process_shader(shader_paths_pair):
                     + self.glslc_flags.split()
                 )
 
-                subprocess.check_call(cmd)
+                try:
+                    subprocess.check_call(cmd)
+                except subprocess.CalledProcessError as e:
+                    raise RuntimeError(
+                        f"Failed to compile {os.getcwd()}/{glsl_out_path}"
+                    ) from e
 
                 return (spv_out_path, glsl_out_path)
Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:`
`41`	`41`	`keepdim = node.args[2] if len(node.args) > 2 else False`
`42`	`42`	`model = Any(dim, keepdim)`
`43`	`43`	`edge_mgr = to_edge(`
`44`		`- torch.export.export(model, (node.args[0].meta["val"],))`
	`44`	`+ torch.export.export(model, (node.args[0].meta["val"],), strict=True)`
`45`	`45`	`)`
`46`	`46`	`decomposed_module = edge_mgr.exported_program()`
`47`	`47`