pytorch
diff --git a/‎.ci/scripts/test_llava.sh
Lines changed: 98 additions & 0 deletions b/‎.ci/scripts/test_llava.sh
Lines changed: 98 additions & 0 deletions
diff --git a/‎.github/workflows/pull.yml
Lines changed: 4 additions & 1 deletion b/‎.github/workflows/pull.yml
Lines changed: 4 additions & 1 deletion
diff --git a/‎CMakeLists.txt
Lines changed: 6 additions & 0 deletions b/‎CMakeLists.txt
Lines changed: 6 additions & 0 deletions
diff --git a/‎backends/apple/coreml/test/test_coreml_partitioner.py
Lines changed: 13 additions & 5 deletions b/‎backends/apple/coreml/test/test_coreml_partitioner.py
Lines changed: 13 additions & 5 deletions
diff --git a/‎backends/arm/arm_partitioner.py
Lines changed: 1 addition & 0 deletions b/‎backends/arm/arm_partitioner.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/operators/op_slice.py
Lines changed: 2 additions & 0 deletions b/‎backends/arm/operators/op_slice.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/passes/arm_pass_manager.py
Lines changed: 4 additions & 0 deletions b/‎backends/arm/passes/arm_pass_manager.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/arm/passes/convert_split_to_slice.py
Lines changed: 70 additions & 0 deletions b/‎backends/arm/passes/convert_split_to_slice.py
Lines changed: 70 additions & 0 deletions
diff --git a/‎backends/arm/quantizer/arm_quantizer_utils.py
Lines changed: 4 additions & 0 deletions b/‎backends/arm/quantizer/arm_quantizer_utils.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/arm/test/ops/test_slice.py
Lines changed: 1 addition & 1 deletion b/‎backends/arm/test/ops/test_slice.py
Lines changed: 1 addition & 1 deletion
@@ -0,0 +1,98 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+# shellcheck source=/dev/null
+
+# Fall back to python3 when the caller left PYTHON_EXECUTABLE unset or empty.
+: "${PYTHON_EXECUTABLE:=python3}"
+
+# Configure the top-level CMake project into ./cmake-out as a Debug build with
+# the extension module, data loader, custom/optimized/quantized kernels and
+# XNNPACK switched on, then build and run the `install` target.
+# EXECUTORCH_DO_NOT_USE_CXX11_ABI=ON asks the build to define
+# _GLIBCXX_USE_CXX11_ABI=0 (see the option added to CMakeLists.txt).
+cmake_install_executorch_libraries() {
+    cmake                                               \
+        -DCMAKE_INSTALL_PREFIX=cmake-out                \
+        -DCMAKE_BUILD_TYPE=Debug                        \
+        -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON          \
+        -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON     \
+        -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON            \
+        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON         \
+        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON         \
+        -DEXECUTORCH_BUILD_XNNPACK=ON                   \
+        -DEXECUTORCH_DO_NOT_USE_CXX11_ABI=ON            \
+        -Bcmake-out .
+
+
+    # Build with 9 parallel jobs and install into the cmake-out prefix set above.
+    cmake --build cmake-out -j9 --target install --config Debug
+}
+
+# Configure and build the llava runner from examples/models/llava into
+# cmake-out/examples/models/llava (Debug, custom + optimized kernels, XNNPACK).
+# The interpreter's site-packages directory is handed to CMake as
+# CMAKE_PREFIX_PATH, presumably so find_package() can locate Python-installed
+# packages such as torch -- confirm against examples/models/llava/CMakeLists.txt.
+cmake_build_llava_runner() {
+    local dir=examples/models/llava
+    local python_lib
+    # distutils was removed in Python 3.12 (PEP 632); sysconfig is the supported
+    # way to query the pure-Python site-packages directory.
+    python_lib=$($PYTHON_EXECUTABLE -c 'import sysconfig; print(sysconfig.get_path("purelib"))')
+
+    cmake                                       \
+        -DCMAKE_INSTALL_PREFIX=cmake-out        \
+        -DCMAKE_BUILD_TYPE=Debug                \
+        -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON    \
+        -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+        -DEXECUTORCH_BUILD_XNNPACK=ON           \
+        -DCMAKE_PREFIX_PATH="$python_lib"       \
+        -B"cmake-out/${dir}"                    \
+        "${dir}"
+
+
+    cmake --build "cmake-out/${dir}" -j9 --config Debug
+}
+
+# Export Llava to llava.pte through the export_llava module. Only the variant
+# without the custom op is exported for now.
+# NOTE(review): --with-artifacts is presumably what produces the image.pt and
+# tokenizer.bin files that run_and_verify checks for -- confirm in export_llava.
+export_llava() {
+    echo "Starting to export Llava. This will take about 6 mins"
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
+}
+
+# Run the llava runner on the exported artifacts and check its output.
+#
+# Requires llava.pte, image.pt and tokenizer.bin in the current directory and
+# the runner binary at cmake-out/examples/models/llava/llava_main.
+# Writes the runner's stdout to result.txt.
+# Exits 1 when a required file is missing or the output does not contain the
+# expected prefix; exits 0 on success.
+run_and_verify() {
+    local artifact
+    NOW=$(date +"%H:%M:%S")
+    echo "Starting to run llava runner at ${NOW}"
+    if [[ ! -f "llava.pte" ]]; then
+        echo "Export failed. Abort"
+        exit 1
+    fi
+    for artifact in image.pt tokenizer.bin; do
+        if [[ ! -f "${artifact}" ]]; then
+            echo "${artifact} is missing."
+            exit 1
+        fi
+    done
+    # Keep the arguments in an array so each one reaches the runner as exactly
+    # one word, instead of relying on unquoted word splitting of a string.
+    RUNTIME_ARGS=(
+        --model_path=llava.pte
+        --tokenizer_path=tokenizer.bin
+        --image_path=image.pt
+        --prompt=ASSISTANT:
+        --temperature=0
+        --seq_len=650
+    )
+    cmake-out/examples/models/llava/llava_main "${RUNTIME_ARGS[@]}" > result.txt
+    # verify result.txt
+    RESULT=$(cat result.txt)
+    # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
+    EXPECTED_PREFIX="ASSISTANT:"
+    echo "Expected result prefix: ${EXPECTED_PREFIX}"
+    echo "Actual result: ${RESULT}"
+    if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
+        echo "Success"
+        exit 0
+    fi
+    echo "Failure; results not the same"
+    exit 1
+}
+
+# End-to-end pipeline: install the ExecuTorch libraries, build the llava runner,
+# export the model, then run the runner and verify its output. `set -e` at the
+# top of the script aborts on the first failing step.
+cmake_install_executorch_libraries
+cmake_build_llava_runner
+export_llava
+run_and_verify
@@ -187,7 +187,7 @@ jobs:
         # Test selective build
         PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
 
-  test-export-llava-linux:
+  test-llava-runner-linux:
     name: test-export-llava-linux
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     strategy:
@@ -215,6 +215,9 @@ jobs:
         # run python unittest
         python -m unittest examples.models.llava.test.test_llava
 
+        # run e2e (export, tokenizer and runner)
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh
+
   test-quantized-aot-lib-linux:
     name: test-quantized-aot-lib-linux
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
 
@@ -130,6 +130,12 @@ if(EXECUTORCH_ENABLE_EVENT_TRACER)
   add_definitions(-DET_EVENT_TRACER_ENABLED)
 endif()
 
+# Opt-in switch (default OFF): when ON, -D_GLIBCXX_USE_CXX11_ABI=0 is added to
+# the compile definitions. .ci/scripts/test_llava.sh turns it on.
+# NOTE(review): presumably needed to link against binaries built with the
+# pre-C++11 libstdc++ ABI -- confirm which dependency requires it.
+option(EXECUTORCH_DO_NOT_USE_CXX11_ABI "Define _GLIBCXX_USE_CXX11_ABI=0 if ON"
+       OFF
+)
+if(EXECUTORCH_DO_NOT_USE_CXX11_ABI)
+  add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
+endif()
 # -ffunction-sections -fdata-sections: breaks function and data into sections so
 # they can be properly gc'd. -s: strip symbol. -fno-exceptions -fno-rtti:
 # disables exceptions and runtime type.
 
@@ -68,15 +68,23 @@ def test_vit_skip_conv(self):
             )
         )
 
+        conv_block = ["aten.convolution.default", "executorch_call_delegate"]
+        safe_softmax_block = [
+            "getitem",
+            "getitem",
+            "getitem",
+            "getitem",
+            "aten.any.dim",
+            "executorch_call_delegate",
+        ]
+        final_block = ["getitem"]
+        total = conv_block + 12 * safe_softmax_block + final_block
+
         assert [
             node.target.__name__
             for node in delegated_program_manager.exported_program().graph.nodes
             if node.op == "call_function"
-        ] == [
-            "aten.convolution.default",
-            "executorch_call_delegate",
-            "getitem",
-        ]
+        ] == total
 
 
 if __name__ == "__main__":
 
@@ -43,6 +43,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.convolution.default,
             exir_ops.edge.aten.div.Tensor,
+            exir_ops.edge.aten.split_with_sizes_copy.default,
             exir_ops.edge.aten.full.default,
             exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
             exir_ops.edge.aten.avg_pool2d.default,
 
@@ -40,6 +40,8 @@ def define_node(
         shape = input_node.shape
         dim = dim.number
         end = (shape[dim] + end.number) % shape[dim]
+        if end == 0:
+            end = shape[dim]
         size = end - start.number
         assert size > 0
         assert size <= shape[dim]
 
@@ -12,6 +12,9 @@
 from executorch.backends.arm.passes.convert_expand_copy_to_repeat import (
     ConvertExpandCopyToRepeatPass,
 )
+from executorch.backends.arm.passes.convert_split_to_slice import (
+    ConvertSplitToSlicePass,
+)
 from executorch.backends.arm.passes.remove_clone_pass import RemoveClonePass
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.pass_manager import PassManager
@@ -28,6 +31,7 @@ def transform_to_backend_pipeline(
         """Apply passes before transforming program to backend"""
         self.add_pass(RemoveClonePass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
+        self.add_pass(ConvertSplitToSlicePass())
         for spec in compile_spec:
             if spec.key == "permute_memory_format":
                 memory_format = spec.value.decode()
 
@@ -0,0 +1,70 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.fx
+from executorch.backends.arm.tosa_mapping import extract_tensor_meta
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class ConvertSplitToSlicePass(ExportPass):
+    """
+    Replace a split operation with many slice operations.
+
+    Every ``getitem`` user of a split node is rewired to its own ``slice_copy``
+    node that reads the matching range directly from the split's input. The
+    split node is left without users so dead-code elimination can drop it.
+
+    Both listed overloads are handled: ``split_with_sizes_copy`` carries a list
+    of section lengths, while ``split_copy.Tensor`` carries one integer chunk
+    size (the last chunk is smaller when the dimension is not divisible).
+    """
+
+    split_ops = (
+        exir_ops.edge.aten.split_with_sizes_copy.default,
+        exir_ops.edge.aten.split_copy.Tensor,
+    )
+    slice = exir_ops.edge.aten.slice_copy.Tensor
+
+    @staticmethod
+    def _section_lengths(split_arg, dim_size):
+        """Return the per-output lengths described by a split node's argument.
+
+        Args:
+            split_arg: Either a sequence of section lengths or a single int
+                chunk size.
+            dim_size: Size of the dimension being split.
+
+        Returns:
+            A list with one length per output of the split.
+        """
+        if isinstance(split_arg, int):
+            # Equally sized chunks, plus a shorter trailing chunk if needed.
+            assert split_arg > 0, "Split size must be positive."
+            full_chunks, remainder = divmod(dim_size, split_arg)
+            return [split_arg] * full_chunks + ([remainder] if remainder else [])
+        return list(split_arg)
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        """Rewrite every split node in ``graph_module`` into slice nodes.
+
+        Returns:
+            A ``PassResult`` wrapping the recompiled graph module; it is always
+            reported as modified.
+        """
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target not in self.split_ops:
+                continue
+
+            # Get useful variables
+            split_node = node
+            input_node = split_node.all_input_nodes[0]
+            # Copy: the users mapping changes while outputs are rewired below.
+            output_nodes = split_node.users.copy()
+            _, shape, _ = extract_tensor_meta(input_node.meta)
+            rank = len(shape)
+            dim = split_node.args[2] if len(split_node.args) > 2 else 0
+            # Normalize a negative dim to its non-negative equivalent.
+            dim = (dim + rank) % rank
+            split_lengths = self._section_lengths(split_node.args[1], shape[dim])
+
+            assert (
+                sum(split_lengths) == shape[dim]
+            ), "Given split lengths don't sum up to the size of the dimension."
+
+            # Convert split argument 'split_lengths' to slice arguments start and end.
+            starts = []
+            ends = []
+            offset = 0
+            for split_length in split_lengths:
+                starts.append(offset)
+                offset += split_length
+                ends.append(offset)
+
+            # Output nodes are of type getitem
+            # Create one slice node for each output node with matching arguments.
+            with graph_module.graph.inserting_before(split_node):
+                for output_node in output_nodes:
+                    index = output_node.args[1]
+                    slice_node = graph.create_node(
+                        "call_function",
+                        self.slice,
+                        (input_node, dim, starts[index], ends[index]),
+                    )
+                    # Reuse the split's metadata, narrowing "val" to this output.
+                    slice_node.meta = split_node.meta.copy()
+                    slice_node.meta["val"] = slice_node.meta["val"][index]
+                    output_node.replace_input_with(split_node, slice_node)
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
@@ -9,6 +9,7 @@
 # Utility functions for ArmQuantizer
 #
 
+import operator
 from typing import Callable, cast, List
 
 import torch
@@ -141,8 +142,11 @@ def is_share_obs_or_fq_op(op: Callable) -> bool:
         torch.ops.aten.view_copy.default,
         torch.ops.aten.view.default,
         torch.ops.aten.slice.Tensor,
+        torch.ops.aten.split.Tensor,
+        torch.ops.aten.split_with_sizes.default,
         torch.ops.aten.flatten.using_ints,
         torch.ops.aten.dropout.default,
+        operator.getitem,
     ]
 
 
 
@@ -33,7 +33,7 @@ def forward(self, x: torch.Tensor):
             elif x.dim() == 3:
                 return x[0:7, 0:1, 0:8]
             elif x.dim() == 4:
-                return x[:, 2:5, 3:5, 4:5]
+                return x[:, 2:5, 3:5, 4:10]
 
     def _test_slice_tosa_MI_pipeline(
         self, module: torch.nn.Module, test_data: torch.Tensor