Commit 653cc15

Merge remote-tracking branch 'origin/main' into gh/swolchok/19/base
2 parents: 858e9fd + 0c6a77e

File tree: 166 files changed (+2674 / -1165 lines)

.ci/docker/ci_commit_pins/torchao.txt

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+0916b5b29b092afcbf2b898caae49abe80662bac

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion

@@ -91,7 +91,7 @@ run_and_verify() {
   RESULT=$(cat result.txt)
   # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
   if [[ "$(uname)" == "Darwin" ]]; then
-    EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress on a basketball court. There are several players on the court, with one player in the foreground holding a basketball, and"
+    EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
   else
     # set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
     EXPECTED_PREFIX="ASSISTANT:"

.github/workflows/trunk.yml

Lines changed: 28 additions & 27 deletions

@@ -270,33 +270,34 @@ jobs:
         # Test llama2
         PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}"

-  test-llava-runner-macos:
-    name: test-llava-runner-macos
-    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-    strategy:
-      fail-fast: false
-    with:
-      runner: macos-m1-stable
-      python-version: '3.11'
-      submodules: 'true'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 900
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-macos.sh "cmake"
-
-        # install Llava requirements
-        bash examples/models/llama2/install_requirements.sh
-        bash examples/models/llava/install_requirements.sh
-
-        # run python unittest
-        python -m unittest examples.models.llava.test.test_llava
-
-        # run e2e (export, tokenizer and runner)
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh Release
+  # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
+  # test-llava-runner-macos:
+  #   name: test-llava-runner-macos
+  #   uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+  #   strategy:
+  #     fail-fast: false
+  #   with:
+  #     runner: macos-14-xlarge
+  #     python-version: '3.11'
+  #     submodules: 'true'
+  #     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+  #     timeout: 900
+  #     script: |
+  #       BUILD_TOOL=cmake
+
+  #       bash .ci/scripts/setup-conda.sh
+  #       # Setup MacOS dependencies as there is no Docker support on MacOS atm
+  #       GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
+
+  #       # install Llava requirements
+  #       ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
+  #       ${CONDA_RUN} bash examples/models/llava/install_requirements.sh
+
+  #       # run python unittest
+  #       ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava
+
+  #       # run e2e (export, tokenizer and runner)
+  #       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release

   test-qnn-model:
     name: test-qnn-model

CMakeLists.txt

Lines changed: 38 additions & 38 deletions

@@ -505,7 +505,8 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
   )
   target_link_libraries(executorch_no_prim_ops_shared PRIVATE program_schema)
   if(DL_LIBRARY_EXISTS)
-    target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl) # For dladdr()
+    # For dladdr()
+    target_link_libraries(executorch_no_prim_ops_shared PRIVATE dl)
   endif()
   target_include_directories(
     executorch_no_prim_ops_shared PUBLIC ${_common_include_directories}

@@ -541,7 +542,7 @@ target_link_options_shared_lib(executorch)
 # operators necessary for the models that will run.
 #
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()

 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)

@@ -584,56 +585,56 @@ if(EXECUTORCH_BUILD_GTESTS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest)
 endif()

-if(EXECUTORCH_BUILD_SDK)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-      ON
-      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-  )
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
+if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_APPLE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
+if(EXECUTORCH_BUILD_CADENCE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
+if(EXECUTORCH_BUILD_COREML)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_MODULE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
+if(EXECUTORCH_BUILD_MPS)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
 endif()

 if(EXECUTORCH_BUILD_NEURON)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/mediatek)
 endif()

-if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
+if(EXECUTORCH_BUILD_QNN)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm)
 endif()

 if(EXECUTORCH_BUILD_XNNPACK)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
 endif()

-if(EXECUTORCH_BUILD_QNN)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/qualcomm)
+if(EXECUTORCH_BUILD_SDK)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
+      ON
+      CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
+  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()

-if(EXECUTORCH_BUILD_ARM_BAREMETAL)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+if(EXECUTORCH_BUILD_EXTENSION_APPLE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
 endif()

-if(EXECUTORCH_BUILD_MPS)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/mps)
+if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
 endif()

-if(EXECUTORCH_BUILD_COREML)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/apple/coreml)
+if(EXECUTORCH_BUILD_EXTENSION_MODULE)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

-if(EXECUTORCH_BUILD_CADENCE)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cadence)
+if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()

 if(EXECUTORCH_BUILD_PYBIND)

@@ -690,9 +691,8 @@ if(EXECUTORCH_BUILD_PYBIND)
   )
   # util lib
   add_library(
-    util
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp
+    util ${CMAKE_CURRENT_SOURCE_DIR}/extension/evalue_util/print_evalue.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/extension/aten_util/aten_bridge.cpp
   )
   target_include_directories(
     util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}

@@ -741,12 +741,14 @@ if(EXECUTORCH_BUILD_PYBIND)
   else()
     set_target_properties(
       portable_lib
-      PROPERTIES # Assume <executorch> is the root `site-packages/executorch`
-                 # Need to add <executorch>/extension/llm/custom_ops for
-                 # libcustom_ops_aot_lib
-                 # Need to add <executorch>/kernels/quantized for
-                 # libquantized_ops_aot_lib
-                 BUILD_RPATH "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
+      PROPERTIES
+        # Assume <executorch> is the root `site-packages/executorch`
+        # Need to add <executorch>/extension/llm/custom_ops for
+        # libcustom_ops_aot_lib
+        # Need to add <executorch>/kernels/quantized for
+        # libquantized_ops_aot_lib
+        BUILD_RPATH
+        "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
     )
   endif()

@@ -757,9 +759,7 @@ endif()

 if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops
-  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
 endif()

 if(EXECUTORCH_BUILD_KERNELS_QUANTIZED)

backends/apple/coreml/CMakeLists.txt

Lines changed: 3 additions & 3 deletions

@@ -14,10 +14,10 @@ if(NOT EXECUTORCH_ROOT)
 endif()

 if(EXECUTORCH_BUILD_SDK)
-  # protobuf requires frtti
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti" )
+  # protobuf requires frtti
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frtti")
 endif()
-
+
 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)

 # inmemoryfs sources

backends/apple/mps/test/test_mps_utils.py

Lines changed: 1 addition & 1 deletion

@@ -229,7 +229,7 @@ def lower_module_and_test_output(
     compile_specs = [CompileSpec("use_fp16", bytes([use_fp16]))]

     if use_partitioner:
-        logging.info(f"Edge IR graph:\n{edge_program.exported_program().graph}")
+        logging.info(f"Edge IR graph:\n{edge_program.exported_program()}")
         delegated_program = edge_program
         delegated_program = edge_program.to_backend(
             MPSPartitioner(compile_specs=compile_specs)

backends/arm/arm_partitioner.py

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.addmm.default,
             exir_ops.edge.aten.expand_copy.default,
             exir_ops.edge.aten.cat.default,
+            exir_ops.edge.aten.bmm.default,
             exir_ops.edge.aten.permute_copy.default,
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.convolution.default,

backends/arm/operators/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@
     op_addmm,
     op_avg_pool2d,
     op_batch_norm,
+    op_bmm,
     op_cat,
     op_conv2d,
     op_dequant,

backends/arm/operators/op_bmm.py

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+
+import serializer.tosa_serializer as ts
+import torch.fx
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args
+from executorch.backends.arm.tosa_utils import get_two_inputs
+from serializer.tosa_serializer import TosaOp
+
+
+@register_node_visitor
+class BMMVisitor(NodeVisitor):
+    target = "aten.bmm.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+        input0, input1 = get_two_inputs(node)
+
+        # aten.bmm maps directly to MATMUL
+        # NOTE: For now, only INT8 & FP32 is supported
+
+        # For INT8, we need to get the zero points and add an intermediate tensor
+        # for a later rescale.
+        if is_quant_node:
+            input0_zp = get_quant_node_args(input0).zp
+            input1_zp = get_quant_node_args(input1).zp
+            bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
+            bmm_output_name = bmm_result.name
+        else:
+            input0_zp, input1_zp = 0, 0
+            bmm_output_name = output.name
+
+        # Add the MATMUL to the TOSA graph.
+        attr = ts.TosaSerializerAttribute()
+        attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp)
+
+        tosa_graph.addOperator(
+            TosaOp.Op().MATMUL,
+            [input0.name, input1.name],
+            [bmm_output_name],
+            attr,
+        )
+
+        # As INT8 accumulates into INT32, we need to rescale it back to INT8
+        if is_quant_node:
+            input0_q_params = get_quant_node_args(input0)
+            input1_q_params = get_quant_node_args(input1)
+            output_q_params = get_quant_node_args(list(node.users)[0])
+
+            final_output_scale = (
+                input0_q_params.scale * input1_q_params.scale
+            ) / output_q_params.scale
+
+            build_rescale(
+                tosa_fb=tosa_graph,
+                scale=final_output_scale,
+                input_node=bmm_result,
+                output_name=output.name,
+                output_type=ts.DType.INT8,
+                output_shape=bmm_result.shape,
+                input_zp=0,
+                output_zp=output_q_params.zp,
+                is_double_round=False,
+            )

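The INT8 path above follows the standard quantized-matmul identity: the INT32 accumulator holds the product of two zero-point-corrected INT8 inputs, so converting it into the output's INT8 representation takes a single multiplier of (input0_scale * input1_scale) / output_scale, which is exactly the final_output_scale passed to build_rescale. A minimal sketch of that arithmetic, with made-up quantization parameters (only the formula itself comes from the diff):

    # Hypothetical quantization scales for illustration; not from the commit.
    input0_scale = 0.02   # scale of the first INT8 input
    input1_scale = 0.05   # scale of the second INT8 input
    output_scale = 0.004  # scale of the INT8 output

    # The INT32 accumulator is implicitly in units of input0_scale * input1_scale;
    # dividing by output_scale re-expresses it in the output's INT8 scale.
    final_output_scale = (input0_scale * input1_scale) / output_scale
    print(final_output_scale)  # 0.25

build_rescale then applies this multiplier, together with the output zero point, to bring the accumulator back into INT8 range.
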
backends/arm/operators/op_softmax.py

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ def define_node(
         input_name = inputs[0].name
         dim_order = inputs[0].dim_order
         input_shape = tosa_shape(inputs[0].shape, dim_order)
-        dim_value = dim_order.index(inputs[1].number)
+        dim_value = dim_order.index(inputs[1].number % len(dim_order))

        ## softmax = exp(logits - max(logits)) / reduce_sum(exp(logits - max(logits)), -1)
        # FP32

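The modulo added here normalizes negative dimension indices before the dim_order lookup: softmax accepts dims such as -1, which .index() cannot find in a tuple of non-negative positions. A small self-contained illustration of the normalization (the rank and dim values are made up, not from the commit):

    # For a rank-4 tensor, dim=-1 and dim=3 refer to the same axis.
    rank = 4                  # plays the role of len(dim_order)
    dim_order = (0, 1, 2, 3)
    for dim in (-1, 3):
        normalized = dim % rank  # -1 % 4 == 3 in Python
        print(dim_order.index(normalized))  # prints 3 both times
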
backends/arm/quantizer/quantization_annotation/mm_annotator.py

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def _annotate_mm(
     quantization_config: QuantizationConfig,
     filter_fn: Optional[Callable[[Node], bool]] = None,
 ) -> Optional[List[List[Node]]]:
-    mm_partitions = get_source_partitions(gm.graph, [torch.mm], filter_fn)
+    mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn)
     mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values()))
     annotated_partitions = []
     for mm_partition in mm_partitions:

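For context on why torch.bmm joins torch.mm in the source-partition lookup: the two ops differ only in a leading batch dimension, so the same quantization annotation logic applies to both. A quick shape check (illustrative only, not part of the commit):

    import torch

    # torch.mm multiplies two 2-D matrices.
    a2d, b2d = torch.randn(3, 4), torch.randn(4, 5)
    print(torch.mm(a2d, b2d).shape)   # torch.Size([3, 5])

    # torch.bmm adds a leading batch dimension; aten.bmm is what the new
    # Arm BMMVisitor above lowers to TOSA MATMUL.
    a3d, b3d = torch.randn(8, 3, 4), torch.randn(8, 4, 5)
    print(torch.bmm(a3d, b3d).shape)  # torch.Size([8, 3, 5])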