Commit 5392bdb

Author: morelos (committed)

Update on "[ET-VK][Ops] aten.tan.default in unary_ops"

Adds tan to unary_ops, albeit trivially, as it doesn't already exist.

Differential Revision: [D75112807](https://our.internmc.facebook.com/intern/diff/D75112807/)

[ghstack-poisoned]

2 parents 5bbeea3 + 67e959a commit 5392bdb
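
For context, the commit title refers to registering the element-wise tangent op in the Vulkan backend's unary_ops. A minimal, hypothetical export sketch (module name and shapes are illustrative, and the Vulkan lowering itself is not shown here) of a graph that would contain aten.tan.default:

import torch

class TanModule(torch.nn.Module):  # illustrative toy module, not from this commit
    def forward(self, x):
        return torch.tan(x)

# torch.export traces the module into an ATen graph; the resulting graph
# contains aten.tan.default, the op this stack maps to a Vulkan unary op.
ep = torch.export.export(TanModule(), (torch.randn(2, 3),))
print(ep.graph)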

49 files changed (+686 −213 lines)

.github/workflows/build-presets.yml

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [macos-arm64, pybind, llm]
+        preset: [macos, ios, ios-simulator, pybind, llm]
     with:
       job-name: build
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -39,7 +39,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [pybind, llm]
+        preset: [linux, pybind, llm]
         runner: [linux.2xlarge, linux.arm64.2xlarge]
         docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
         # Excluding specific runner + docker image combinations that don't make sense:

.lintrunner.toml

Lines changed: 0 additions & 1 deletion

@@ -390,7 +390,6 @@ exclude_patterns = [
     "backends/arm/test/ops/**",
     "backends/vulkan/quantizer/**",
     "backends/vulkan/test/**",
-    "backends/cadence/aot/quantizer/**",
     "backends/qualcomm/quantizer/**",
     "examples/qualcomm/**",
     "backends/xnnpack/quantizer/**",

CMakePresets.json

Lines changed: 51 additions & 3 deletions

@@ -7,13 +7,13 @@
       "binaryDir": "${sourceDir}/cmake-out"
     },
     {
-      "name": "macos-arm64",
-      "displayName": "Build everything buildable on macOS arm64",
+      "name": "macos",
+      "displayName": "Build everything buildable on macOS",
       "inherits": ["common"],
       "generator": "Xcode",
       "cacheVariables": {
         "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
-        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos-arm64.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos.cmake",
         "PLATFORM": "MAC_ARM64",
         "DEPLOYMENT_TARGET": "10.15"
       },
@@ -23,6 +23,54 @@
         "rhs": "Darwin"
       }
     },
+    {
+      "name": "ios",
+      "displayName": "Build everything buildable on iOS",
+      "inherits": ["common"],
+      "generator": "Xcode",
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/ios.cmake",
+        "PLATFORM": "OS64",
+        "DEPLOYMENT_TARGET": "17.0"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Darwin"
+      }
+    },
+    {
+      "name": "ios-simulator",
+      "displayName": "Build everything buildable on iOS simulator",
+      "inherits": ["common"],
+      "generator": "Xcode",
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/ios.cmake",
+        "PLATFORM": "SIMULATORARM64",
+        "DEPLOYMENT_TARGET": "17.0"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Darwin"
+      }
+    },
+    {
+      "name": "linux",
+      "displayName": "Build everything buildable on Linux",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "CMAKE_SYSTEM_NAME": "Linux",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/linux.cmake"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Linux"
+      }
+    },
     {
       "name": "pybind",
       "displayName": "Build pybindings exported in the wheel",

backends/cadence/aot/compiler.py

Lines changed: 1 addition & 1 deletion

@@ -123,7 +123,7 @@ def prepare_and_convert_pt2(
     assert isinstance(model_gm, torch.fx.GraphModule)
 
     # Prepare
-    prepared_model = prepare_pt2e(model_gm, quantizer)  # pyre-ignore[6]
+    prepared_model = prepare_pt2e(model_gm, quantizer)
 
     # Calibrate
     # If no calibration data is provided, use the inputs

backends/cadence/aot/quantizer/TARGETS

Lines changed: 1 addition & 1 deletion

@@ -9,6 +9,7 @@ python_library(
     ],
     deps = [
         "//caffe2:torch",
+        "//pytorch/ao:torchao",
     ],
 )
 
@@ -34,7 +35,6 @@ python_library(
         ":patterns",
        ":utils",
        "//caffe2:torch",
-       "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer_utils",
        ],
 )

backends/cadence/aot/quantizer/patterns.py

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@
 
 from torch import fx
 from torch._ops import OpOverload
-from torch.ao.quantization.quantizer import (
+from torchao.quantization.pt2e.quantizer import (
     DerivedQuantizationSpec,
     SharedQuantizationSpec,
 )

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 8 additions & 7 deletions

@@ -29,19 +29,20 @@
     is_annotated,
     no_outside_users,
 )
-from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
+
+from torch import fx
+
+from torchao.quantization.pt2e import HistogramObserver, MinMaxObserver
+from torchao.quantization.pt2e.quantizer import (
+    ComposableQuantizer,
+    DerivedQuantizationSpec,
     OperatorConfig,
     QuantizationAnnotation,
     QuantizationConfig,
     QuantizationSpec,
+    Quantizer,
 )
 
-from torch import fx
-
-from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
-from torch.ao.quantization.quantizer import DerivedQuantizationSpec, Quantizer
-from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
-
 
 act_qspec_asym8s = QuantizationSpec(
     dtype=torch.int8,

backends/cadence/aot/quantizer/utils.py

Lines changed: 1 addition & 1 deletion

@@ -14,13 +14,13 @@
 import torch
 from torch import fx
 from torch._ops import OpOverload
-from torch.ao.quantization import ObserverOrFakeQuantize
 
 from torch.fx import GraphModule
 from torch.fx.passes.utils.source_matcher_utils import (
     check_subgraphs_connected,
     SourcePartition,
 )
+from torchao.quantization.pt2e import ObserverOrFakeQuantize
 
 
 def quantize_tensor_multiplier(

backends/cadence/aot/remove_ops.py

Lines changed: 1 addition & 4 deletions

@@ -235,10 +235,7 @@ def call_operator(
         kwargs: dict[str, Argument],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        if op not in {
-            exir_ops.edge.aten.linalg_vector_norm.default,
-            exir_ops.edge.cadence.linalg_vector_norm.default,
-        }:
+        if op is not exir_ops.edge.aten.linalg_vector_norm.default:
             return super().call_operator(op, args, kwargs, meta)
 
         # If the op has three args or less, it can't be a nop

backends/cadence/aot/tests/test_remove_ops_passes.py

Lines changed: 1 addition & 4 deletions

@@ -467,10 +467,7 @@ def forward(self, x: torch.Tensor):
 
         # Expect the linalg_vector_norm op to be removed by the pass
         self.assertEqual(
-            count_node(graph_module, exir_ops.edge.aten.linalg_vector_norm.default)
-            + count_node(
-                graph_module, exir_ops.edge.cadence.linalg_vector_norm.default
-            ),
+            count_node(graph_module, exir_ops.edge.aten.linalg_vector_norm.default),
             0,
         )
 
backends/qualcomm/_passes/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from .expand_broadcast_tensor_shape import ExpandBroadcastTensorShape
2121
from .fixed_linear_keep_dim import FixedLinearKeepDim
2222
from .fold_qdq import FoldQDQ
23+
from .fuse_consecutive_cast import FuseConsecutiveCast
2324
from .fuse_consecutive_transpose import FuseConsecutiveTranspose
2425
from .i64_to_i32 import I64toI32
2526
from .insert_io_qdq import InsertIOQDQ
@@ -54,6 +55,7 @@
5455
ExpandBroadcastTensorShape,
5556
FixedLinearKeepDim,
5657
FoldQDQ,
58+
FuseConsecutiveCast,
5759
FuseConsecutiveTranspose,
5860
I64toI32,
5961
InsertIOQDQ,

backends/qualcomm/_passes/fuse_consecutive_cast.py

Lines changed: 116 additions & 0 deletions

@@ -0,0 +1,116 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+
+
+class FuseConsecutiveCast(ExportPass):
+    """
+    This pass fuses consecutive cast into one or none to reduce runtime
+    overhead.
+    To simplify the fuse logic, we ensure each cast node's output has at most 1 cast node
+    by cloning cast.
+    Example:
+        Before clone cast:
+            relu -> cast1 ─> cast2
+                      |──────> cast3
+
+        After clone cast:
+            relu ─> cast1 ──────> cast2
+               |───> cast4(new) ─> cast3
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.op_map = {
+            exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
+            exir_ops.edge.aten._to_copy.default,
+        }
+        self.visited = set()
+        self.nodes = []
+
+    def _canonicalize_cast(
+        self, graph_module: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        # replace all i64 cast nodes with i32 version
+        graph = graph_module.graph
+        for n in graph_module.graph.nodes:
+            if n.target in self.op_map and n.meta["val"].dtype == torch.int64:
+                users = list(n.users)
+                for user in users:
+                    # bypass graph output node to meet original convention
+                    if user.op == "output":
+                        continue
+
+                    with graph.inserting_after(n):
+                        cast_node = graph.create_node(
+                            "call_function",
+                            exir_ops.edge.aten._to_copy.default,
+                            n.args,
+                            kwargs={"dtype": torch.int32},
+                        )
+                        cast_node.meta = n.meta
+                        cast_node.meta["val"] = cast_node.meta["val"].to(torch.int32)
+                        user.replace_input_with(n, cast_node)
+
+        graph.eliminate_dead_code()
+
+        # clone nodes for future fusion
+        for n in graph_module.graph.nodes:
+            # make sure we're handling cast node instead of convert node
+            if n.target in self.op_map and n.kwargs.get("dtype", None) is not None:
+                users = [user for user in list(n.users) if user.target in self.op_map]
+                if len(users) > 1:
+                    for i in range(1, len(users)):
+                        with graph.inserting_after(n):
+                            clone_cast_node = graph.create_node(
+                                "call_function",
+                                exir_ops.edge.aten._to_copy.default,
+                                n.args,
+                                kwargs=n.kwargs,
+                            )
+                        clone_cast_node.meta = n.meta
+                        users[i].replace_input_with(n, clone_cast_node)
+
+    def _traverse(self, node):
+        if node in self.visited or node.target not in self.op_map:
+            return
+
+        self.nodes.append(node)
+        self.visited.add(node)
+        next_users = [n for n in list(node.users) if n.target in self.op_map]
+
+        assert (
+            len(next_users) <= 1
+        ), "Each cast node should have at most 1 cast output node after _clone_cast"
+        if not next_users:
+            return
+        else:
+            self._traverse(list(node.users)[0])
+
+    def _fuse(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        for n in graph_module.graph.nodes:
+            self._traverse(n)
+            # TODO: how to handle following scenario (won't happen for quantized graph)
+            # fp -> to(i32) -> to(fp)
+            if len(self.nodes) > 1:
+                input_node, output_node = self.nodes[0], self.nodes[-1]
+                output_node.replace_input_with(output_node.args[0], input_node.args[0])
+
+            # clear current stack
+            self.nodes = []
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        self._canonicalize_cast(graph_module)
+        self._fuse(graph_module)
+        graph_module.recompile()
+        dead_code_elimination_pass(graph_module)
+        return PassResult(graph_module, True)
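
The fusion above relies on consecutive casts being equivalent to a single cast to the final dtype, which holds for the integer cast chains produced by quantized graphs (the TODO in the pass flags fp -> i32 -> fp chains as the unhandled case). A standalone sketch of that equivalence, not taken from the commit:

import torch

# Two consecutive casts on an integer tensor collapse to one cast to the
# final dtype, which is what FuseConsecutiveCast exploits.
x = torch.randint(-128, 127, (4,), dtype=torch.int8)

chained = x.to(torch.int32).to(torch.int64)  # cast1 -> cast2
fused = x.to(torch.int64)                    # single fused cast

assert torch.equal(chained, fused)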

backends/qualcomm/_passes/i64_to_i32.py

Lines changed: 29 additions & 0 deletions

@@ -31,6 +31,14 @@ class I64toI32(ExportPass):
         exir_ops.edge.aten.full.default,
         exir_ops.edge.aten.scalar_tensor.default,
     }
+    # This dict is to ensure that the input of the OPs are int64 due to Pytorch restrictions.
+    # For example, scatter op can only accept args[2], the index, as int64.
+    # Key: Ops to cast input to i64
+    # Value: The args' indices to add casting op
+    I64_IN_OPS = {
+        exir_ops.edge.aten.gather.default: [2],
+        exir_ops.edge.aten.scatter.src: [2],
+    }
     copy_op = exir_ops.edge.aten._to_copy.default
 
     def __init__(
@@ -141,11 +149,32 @@ def _cast_constant_to_int32(self, graph_module: torch.fx.GraphModule):
                 n.replace_all_uses_with(to_dst_node)
                 to_dst_node.args = (n,)
 
+    def _cast_op_args_to_i64(self, graph_module: torch.fx.GraphModule):
+        # input will be cast to i32 during call_operator dtype propogation
+        # insert i64 cast node to prevent PyTorch's operator validation failure
+        for node in graph_module.graph.nodes:
+            if node.target in self.I64_IN_OPS:
+                with graph_module.graph.inserting_before(node):
+                    arg_indices = self.I64_IN_OPS[node.target]
+                    for arg_index in arg_indices:
+                        input_node = node.args[arg_index]
+                        cast_i64_node = graph_module.graph.create_node(
+                            "call_function",
+                            self.copy_op,
+                            (input_node,),
+                            {"dtype": torch.int64},
+                        )
+                        cast_i64_node.meta["val"] = node.meta["val"].to(torch.int64)
+                        args_list = list(node.args)
+                        args_list[arg_index] = cast_i64_node
+                        node.args = tuple(args_list)
+
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         # Record original output dtype to ensure that if user expects int64 as output,
         # convert the output back to int64 if it is casted from int64->int32.
         self._record_original_output_dtype(graph_module)
         self._cast_constant_to_int32(graph_module)
+        self._cast_op_args_to_i64(graph_module)
         graph_module = super().call(graph_module).graph_module
         self._preserve_output_dtype(graph_module)
         graph_module.recompile()
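
The I64_IN_OPS table above exists because PyTorch validates index dtypes for ops like gather and scatter: the index argument (args[2]) must be int64, so the pass re-inserts an int64 cast even though the rest of the graph runs in int32. A small illustration of that restriction, assumed rather than taken from the commit:

import torch

src = torch.arange(6.0).reshape(2, 3)
idx32 = torch.tensor([[0, 2, 1]], dtype=torch.int32)

try:
    torch.gather(src, 1, idx32)  # PyTorch rejects non-int64 index tensors
except RuntimeError as err:
    print("int32 index rejected:", err)

# Casting the index back to int64 mirrors what _cast_op_args_to_i64 does by
# inserting a _to_copy(dtype=torch.int64) node in front of such ops.
print(torch.gather(src, 1, idx32.to(torch.int64)))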
