pytorch
diff --git a/backends/qualcomm/builders/__init__.py b/backends/qualcomm/builders/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py
Lines changed: 5 additions & 3 deletions
diff --git a/backends/qualcomm/builders/op_embedding.py b/backends/qualcomm/builders/op_embedding.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/qualcomm/builders/op_skip_ops.py b/backends/qualcomm/builders/op_skip_ops.py
Lines changed: 3 additions & 1 deletion
diff --git a/backends/qualcomm/builders/op_split.py b/backends/qualcomm/builders/op_split.py
Lines changed: 85 additions & 0 deletions
diff --git a/backends/qualcomm/builders/qnn_constants.py b/backends/qualcomm/builders/qnn_constants.py
Lines changed: 7 additions & 0 deletions
diff --git a/backends/qualcomm/partition/common_defs.py b/backends/qualcomm/partition/common_defs.py
Lines changed: 2 additions & 1 deletion
diff --git a/backends/qualcomm/partition/qnn_partitioner.py b/backends/qualcomm/partition/qnn_partitioner.py
Lines changed: 2 additions & 1 deletion
diff --git a/backends/qualcomm/passes/fuse_consecutive_transpose.py b/backends/qualcomm/passes/fuse_consecutive_transpose.py
Lines changed: 69 additions & 0 deletions
diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py
Lines changed: 7 additions & 1 deletion
@@ -41,6 +41,7 @@
     op_skip_ops,
     op_slice_copy,
     op_softmax,
+    op_split,
     op_squeeze,
     op_sub,
     op_tanh,
@@ -85,6 +86,7 @@
     op_skip_ops,
     op_slice_copy,
     op_softmax,
+    op_split,
     op_squeeze,
     op_sub,
     op_tanh,
 
@@ -283,6 +283,7 @@ def define_tensor(
         nodes_to_wrappers: Dict[str, PyQnnWrapper.TensorWrapper],
         is_input_tensor: bool,
         node_name: str = None,
+        wrapper_idx: int = 0,
         is_tensor: bool = True,
     ) -> PyQnnWrapper.TensorWrapper:
         """
@@ -299,8 +300,9 @@ def define_tensor(
         if node_name is None:
             node_name = node.name
 
-        if node_name in nodes_to_wrappers:
-            return nodes_to_wrappers[node_name]
+        if cached := nodes_to_wrappers[node_name].get(wrapper_idx, None):
+            return cached
+
         tensor_name = node.name
         if is_graph_output(node):
             tensor_name = "output_" + tensor_name
@@ -341,7 +343,7 @@ def define_tensor(
                 tensor.detach().numpy(),
                 True,
             )
-        nodes_to_wrappers[node_name] = tensor_wrapper
+        nodes_to_wrappers[node_name][wrapper_idx] = tensor_wrapper
         return tensor_wrapper
 
     def define_node(
 
@@ -34,7 +34,7 @@ def define_node(
             weight_tensor,
             PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,
             nodes_to_wrappers,
-            is_input_tensor=False,
+            is_input_tensor=True,
         )
 
         indices_node = node.args[1]
 
@@ -46,5 +46,7 @@ def define_node(
             raise AssertionError(
                 f"Invalid number of index for {node.name }: {len(node.args[1])}"
             )
-        nodes_to_wrappers[node.name] = nodes_to_wrappers.get(node.args[0].name)
+        nodes_to_wrappers[node.name] = {
+            0: nodes_to_wrappers.get(node.args[0].name).get(node.args[1])
+        }
         return
@@ -0,0 +1,85 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import cast, Dict
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+import torch
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpSplit, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class Softmax(NodeVisitor):
+    """Node visitor that lowers ``aten.split_with_sizes.default`` to a QNN Split op.
+
+    NOTE(review): the class is called ``Softmax`` although it builds a Split
+    op (see ``target`` and ``OpSplit`` below). The name is left untouched so
+    that nothing referring to it breaks; consider renaming it in a follow-up.
+    """
+
+    target = ["aten.split_with_sizes.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[str, Dict[int, PyQnnWrapper.TensorWrapper]],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        """Build the Split op wrapper for ``node``.
+
+        Args:
+            node: the ``split_with_sizes`` node; ``args[0]`` is the input,
+                ``args[1]`` the list of chunk sizes and the optional
+                ``args[2]`` the split dimension (defaults to 0).
+            nodes_to_wrappers: cache of already defined tensor wrappers,
+                passed through to ``define_tensor``.
+
+        Returns:
+            The op wrapper with one input tensor, one output tensor per
+            chunk, and the ``axis`` / ``split_index`` parameters set.
+        """
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+            is_input_tensor=True,
+        )
+        split_input_tensors = [input_tensor_wrapper]
+
+        axis = 0 if len(node.args) < 3 else cast(int, node.args[2])
+        if axis < 0:
+            # normalize a negative dimension to its non-negative equivalent
+            axis = axis % len(input_tensor.shape)
+        if "axis_order" in node.meta:
+            # remap the dimension when the node carries a permuted axis order
+            axis = node.meta["axis_order"].index(axis)
+
+        # The edge op describes the chunks by their sizes, whereas the op
+        # parameter holds the positions along ``axis`` at which to cut,
+        # e.g. sizes [1, 2, 3] -> split indices [1, 3].
+        # For size-1 chunks this equals the former arange(1, shape[axis]).
+        split_sizes = cast(list, node.args[1])
+        index = np.cumsum(split_sizes[:-1], dtype=np.uint32)
+        index_shape = [len(index)]
+
+        # one output tensor per chunk; wrapper_idx keeps the wrappers of the
+        # individual outputs apart under the same node name
+        split_output_tensors = []
+        for i in range(len(split_sizes)):
+            output_tensor = self.get_tensor(node, node, i)
+            output_tensor_wrapper = self.define_tensor(
+                node,
+                output_tensor,
+                PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+                nodes_to_wrappers,
+                is_input_tensor=False,
+                wrapper_idx=i,
+            )
+            split_output_tensors.append(output_tensor_wrapper)
+
+        split_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpSplit.op_name,
+        )
+        split_op.AddInputTensors(split_input_tensors)
+        split_op.AddOutputTensors(split_output_tensors)
+
+        split_op.AddScalarParam(
+            OpSplit.param_axis,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            {"data": np.uint32(axis)},
+        )
+        split_op.AddTensorParam(
+            OpSplit.param_split_index,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            len(index_shape),
+            index_shape,
+            index,
+            True,
+        )
+
+        return split_op
@@ -247,6 +247,13 @@ class OpSoftmax:
     param_beta: str = "beta"
 
 
+# String constants for the "Split" op: the op type name and the names of its
+# two parameters, as used when building the op wrapper for a split node.
+@dataclass(init=False, frozen=True)
+class OpSplit:
+    op_name: str = "Split"
+    # name of the scalar parameter that carries the split dimension
+    param_axis: str = "axis"
+    # name of the tensor parameter that carries the split positions
+    param_split_index: str = "split_index"
+
+
 @dataclass(init=False, frozen=True)
 class OpSqueeze:
     op_name: str = "Squeeze"
 
@@ -11,8 +11,9 @@
 not_supported_operator = [
     exir_ops.edge.aten.arange.start_step,
     exir_ops.edge.aten.clone.default,
-    exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.full.default,
+    exir_ops.edge.aten.index.Tensor,
+    exir_ops.edge.aten.index_put.default,
 ]
 
 allow_list_operator = [
 
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import copy
+from collections import defaultdict
 from typing import Any, Dict, List
 
 import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
@@ -49,7 +50,7 @@ def __init__(
             )
 
         self.skip_node_id_set = skip_node_id_set
-        self.nodes_to_wrappers = {}
+        self.nodes_to_wrappers = defaultdict(dict)
         self.qnn_manager = PyQnnManager.QnnManager(
             generate_qnn_executorch_option(compiler_specs)
         )
 
@@ -0,0 +1,69 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
+
+
+class FuseConsecutiveTranspose(ExportPass):
+    """
+    This pass fuses consecutive transpose / permute into one to reduce runtime
+    overhead
+
+    A chain is a run of ``permute_copy`` nodes in which every node except the
+    last has exactly one user. The chain is replaced by one ``permute_copy``
+    applied to the chain's input; the now unused nodes are removed by dead
+    code elimination in ``call``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # set of op targets that are treated as fusable permutes
+        self.op_map = {
+            exir_ops.edge.aten.permute_copy.default,
+        }
+        # nodes already assigned to some chain during the current run
+        self.visited = set()
+        # the chain currently being collected, in graph order
+        self.nodes = []
+
+    def _traverse(self, node):
+        """Collect the permute chain starting at ``node`` into ``self.nodes``."""
+        while node.op == "call_function" and node.target in self.op_map:
+            self.nodes.append(node)
+            self.visited.add(node)
+            # only follow the chain while the result has a single consumer
+            if len(node.users) != 1:
+                break
+            node = next(iter(node.users))
+
+    def _fuse(self, graph_module: torch.fx.GraphModule) -> None:
+        """Fuse every permute chain of ``graph_module`` in place."""
+        graph = graph_module.graph
+        # start from a clean state so that a reused pass instance does not
+        # keep references to nodes of a previously processed graph
+        self.visited.clear()
+        self.nodes = []
+        for n in graph.nodes:
+            if n in self.visited:
+                continue
+            self._traverse(n)
+            if len(self.nodes) > 1:
+                input_node, output_node = self.nodes[0].args[0], self.nodes[-1]
+                input_shape = input_node.meta["val"].shape
+                # compose the permutations: after the loop, axis_order[i] is
+                # the input axis that ends up at position i of the output
+                axis_order = list(range(len(input_shape)))
+                for node in self.nodes:
+                    axis_order = [axis_order[i] for i in node.args[1]]
+                with graph.inserting_after(input_node):
+                    permute_op = exir_ops.edge.aten.permute_copy.default
+                    permute_node = graph.create_node(
+                        "call_function", permute_op, (input_node, axis_order)
+                    )
+                    # iterate over a copy, the user set changes while rewiring
+                    users = output_node.users.copy()
+                    for user in users:
+                        user.replace_input_with(output_node, permute_node)
+                    # copy metadata (shallow copy, so the two nodes do not
+                    # share one dict)
+                    permute_node.meta = output_node.meta.copy()
+            # clear current stack
+            self.nodes = []
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        self._fuse(graph_module)
+        graph_module.recompile()
+        dead_code_elimination_pass(graph_module)
+        return PassResult(graph_module, True)
@@ -5,12 +5,16 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
+from collections import defaultdict
 from typing import final, List
 
 import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
 from executorch.backends.qualcomm.builders.node_visitor import get_node_visitors
 
 from executorch.backends.qualcomm.passes.convert_to_linear import ConvertToLinear
+from executorch.backends.qualcomm.passes.fuse_consecutive_transpose import (
+    FuseConsecutiveTranspose,
+)
 from executorch.backends.qualcomm.passes.insert_io_qdq import InsertIOQDQ
 from executorch.backends.qualcomm.passes.insert_requantize import InsertRequantize
 from executorch.backends.qualcomm.passes.layout_transform import LayoutTransform
@@ -47,14 +51,16 @@ def preprocess(
                 InsertRequantize(edge_program),
                 InsertIOQDQ(edge_program),
                 LayoutTransform(edge_program, insert_permute=True),
+                # please enable this when applying convert_linear_to_conv2d
+                # FuseConsecutiveTranspose(),
             ]
         )
 
         pass_result = qnn_compiler_passes(edge_program.graph_module)
         assert pass_result is not None
 
         enable_tensor_dump = qnn_manager.IsTensorDump()
-        nodes_to_wrappers = {}
+        nodes_to_wrappers = defaultdict(dict)
         node_visitors = get_node_visitors(
             edge_program, enable_tensor_dump=enable_tensor_dump
         )
Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@ def define_node(`
`34`	`34`	`weight_tensor,`
`35`	`35`	`PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_STATIC,`
`36`	`36`	`nodes_to_wrappers,`
`37`		`- is_input_tensor=False,`
	`37`	`+ is_input_tensor=True,`
`38`	`38`	`)`
`39`	`39`
`40`	`40`	`indices_node = node.args[1]`
Original file line number	Diff line number	Diff line change
`@@ -46,5 +46,7 @@ def define_node(`
`46`	`46`	`raise AssertionError(`
`47`	`47`	`f"Invalid number of index for {node.name }: {len(node.args[1])}"`
`48`	`48`	`)`
`49`		`- nodes_to_wrappers[node.name] = nodes_to_wrappers.get(node.args[0].name)`
	`49`	`+ nodes_to_wrappers[node.name] = {`
	`50`	`+ 0: nodes_to_wrappers.get(node.args[0].name).get(node.args[1])`
	`51`	`+ }`
`50`	`52`	`return`
Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@`
`4`	`4`	`# This source code is licensed under the BSD-style license found in the`
`5`	`5`	`# LICENSE file in the root directory of this source tree.`
`6`	`6`	`import copy`
	`7`	`+from collections import defaultdict`
`7`	`8`	`from typing import Any, Dict, List`
`8`	`9`
`9`	`10`	`import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager`
`@@ -49,7 +50,7 @@ def __init__(`
`49`	`50`	`)`
`50`	`51`
`51`	`52`	`self.skip_node_id_set = skip_node_id_set`
`52`		`- self.nodes_to_wrappers = {}`
	`53`	`+ self.nodes_to_wrappers = defaultdict(dict)`
`53`	`54`	`self.qnn_manager = PyQnnManager.QnnManager(`
`54`	`55`	`generate_qnn_executorch_option(compiler_specs)`
`55`	`56`	`)`