
Commit a3f792a

pytorchbot and SS-JIA authored
[ET-VK] Add pass to remove copy ops (#7326)
## Context

This diff prepares Vulkan to handle dim order operators; for more context, see #4873.

Since Vulkan has its own internal representation of memory layout, these ops are handled by simply removing explicit memory layout transition operators from the graph and letting the memory metadata tagging pass insert the necessary memory layout transitions. A new pass is added to remove such operators, largely based on QNN's `RemoveRedundancy` pass.

Differential Revision: [D67180898](https://our.internmc.facebook.com/intern/diff/D67180898/)

ghstack-source-id: 258092214
Pull Request resolved: #7325

Co-authored-by: Stephen Jia <[email protected]>
1 parent 47c2f2e commit a3f792a
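
To make the mechanism concrete, here is a toy illustration (not part of the commit) of the kind of node the new pass eliminates: an explicit `clone` is redundant for Vulkan because the backend decides memory layout on its own.

```python
# Toy illustration (not from this commit): a graph containing torch.clone,
# one of the ops RemoveRedundantOpsTransform treats as redundant.
import torch

class CloneThenAdd(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        y = torch.clone(x)  # explicit copy; Vulkan manages layout internally
        return y + 1

gm = torch.fx.symbolic_trace(CloneThenAdd())
print(gm.graph)
# Before the pass: x -> clone -> add -> output.
# After the pass: uses of the clone node are rewired to x and the clone is
# eliminated as dead code, leaving x -> add -> output.
```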

File tree: 5 files changed (+91, -2 lines)


backends/vulkan/_passes/TARGETS

Lines changed: 14 additions & 0 deletions
```diff
@@ -43,6 +43,19 @@ runtime.python_library(
     ],
 )
 
+runtime.python_library(
+    name = "remove_redundant_ops",
+    srcs = ["remove_redundant_ops.py"],
+    visibility = [
+        "//executorch/backends/...",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+    ],
+)
+
 runtime.python_library(
     name = "tag_memory_meta_pass",
     srcs = ["tag_memory_meta_pass.py"],
@@ -71,6 +84,7 @@ runtime.python_library(
         ":insert_prepack_nodes",
         ":int4_weight_only_quantizer",
         ":remove_local_scalar_dense",
+        ":remove_redundant_ops",
         ":tag_memory_meta_pass"
     ]
 )
```

backends/vulkan/_passes/__init__.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -5,11 +5,15 @@
 from executorch.backends.vulkan._passes.remove_local_scalar_dense_ops import (
     RemoveLocalScalarDenseOpsTransform,
 )
+from executorch.backends.vulkan._passes.remove_redundant_ops import (
+    RemoveRedundantOpsTransform,
+)
 from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass
 
 __all__ = [
     "insert_prepack_nodes",
     "VkInt4WeightOnlyQuantizer",
     "RemoveLocalScalarDenseOpsTransform",
+    "RemoveRedundantOpsTransform",
     "TagMemoryMetaPass",
 ]
```
backends/vulkan/_passes/remove_redundant_ops.py

Lines changed: 69 additions & 0 deletions
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from typing import Set, Union

import torch
from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.dialects.edge._ops import EdgeOpOverload
from executorch.exir.pass_base import ExportPass, PassResult
from executorch.exir.passes import dead_code_elimination_pass

OpType = Union[str, torch._ops.OpOverload, EdgeOpOverload]


class RemoveRedundantOpsTransform(ExportPass):
    """
    Trim certain operators to reduce unnecessary overhead.
    """

    redundant_ops: Set[OpType] = {
        torch.clone,
        torch.ops.aten.clone.default,
        exir_ops.edge.aten.clone.default,
        torch.ops.aten.alias.default,
        exir_ops.edge.aten.alias.default,
        exir_ops.edge.aten.lift_fresh_copy.default,
        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
    }

    def __init__(self) -> None:
        super(RemoveRedundantOpsTransform, self).__init__()

    def _should_remove(self, node: torch.fx.Node) -> bool:
        if node.target in self.redundant_ops:
            return True

        # Only remove _to_copy if the dtype does not change; in that case the
        # node is purely a memory format change, which the backend handles
        # internally.
        if (
            node.target == exir_ops.edge.aten._to_copy.default
            or node.target == torch.ops.aten._to_copy.default
        ):
            src_dtype = node.meta["val"].dtype
            # pyre-ignore
            dst_dtype = node.args[0].meta["val"].dtype
            return src_dtype == dst_dtype

        return False

    def _remove(self, graph_module: torch.fx.GraphModule) -> None:
        for node in graph_module.graph.nodes:
            if not self._should_remove(node):
                continue

            with graph_module.graph.inserting_after(node):
                node.replace_all_uses_with(node.args[0])

        graph_module.graph.eliminate_dead_code()

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        self._remove(graph_module)
        graph_module.recompile()
        dead_code_elimination_pass(graph_module)
        return PassResult(graph_module, True)
```
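
As a minimal usage sketch (the toy module below is hypothetical, not part of the commit), the pass can be applied directly to a traced `GraphModule`:

```python
# Minimal usage sketch: apply RemoveRedundantOpsTransform to a traced module
# and check that the clone node is gone. The CloneThenAdd module is a toy.
import torch
from executorch.backends.vulkan._passes import RemoveRedundantOpsTransform

class CloneThenAdd(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.clone(x) + 1

gm = torch.fx.symbolic_trace(CloneThenAdd())
result = RemoveRedundantOpsTransform()(gm)  # PassBase.__call__ invokes call()
gm = result.graph_module

# The add now consumes the placeholder directly; no clone node remains.
assert all(node.target is not torch.clone for node in gm.graph.nodes)
```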

backends/vulkan/op_registry.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -228,6 +228,8 @@ def update_features_impl(op: OpKey):
         exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
         exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
         exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+        # dim order copy operator will be removed; memory layout is handled internally
+        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
     ]
 )
 def register_ephemeral_op(features: OpFeatures):
```

backends/vulkan/vulkan_preprocess.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -17,11 +17,11 @@
 from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass
 from executorch.backends.transforms.fuse_dequant_linear import FuseDequantLinearPass
 from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform
-from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
 
 from executorch.backends.vulkan._passes import (
     insert_prepack_nodes,
     RemoveLocalScalarDenseOpsTransform,
+    RemoveRedundantOpsTransform,
     TagMemoryMetaPass,
 )
 
@@ -143,7 +143,7 @@ def preprocess(  # noqa: C901
     program = apply_passes(
         program,
         [
-            RemoveCloneOpsTransform(),
+            RemoveRedundantOpsTransform(),
             AddmmToLinearTransform(),
             FuseDequantLinearPass(),
             FuseViewCopyTransform(),
```
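
For orientation, here is a minimal sketch of how a pass list like the one above is threaded through in sequence; the loop below is an illustration in the spirit of `apply_passes`, not the actual helper.

```python
# Sketch (assumed helper, for illustration only): run ExportPass-style passes
# in order, threading the possibly-new GraphModule through each PassResult.
from typing import List

import torch
from executorch.exir.pass_base import ExportPass

def run_passes_sketch(
    gm: torch.fx.GraphModule, passes: List[ExportPass]
) -> torch.fx.GraphModule:
    for p in passes:
        result = p(gm)  # PassBase.__call__ dispatches to p.call(gm)
        if result is not None:
            gm = result.graph_module
    return gm
```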
