
Commit 57bda67

skip rms norm
1 parent 5e0e947 commit 57bda67

9 files changed: +104 −7 lines

backends/qualcomm/builders/node_visitor.py

Lines changed: 2 additions & 1 deletion

@@ -164,8 +164,9 @@ def get_quant_encoding_conf(
             else node.meta["quant_attrs"]
         )
         if quant_attrs["encoding"] in PER_CHANNEL_ENCODING:
+            print(f"[Hutton define_tensor] {node.name} {quant_attrs['scales']}, {-quant_attrs['zero_points']}")
             return self.make_qnn_per_channel_config(node, quant_attrs)
-
+        print(f"[Hutton define_tensor] {node.name} {quant_attrs['scale']}, {-quant_attrs['zero_point']}")
         return self.make_qnn_per_tensor_config(quant_attrs)
 
     def get_quant_tensor_value(
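
The two debug prints above dump the quantization parameters attached to a node before a per-channel or per-tensor QNN config is built. As a hedged illustration only (the field names come from the prints; the encoding labels and numbers below are invented), node.meta["quant_attrs"] carries one scale/zero_point pair in the per-tensor case and per-channel lists otherwise:

# Hedged illustration, not taken from the repo: field names from the prints above,
# values made up.
per_tensor_attrs = {
    "encoding": "per-tensor affine",   # not a member of PER_CHANNEL_ENCODING
    "scale": 0.05,                     # one scale for the whole tensor
    "zero_point": 128,                 # single offset (printed negated above)
}
per_channel_attrs = {
    "encoding": "per-channel affine",  # a member of PER_CHANNEL_ENCODING
    "scales": [0.04, 0.06, 0.05],      # one scale per output channel
    "zero_points": [0, 0, 0],          # one zero point per output channel
}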

backends/qualcomm/builders/op_mean_dim.py

Lines changed: 2 additions & 0 deletions

@@ -27,7 +27,9 @@ def define_node(
         node: torch.fx.Node,
         nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
     ) -> PyQnnWrapper.PyQnnOpWrapper:
+
         input_node = node.args[0]
+        print(f"[Hutton] {node.name} {node.meta}")
         input_tensor = self.get_tensor(input_node, node)
         input_tensor_wrapper = self.define_tensor(
             input_node,

backends/qualcomm/builders/op_mul.py

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ def define_node(
         node: torch.fx.Node,
         nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
     ) -> PyQnnWrapper.PyQnnOpWrapper:
+        print(f"[Hutton] {node.name} {node.meta}")
         out_tensor = self.get_tensor(node, node)
         output_tensor_wrapper = self.define_tensor(
             node,
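
Both builder changes (op_mean_dim.py above and op_mul.py here) only add temporary prints of node.meta while debugging which mean/mul nodes come out of the RMSNorm decomposition. A hedged, self-contained way to poke at the same metadata outside the builders, using a toy module (the Toy class and inputs below are illustrative, not from the repo):

import torch

class Toy(torch.nn.Module):
    def forward(self, x):
        return (x * 2.0).mean(dim=-1)

ep = torch.export.export(Toy(), (torch.randn(2, 3),))
for node in ep.graph_module.graph.nodes:
    if node.op == "call_function":
        # node.meta carries, among other things, the fake tensor ("val") and the
        # owning-module stack ("nn_module_stack") that other parts of this commit rely on.
        print(node.name, node.target, list(node.meta.keys()))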

backends/qualcomm/partition/common_defs.py

Lines changed: 6 additions & 0 deletions

@@ -14,6 +14,12 @@
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.index_put.default,
+    # exir_ops.edge.aten.mul.Tensor,
+    # exir_ops.edge.aten.sub.Tensor,
+    # exir_ops.edge.aten.add.Tensor,
+    # exir_ops.edge.aten.rsqrt.default,
+    # exir_ops.edge.aten.matmul.default,
+    # exir_ops.edge.aten.unsqueeze_copy.default,
 ]
 
 allow_list_operator = [
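
Several of the commented-out entries (mul, add, rsqrt) are exactly the elementwise ops an RMSNorm lowers to; enabling them in not_supported_operator would push those ops back to CPU for the whole model, which is much coarser than the module-scoped skip used elsewhere in this commit. For reference, a minimal eager-mode RMSNorm (a sketch, not the llama_transformer implementation) whose decomposition produces those ops:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # mean of squares -> rsqrt -> mul, plus the learned elementwise weight
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight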

backends/qualcomm/partition/qnn_partitioner.py

Lines changed: 11 additions & 1 deletion

@@ -6,6 +6,7 @@
 import copy
 from typing import Any, Dict, List
 
+from executorch.examples.models.llama2.llama_transformer import RMSNorm
 import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
 import torch
 from executorch.backends.qualcomm.builders import node_visitor
@@ -53,6 +54,7 @@ def __init__(
         self.qnn_manager = PyQnnManager.QnnManager(
             generate_qnn_executorch_option(compiler_specs)
         )
+        self.discard_modules = set([RMSNorm])
 
         self.qnn_manager.Init()
 
@@ -62,7 +64,15 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:
 
         if node.target in allow_list_operator:
             return True
-
+        # if "nn_module_stack" in node.meta:
+        #     module_values_list = list(node.meta["nn_module_stack"].values())
+        #     owning_module = module_values_list[-1][1]
+        #     if owning_module in self.discard_modules:
+        #         print(f"[QNN Partitioner Op Support]: {node.name} | Skipped since RMS norm")
+        #         return False
+        # if "quant_attrs" in node.meta and node.meta['quant_attrs']['scale'] > 1:
+        #     print(f"[QNN Partitioner Op Support]: {node.name} | Skipped since scale is greater than 1")
+        #     return False
         if self.skip_node_id_set is not None and node.name in self.skip_node_id_set:
             print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped")
             return False
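
The commented-out block relies on the nn_module_stack metadata that export attaches to each node: its values are (qualified name, module class) pairs, and the last entry is the innermost module that produced the node. A hedged helper distilled from that lookup (the function name is mine; the indexing pattern is the one in the diff):

from typing import Optional, Type

from torch import fx, nn

def owning_module_type(node: fx.Node) -> Optional[Type[nn.Module]]:
    """Return the innermost module class recorded in node.meta['nn_module_stack']."""
    stack = node.meta.get("nn_module_stack")
    if not stack:
        return None
    # Values are (qualified_name, module_class) pairs, ordered outermost -> innermost.
    return list(stack.values())[-1][1]

# Mirroring the commented-out check in is_node_supported:
#   if owning_module_type(node) in {RMSNorm}:
#       return False  # keep RMSNorm ops out of the QNN partition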

backends/qualcomm/quantizer/quantizer.py

Lines changed: 53 additions & 3 deletions

@@ -6,6 +6,7 @@
 from enum import IntEnum, unique
 from typing import Callable, Dict, Optional, Sequence, Set
 
+from executorch.examples.models.llama2.llama_transformer import RMSNorm
 import torch
 from executorch.backends.qualcomm.passes.convert_hardsigmoid import ConvertHardsigmoid
 from executorch.backends.qualcomm.passes.decompose_scaled_dot_product_attention import (
@@ -62,7 +63,7 @@ def __init__(self):
 
         self.custom_quant_annotations: Sequence[Callable] = []
         self.discard_nodes: Set[str] = set()
-
+        self.discard_modules: Set[torch.nn.Module] = set()
         self.use_per_channel_weight_quant_ops: Set[OpOverload] = set()
         # the weight quantized for activation 8 bits and 16 bits
         self.per_channel_weight_dtype: Dict = {
@@ -71,10 +72,15 @@ def __init__(self):
         }
 
     def _annotate(self, gm: GraphModule) -> None:
+        self.discard_modules = set([RMSNorm])
         for node in gm.graph.nodes:
             if node.name in self.discard_nodes:
                 continue
-
+            if "nn_module_stack" in node.meta:
+                module_values_list = list(node.meta["nn_module_stack"].values())
+                owning_module = module_values_list[-1][1]
+                if owning_module in self.discard_modules:
+                    continue
             quant_config = self._get_quant_config(node.target)
             if quant_config:
                 OP_ANNOTATOR[node.target](node, quant_config)
@@ -176,6 +182,50 @@ def set_per_channel_linear_quant(self, enable: bool) -> None:
             torch.ops.aten.linear.default,
         }
         self._update_per_channel_weight_quant_ops(linear_ops, enable)
+
+    def _lift_constant_scalar_operands(self, gm: torch.fx.GraphModule) -> None:
+        """
+        For a case like mul(x, 2), convert the scalar to a tensor.
+        """
+        for n in gm.graph.nodes:
+            if n.op != "call_function" or n.target not in (
+                torch.ops.aten.add.Tensor,
+                torch.ops.aten.sub.Tensor,
+                torch.ops.aten.mul.Tensor,
+                torch.ops.aten.mul.Scalar,
+                torch.ops.aten.rsub.Scalar,
+            ):
+                continue
+
+            const_arg = None
+            non_const_arg = None
+            for arg in n.args:
+                if isinstance(arg, torch.fx.Node):
+                    non_const_arg = arg
+                else:
+                    const_arg = arg
+
+            if non_const_arg is None or const_arg is None:
+                continue
+
+            tensor_constant = torch.tensor([const_arg])
+            tensor_constant_name = get_new_attr_name_with_prefix("_tensor_constant_")(
+                gm
+            )
+            gm.register_buffer(tensor_constant_name, tensor_constant)
+
+            fake_mode = n.meta["val"].fake_mode
+            with gm.graph.inserting_before(n):
+                get_attr_node = gm.graph.get_attr(tensor_constant_name)
+                get_attr_node.meta["val"] = fake_mode.from_tensor(tensor_constant)
+
+            if n.target == torch.ops.aten.rsub.Scalar:
+                n.args = (get_attr_node, non_const_arg) + n.args[2:]
+                n.target = torch.ops.aten.sub.Tensor
+            else:
+                n.args = (non_const_arg, get_attr_node) + n.args[2:]
+
+        gm.recompile()
 
     def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         model = RemoveClone()(model).graph_module
@@ -184,7 +234,7 @@ def transform_for_annotation(self, model: GraphModule) -> GraphModule:
         model = DecomposeScaledDotProductAttention()(model).graph_module
         model = DecomposeSilu()(model).graph_module
         model = ReplaceInfBuffer()(model).graph_module
-
+        # self._lift_constant_scalar_operands(model)
        return model
 
     def validate(self, model: GraphModule) -> None:
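
The new _lift_constant_scalar_operands pass (still unused here; its call in transform_for_annotation is commented out) rewrites binary ops whose second operand is a Python scalar so that both operands are graph tensors and can be observed and quantized. A hedged before/after sketch of its effect, with illustrative node and buffer names:

# Before the pass:
#     mul = torch.ops.aten.mul.Tensor(x, 2)       # literal scalar operand
#     r   = torch.ops.aten.rsub.Scalar(x, 1.0)    # rsub(x, c) == c - x
#
# After the pass (buffer names follow the "_tensor_constant_" prefix above):
#     _c0 = get_attr("_tensor_constant_0")        # buffer holding tensor([2])
#     mul = torch.ops.aten.mul.Tensor(x, _c0)     # tensor-only operands
#     _c1 = get_attr("_tensor_constant_1")        # buffer holding tensor([1.0])
#     r   = torch.ops.aten.sub.Tensor(_c1, x)     # rsub rewritten to sub with swapped args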

backends/qualcomm/utils/utils.py

Lines changed: 1 addition & 0 deletions

@@ -183,6 +183,7 @@ def generate_htp_compiler_spec(
     htp_options.performance_mode = QnnExecuTorchHtpPerformanceMode.kHtpBurst
     htp_options.use_multi_contexts = use_multi_contexts
     htp_options.use_dlbc = use_dlbc
+    # htp_options.use_conv_hmx = False
     return QnnExecuTorchBackendOptions(
         backend_type=QnnExecuTorchBackendType.kHtpBackend,
         htp_options=htp_options,

examples/qualcomm/llama2/llama.py

Lines changed: 21 additions & 1 deletion

@@ -24,6 +24,26 @@
 
 from sentencepiece import SentencePieceProcessor
 
+def annotate_rms_norm(gm: torch.fx.GraphModule) -> None:
+    from executorch.backends.qualcomm.quantizer.quantizer import (
+        get_default_16bit_qnn_ptq_config,
+    )
+    from executorch.backends.qualcomm.quantizer.utils import (
+        OP_ANNOTATOR,
+    )
+    from executorch.examples.models.llama2.llama_transformer import RMSNorm
+
+    quantization_config = get_default_16bit_qnn_ptq_config()
+    SUPPORTED_OPS = set(OP_ANNOTATOR.keys())
+    for node in gm.graph.nodes:
+        if "nn_module_stack" in node.meta:
+            module_values_list = list(node.meta["nn_module_stack"].values())
+            owning_module = module_values_list[-1][1]
+            if owning_module in [RMSNorm]:
+                if node.target in SUPPORTED_OPS:
+                    print(f"[16 bits quant] {node.name}")
+                    OP_ANNOTATOR[node.target](node, quantization_config)
+
 
 def create_device_inputs(example_inputs):
     # TODO: support batch inputs if necessary
@@ -194,7 +214,7 @@ def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
         args.model,
         f"{args.artifact}/{pte_filename}",
         partial(calibrate, inputs),
-        custom_annotations=(),
+        custom_annotations=(annotate_rms_norm,),
         quant_dtype=quant_dtype,
         per_channel_linear=per_channel_linear,
         shared_buffer=args.shared_buffer,
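
custom_annotations is a tuple of callables that build_executorch_binary applies to the captured GraphModule before calibration; annotate_rms_norm uses it to re-annotate the RMSNorm-owned nodes (which the quantizer now skips) with a 16-bit config. A hedged sketch of another callback in the same style, assuming the same module-level torch import llama.py already has; the function name and the choice of matmul as target are mine, not part of the commit:

def annotate_matmul_16bit(gm: torch.fx.GraphModule) -> None:
    from executorch.backends.qualcomm.quantizer.quantizer import (
        get_default_16bit_qnn_ptq_config,
    )
    from executorch.backends.qualcomm.quantizer.utils import OP_ANNOTATOR

    quantization_config = get_default_16bit_qnn_ptq_config()
    for node in gm.graph.nodes:
        # re-annotate every matmul the annotator knows about with the 16-bit config
        if node.target == torch.ops.aten.matmul.default and node.target in OP_ANNOTATOR:
            OP_ANNOTATOR[node.target](node, quantization_config)

# Passed the same way as annotate_rms_norm:
#   custom_annotations=(annotate_rms_norm, annotate_matmul_16bit),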

examples/qualcomm/scripts/utils.py

Lines changed: 7 additions & 1 deletion

@@ -196,6 +196,8 @@ def build_executorch_binary(
            raise AssertionError(f"No support for QuantDtype {quant_dtype}.")
 
        captured_model = torch._export.capture_pre_autograd_graph(model, inputs)
+        # from executorch.backends.qualcomm.utils.utils import draw_graph
+        # draw_graph("before_quantized", ".", captured_model)
        annotated_model = prepare_pt2e(captured_model, quantizer)
        print("Quantizing the model...")
        # calibration
@@ -205,7 +207,8 @@
        for data in dataset:
            annotated_model(*data)
        quantized_model = convert_pt2e(annotated_model)
-
+        # from executorch.backends.qualcomm.utils.utils import draw_graph
+        # draw_graph("after_quantized", ".", quantized_model)
        edge_prog = capture_program(quantized_model, inputs)
    else:
        edge_prog = capture_program(model, inputs)
@@ -261,6 +264,9 @@ def build_executorch_binary(
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    edge_prog_mgr = edge_prog_mgr.to_backend(qnn_partitioner)
+    # from executorch.backends.qualcomm.utils.utils import draw_graph
+    # draw_graph("after_pte_default_8a8w_rmsnorm_16a16w", ".", edge_prog_mgr.exported_program().graph_module)
+
    exec_prog_mgr = edge_prog_mgr.to_executorch(config=executorch_config)
    with open(f"{file_name}.pte", "wb") as file:
        file.write(exec_prog_mgr.buffer)
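
The commented-out draw_graph calls are debugging hooks for dumping the graph before quantization, after convert_pt2e, and after QNN partitioning. For orientation, the quantization path they bracket is the standard PT2E flow; a minimal, self-contained sketch of that flow under the same API assumptions as the code above (the wrapper function name is mine):

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

def quantize_pt2e(model, example_inputs, quantizer, calibration_data):
    # capture -> annotate/observe -> calibrate -> fold observers into q/dq ops
    captured = torch._export.capture_pre_autograd_graph(model, example_inputs)
    annotated = prepare_pt2e(captured, quantizer)
    for data in calibration_data:
        annotated(*data)
    return convert_pt2e(annotated)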
