
Commit 21baa73

tmp
1 parent 57bda67 commit 21baa73

17 files changed: 151 additions & 72 deletions

backends/qualcomm/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -323,6 +323,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
     pybind11::module
     pybind11::lto
     wrappers
+    qnn_schema
     qnn_executorch_logging
     qnn_executorch_header
   )

backends/qualcomm/builders/node_visitor.py

Lines changed: 13 additions & 14 deletions
@@ -26,7 +26,7 @@
     # Note that there is no int64 tensor data type in Qnn.
     torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UNDEFINED,
     torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_8,
-    QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
+    torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
 }
 QNN_TENSOR_TYPE_MAP = {
     torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
@@ -35,7 +35,7 @@
     torch.int32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_32,
     torch.int64: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_64,
     torch.uint8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_8,
-    QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16,
+    torch.uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16,
     float: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
 }

@@ -164,31 +164,26 @@ def get_quant_encoding_conf(
             else node.meta["quant_attrs"]
         )
         if quant_attrs["encoding"] in PER_CHANNEL_ENCODING:
-            print(f"[Hutton define_tensor] {node.name} {quant_attrs['scales']}, {-quant_attrs['zero_points']}")
             return self.make_qnn_per_channel_config(node, quant_attrs)
-        print(f"[Hutton define_tensor] {node.name} {quant_attrs['scale']}, {-quant_attrs['zero_point']}")
         return self.make_qnn_per_tensor_config(quant_attrs)

     def get_quant_tensor_value(
-        self, tensor: torch.Tensor, quant_attrs: Dict, dtype, bitwidth
+        self, tensor: torch.Tensor, quant_attrs: Dict, dtype, quant_configs
     ) -> torch.Tensor:
         if quant_attrs["encoding"] in PER_TENSOR_ENCODING:
             scale = quant_attrs["scale"]
             zero_point = quant_attrs["zero_point"]
+
         else: # per channel case
             scale = quant_attrs["scales"]
             zero_point = quant_attrs["zero_points"]

         # To bypass torch.uint16 quantization is not supported
-        dtype = (
-            torch.int32
-            if dtype == PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_16
-            else quant_attrs["dtype"]
-        )
-
+        dtype = quant_configs["dtype"]
+        print(f"[Hutton get_quant_tensor_value] tensor {tensor}")
         tensor = tensor.div(scale).add(zero_point).round().to(dtype)
         # Make the backends access data correctly
-        if bitwidth == 4:
+        if quant_configs.get("bitwidth") == 4:
             mask = torch.full(tensor.size(), 0x0F, dtype=torch.int8)
             tensor = torch.bitwise_and(mask, tensor)
         return tensor
@@ -237,11 +232,12 @@ def get_data_type(
             <= torch.iinfo(torch.int16).max - torch.iinfo(torch.int16).min
         ):
             if unsigned:
-                quant_config["dtype"] = QNN_uint16
+                quant_config["dtype"] = torch.uint16
             else:
                 quant_config["dtype"] = torch.int16
             return QNN_QUANT_TYPE_MAP[quant_config["dtype"]]
         else:
+            print(f"[Hutton] QQ {tensor}")
             return QNN_TENSOR_TYPE_MAP[tensor.dtype]

     def define_custom_tensor_wrapper(
@@ -312,6 +308,7 @@ def define_tensor(
         )
         dtype = self.get_data_type(tensor, quant_configs, is_tensor)
         if isinstance(tensor, torch._subclasses.fake_tensor.FakeTensor):
+            print(f"[Hutton fake_tensor] node {node} dtype {dtype} quant_configs[dtype] {quant_configs.get('dtype')}")
             tensor_wrapper = PyQnnWrapper.TensorWrapper(
                 tensor_name,
                 tensor_type,
@@ -329,8 +326,10 @@ def define_tensor(
                 tensor,
                 node.meta["quant_attrs"],
                 dtype,
-                quant_configs.get("bitwidth"),
+                quant_configs,
             )
+            print(f"[Hutton scalar] node {node}: dtype {dtype} quant_configs[dtype] {quant_configs['dtype']}")
+
             tensor_wrapper = PyQnnWrapper.TensorWrapper(
                 tensor_name,
                 tensor_type,

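Note (not part of the diff): a minimal sketch of the div/add/round/to chain that get_quant_tensor_value applies, using made-up per-tensor scale/zero_point values and int32 as the storage dtype standing in for whatever quant_configs["dtype"] resolves to:

import torch

# Hypothetical per-tensor quantization parameters mirroring quant_attrs.
scale, zero_point = 0.05, 32768
storage_dtype = torch.int32  # placeholder for quant_configs["dtype"]

fp_tensor = torch.tensor([-1.0, 0.0, 0.5, 1.0])

# q = round(x / scale + zero_point), then cast to the storage dtype.
q_tensor = fp_tensor.div(scale).add(zero_point).round().to(storage_dtype)
print(q_tensor)  # tensor([32748, 32768, 32778, 32788], dtype=torch.int32)
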
backends/qualcomm/builders/op_mean_dim.py

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ def define_node(
     ) -> PyQnnWrapper.PyQnnOpWrapper:

         input_node = node.args[0]
-        print(f"[Hutton] {node.name} {node.meta}")
         input_tensor = self.get_tensor(input_node, node)
         input_tensor_wrapper = self.define_tensor(
             input_node,

backends/qualcomm/builders/op_mul.py

Lines changed: 0 additions & 1 deletion
@@ -25,7 +25,6 @@ def define_node(
         node: torch.fx.Node,
         nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
     ) -> PyQnnWrapper.PyQnnOpWrapper:
-        print(f"[Hutton] {node.name} {node.meta}")
         out_tensor = self.get_tensor(node, node)
         output_tensor_wrapper = self.define_tensor(
             node,

backends/qualcomm/partition/common_defs.py

Lines changed: 3 additions & 0 deletions
@@ -14,6 +14,9 @@
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.index_put.default,
+    exir_ops.edge.aten.embedding.default,
+    # exir_ops.edge.aten.addmm.default,
+    # exir_ops.edge.aten.mm.default,
     # exir_ops.edge.aten.mul.Tensor,
     # exir_ops.edge.aten.sub.Tensor,
     # exir_ops.edge.aten.add.Tensor,

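Note (not part of the diff): a hypothetical sketch of how a denylist such as not_supported_operator is consulted during partitioning; the helper name is illustrative, not an API from this repo. Adding aten.embedding.default here keeps embedding nodes out of QNN partitions so they fall back to the CPU path:

def rejected_by_denylist(node, not_supported_operator) -> bool:
    # e.g. exir_ops.edge.aten.embedding.default is now rejected
    return node.target in not_supported_operator
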
backends/qualcomm/partition/qnn_partitioner.py

Lines changed: 11 additions & 13 deletions
@@ -6,7 +6,6 @@
 import copy
 from typing import Any, Dict, List

-from executorch.examples.models.llama2.llama_transformer import RMSNorm
 import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager
 import torch
 from executorch.backends.qualcomm.builders import node_visitor
@@ -28,7 +27,7 @@

 from .common_defs import allow_list_operator, not_supported_operator

-
+test = 0
 class QnnOperatorSupport(OperatorSupportBase):
     def __init__(
         self,
@@ -54,8 +53,8 @@ def __init__(
         self.qnn_manager = PyQnnManager.QnnManager(
             generate_qnn_executorch_option(compiler_specs)
         )
-        self.discard_modules = set([RMSNorm])
-
+        # from executorch.examples.models.llama2.llama_transformer import RMSNorm
+        self.discard_modules = ["executorch.examples.models.llama2.llama_transformer.RMSNorm"] #["executorch.examples.models.llama2.llama_transformer.RMSNorm"] # []
         self.qnn_manager.Init()

     def is_node_supported(self, _, node: torch.fx.Node) -> bool:
@@ -64,15 +63,14 @@ def is_node_supported(self, _, node: torch.fx.Node) -> bool:

         if node.target in allow_list_operator:
             return True
-        # if "nn_module_stack" in node.meta:
-        #     module_values_list = list(node.meta["nn_module_stack"].values())
-        #     owning_module = module_values_list[-1][1]
-        #     if owning_module in self.discard_modules:
-        #         print(f"[QNN Partitioner Op Support]: {node.name} | Skipped since RMS norm")
-        #         return False
-        # if "quant_attrs" in node.meta and node.meta['quant_attrs']['scale'] > 1:
-        #     print(f"[QNN Partitioner Op Support]: {node.name} | Skipped since scale is greater than 1")
-        #     return False
+        global test
+        test += 1
+        if "nn_module_stack" in node.meta and test < 6:
+            module_values_list = list(node.meta["nn_module_stack"].values())
+            owning_module = module_values_list[-1][1]
+            if owning_module in self.discard_modules:
+                print(f"[QNN Partitioner Op Support]: {node.name} | Skipped since RMS norm")
+                return False
         if self.skip_node_id_set is not None and node.name in self.skip_node_id_set:
             print(f"[QNN Partitioner Op Support]: {node.target.__name__} | Skipped")
             return False

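Note (not part of the diff): a sketch of how the nn_module_stack lookup in is_node_supported resolves the owning module; the helper name is illustrative, and whether the stored entry is a class object or a dotted string like "executorch.examples.models.llama2.llama_transformer.RMSNorm" depends on the torch/export version:

def owning_module_of(node):
    # nn_module_stack maps a module path to a (qualified_name, module_type) pair;
    # the last entry is the innermost module that produced this node.
    stack = node.meta.get("nn_module_stack", {})
    if not stack:
        return None
    return list(stack.values())[-1][1]
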
backends/qualcomm/qnn_preprocess.py

Lines changed: 2 additions & 1 deletion
@@ -52,7 +52,8 @@ def preprocess(

         pass_result = qnn_compiler_passes(edge_program.graph_module)
         assert pass_result is not None
-
+        # from executorch.backends.qualcomm.utils.utils import draw_graph
+        # draw_graph("qnn_preprocess", ".", pass_result.graph_module)
         enable_tensor_dump = qnn_manager.IsTensorDump()
         nodes_to_wrappers = {}
         node_visitors = get_node_visitors(

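Note (not part of the diff): if the commented-out draw_graph hook is not available, torch.fx can dump the same graph as a table; a small sketch, assuming the tabulate package is installed:

import torch.fx

def dump_graph(graph_module: torch.fx.GraphModule) -> None:
    # Prints opcode, name, target, args and kwargs for every node.
    graph_module.graph.print_tabular()
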
backends/qualcomm/quantizer/quantizer.py

Lines changed: 12 additions & 7 deletions
@@ -20,6 +20,7 @@
 from torch._ops import OpOverload
 from torch.ao.quantization.quantizer import Quantizer
 from torch.fx import GraphModule
+from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix

 from .utils import (
     get_16a4w_qnn_ptq_config,
@@ -75,12 +76,14 @@ def _annotate(self, gm: GraphModule) -> None:
         self.discard_modules = set([RMSNorm])
         for node in gm.graph.nodes:
             if node.name in self.discard_nodes:
+                print(f"[Hutton quantizer.py] discard nodes {node.name}")
                 continue
-            if "nn_module_stack" in node.meta:
-                module_values_list = list(node.meta["nn_module_stack"].values())
-                owning_module = module_values_list[-1][1]
-                if owning_module in self.discard_modules:
-                    continue
+            # if "nn_module_stack" in node.meta:
+            #     module_values_list = list(node.meta["nn_module_stack"].values())
+            #     owning_module = module_values_list[-1][1]
+            #     if owning_module in self.discard_modules:
+            #         print(f"[Hutton quantizer.py] discard modules {node.name}")
+            #         continue
             quant_config = self._get_quant_config(node.target)
             if quant_config:
                 OP_ANNOTATOR[node.target](node, quant_config)
@@ -207,8 +210,10 @@ def _lift_constant_scalar_operands(self, gm: torch.fx.GraphModule) -> None:

             if non_const_arg is None or const_arg is None:
                 continue
-
-            tensor_constant = torch.tensor([const_arg])
+            if type(const_arg) is int:
+                tensor_constant = torch.tensor([const_arg], dtype=torch.int32)
+            else:
+                tensor_constant = torch.tensor([const_arg])
             tensor_constant_name = get_new_attr_name_with_prefix("_tensor_constant_")(
                 gm
             )

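Note (not part of the diff): why the new int branch in _lift_constant_scalar_operands matters — a bare Python int becomes an int64 tensor by default, and QNN has no int64 tensor data type (see the node_visitor comment above), so integer scalars are lifted as int32:

import torch

const_arg = 3
print(torch.tensor([const_arg]).dtype)                     # torch.int64 (default)
print(torch.tensor([const_arg], dtype=torch.int32).dtype)  # torch.int32
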
backends/qualcomm/quantizer/utils.py

Lines changed: 2 additions & 2 deletions
@@ -120,7 +120,7 @@ def get_16a4w_qnn_ptq_config() -> QuantizationConfig:
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
         qscheme=torch.per_tensor_affine,
-        observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args),
+        observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args),
     )

     weight_quantization_spec = QuantizationSpec(
@@ -157,7 +157,7 @@ def get_default_16bit_qnn_ptq_config() -> QuantizationConfig:
         quant_min=torch.iinfo(torch.uint16).min,
         quant_max=torch.iinfo(torch.uint16).max,
         qscheme=torch.per_tensor_affine,
-        observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args),
+        observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args),
     )

     weight_quantization_spec = QuantizationSpec(

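Note (not part of the diff): the swap from MovingAverageMinMaxObserver to MinMaxObserver changes how activation ranges are calibrated. A small comparison sketch with made-up calibration batches — MinMaxObserver keeps the absolute min/max seen, while the moving-average variant smooths them and can under-cover outliers:

import torch
from torch.ao.quantization.observer import MinMaxObserver, MovingAverageMinMaxObserver

x1 = torch.tensor([0.0, 1.0])
x2 = torch.tensor([0.0, 10.0])

minmax = MinMaxObserver()
moving = MovingAverageMinMaxObserver(averaging_constant=0.5)
for x in (x1, x2):
    minmax(x)
    moving(x)

print(minmax.max_val)  # tensor(10.) -- full observed range
print(moving.max_val)  # tensor(5.5) -- averaged, smaller than the true max
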
backends/qualcomm/runtime/backends/QnnBackendCache.cpp

Lines changed: 3 additions & 2 deletions
@@ -87,7 +87,8 @@ QnnBackendCache::QnnBackendCache(
     state_ = SERIALIZE;
     QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE.");
     return;
-  } else {
+  }
+  /*else {
     // TODO: need fix on this since qnn context binary could somehow
     // pass the check of flatbuffer verifier
     // check if context binary came from flatbuffer
@@ -100,7 +101,7 @@ QnnBackendCache::QnnBackendCache(
       state_ = ONLINE_PREPARE;
       return;
     }
-  }
+  }*/

   if (qnn_sys_impl_.Load() != Error::Ok) {
     QNN_EXECUTORCH_LOG_ERROR(

backends/qualcomm/tests/models.py

Lines changed: 26 additions & 1 deletion
@@ -461,7 +461,7 @@ def __init__(self):
         super().__init__()

     def forward(self, x, y):
-        return torch.mul(x, y)
+        return x*y


 class MulConstantFloat(torch.nn.Module):
@@ -490,6 +490,16 @@ def forward(self, x):
         return out1


+class MulQQ(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(2))
+
+    def forward(self, x):
+        output = x
+        return output * self.weight
+
+
 class MultiheadAttention(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -588,6 +598,21 @@ def forward(self, x):
         return torch.rsqrt(x)


+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(dim))
+
+    # def forward(self, x):
+    #     return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps)
+    def _norm(self, x):
+        return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
 class ScaledDotProductAttention(torch.nn.Module):
     def __init__(self):
         super().__init__()

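Note (not part of the diff): a quick usage sketch for the RMSNorm test module added above, assuming it is importable from backends/qualcomm/tests/models.py; with the default all-ones weight the output equals plain RMS normalization over the last dimension:

import torch
from executorch.backends.qualcomm.tests.models import RMSNorm  # module added above

dim = 4
x = torch.randn(2, dim)
rms_norm = RMSNorm(dim)  # weight starts as torch.ones(dim)
y = rms_norm(x)

# Reference computation matching _norm() with the default eps.
expected = x * torch.rsqrt((x * x).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(y, expected)
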