Commit 5d396b3

qnn end to end flow

Pull Request resolved: #3038

Patch a few changes, including:

- support bool tensor type
- support fp16 and fix the 8a8w quantization
- add two unsupported ops (slice_scatter and index_put) in common_defs.py

stories model working end to end:

AOT:

fp16:

```
python -m examples.models.llama2.export_llama -kv --qnn -c stories110M.pt -p params.json
```

quantize:

```
python -m examples.models.llama2.export_llama -kv --qnn --pt2e_quantize qnn_8a8w -c stories110M.pt -p params.json
```

Runtime:

```
/llama_main --model_path=llama2_fp16_qnn_2.21.pte --tokenizer_path=tokenizer.bin --prompt="Once"
```

Output:

```
Once upon a time, there was a boy named Tim. Tim had a pet dog named Max. Max was a big, strong dog. They liked to play and run in the park. One day, Tim and Max went to the park to play. They saw a cat. The cat was up in a tree. Max wanted to help the cat. He tried to climb the tree, but he could not. Then, something unexpected happened. Max started to climb the tree! He was very strong. Max helped the cat come down. The cat was happy. Tim was so proud of his pet.
```

The stories model is too small and is therefore sensitive to quantization.

ghstack-source-id: 222473434
exported-using-ghexport

Differential Revision: [D56119738](https://our.internmc.facebook.com/intern/diff/D56119738/)

1 parent 01446ac

File tree: 3 files changed, +64 −10 lines

- backends/qualcomm/builders/node_visitor.py
- backends/qualcomm/partition/common_defs.py
- examples/models/llama2/export_llama_lib.py

backends/qualcomm/builders/node_visitor.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -29,6 +29,7 @@
     QNN_uint16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UFIXED_POINT_16,
 }
 QNN_TENSOR_TYPE_MAP = {
+    torch.bool: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
     torch.float32: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_FLOAT_32,
     torch.int8: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_8,
     torch.int16: PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_INT_16,
```
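For context on how this map is consumed: the node visitor looks up each torch dtype when it materializes QNN tensors, so any dtype missing from the map (as `torch.bool` was) cannot lower. A minimal sketch of that lookup; the helper `torch_dtype_to_qnn` and the string stand-ins for `PyQnnWrapper.Qnn_DataType_t` values are illustrative, not the backend's actual code:

```python
import torch

# String stand-ins for PyQnnWrapper.Qnn_DataType_t enum values (illustrative).
QNN_TENSOR_TYPE_MAP = {
    torch.bool: "QNN_DATATYPE_BOOL_8",  # the mapping this commit adds
    torch.float32: "QNN_DATATYPE_FLOAT_32",
    torch.int8: "QNN_DATATYPE_INT_8",
    torch.int16: "QNN_DATATYPE_INT_16",
}


def torch_dtype_to_qnn(dtype: torch.dtype) -> str:
    # Fail loudly for dtypes the backend cannot represent.
    if dtype not in QNN_TENSOR_TYPE_MAP:
        raise NotImplementedError(f"no QNN data type for {dtype}")
    return QNN_TENSOR_TYPE_MAP[dtype]


mask = torch.ones(4, dtype=torch.bool)  # e.g. a boolean attention mask
print(torch_dtype_to_qnn(mask.dtype))   # QNN_DATATYPE_BOOL_8
```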

backends/qualcomm/partition/common_defs.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -13,6 +13,8 @@
     exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.index.Tensor,
     exir_ops.edge.aten.full.default,
+    exir_ops.edge.aten.slice_scatter.default,
+    exir_ops.edge.aten.index_put.default,
 ]
 
 allow_list_operator = [
```
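For orientation, a rough sketch of how a deny-list like `not_supported_operator` typically feeds the partitioner's support check, so the two newly listed ops stay on CPU instead of lowering to QNN. The string targets and the `is_node_supported` helper are simplifications for illustration; the real list holds `exir_ops.edge.*` operator overloads:

```python
# Simplified: the real entries are exir_ops.edge.aten.* operator overloads.
not_supported_operator = [
    "aten.clone.default",
    "aten.index.Tensor",
    "aten.full.default",
    "aten.slice_scatter.default",  # added by this patch
    "aten.index_put.default",      # added by this patch
]


def is_node_supported(target: str) -> bool:
    # Deny-listed nodes are excluded from QNN partitions and run on CPU.
    return target not in not_supported_operator


assert not is_node_supported("aten.index_put.default")
assert is_node_supported("aten.add.Tensor")
```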

examples/models/llama2/export_llama_lib.py

Lines changed: 61 additions & 10 deletions

```diff
@@ -19,6 +19,7 @@
 
 import pkg_resources
 import torch
+import torch.nn.functional as F
 from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
     XnnpackDynamicallyQuantizedPartitioner,
@@ -34,7 +35,6 @@
 from executorch.sdk.etrecord import generate_etrecord
 from executorch.util.activation_memory_profiler import generate_memory_trace
 from sentencepiece import SentencePieceProcessor
-from torch.nn import functional as F
 
 from .builder import DType, LlamaEdgeManager, load_llama_model, WeightType
 from .quant_lib import _get_pt2e_quantization_params, get_pt2e_quantizers
```
```diff
@@ -337,6 +337,13 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--pt2e_quantize",
         default=None,
+        choices=[
+            "xnnpack_dynamic",
+            "xnnpack_dynamic_qc4",
+            "qnn_8a8w",
+            "qnn_16a16w",
+            "qnn_16a4w",
+        ],
         help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
     )
     parser.add_argument(
```
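Making the quantization mode an explicit `choices` list means argparse rejects unknown modes up front. A self-contained sketch of that behavior, including the `<backend>_<quant_config>` split the export flow performs on `qnn_*` values in a later hunk (parser shortened for illustration):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--pt2e_quantize",
    default=None,
    choices=["xnnpack_dynamic", "xnnpack_dynamic_qc4", "qnn_8a8w", "qnn_16a16w", "qnn_16a4w"],
)

args = parser.parse_args(["--pt2e_quantize", "qnn_8a8w"])
backend, quant_config = args.pt2e_quantize.split("_")  # only done for qnn_* values
print(backend, quant_config)  # qnn 8a8w

# An unknown mode now fails at parse time instead of deep in the export:
# parser.parse_args(["--pt2e_quantize", "qnn_4a4w"])  -> SystemExit with a choices error
```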
```diff
@@ -607,6 +614,8 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     if args.use_sdpa_with_kv_cache:
         transforms.append(replace_sdpa_with_custom_op)
 
+    if args.qnn and args.use_kv_cache:
+        transforms.append(replace_sdpa_with_simple_sdpa)
     return (
         load_llama_model(
             checkpoint=checkpoint_path,
```
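`replace_sdpa_with_simple_sdpa` is assumed here to swap the fused SDPA custom op for a plain decomposition the QNN partitioner can consume op by op. A generic sketch of such a decomposition, checked against `F.scaled_dot_product_attention`; this is illustrative math, not the exact ExecuTorch transform:

```python
import math

import torch
import torch.nn.functional as F


def simple_sdpa(q, k, v, mask=None):
    # softmax(q @ k^T / sqrt(head_dim)) @ v, expressed with basic ops.
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    if mask is not None:
        scores = scores + mask  # additive mask (-inf on masked positions)
    return F.softmax(scores, dim=-1) @ v


q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
assert torch.allclose(
    simple_sdpa(q, k, v),
    F.scaled_dot_product_attention(q, k, v),
    atol=1e-5,
)
```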
```diff
@@ -629,13 +638,16 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     # export_to_edge
     pt2e_quant_params = _get_pt2e_quantization_params(args)
     quantizers = get_pt2e_quantizers(pt2e_quant_params, args)
-    if args.qnn:
-        assert (
-            args.quantization_mode is None
-        ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
+    quant_dtype = None
+    if args.qnn and args.pt2e_quantize:
         try:
             # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer`
-            from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
+            from executorch.backends.qualcomm.quantizer.quantizer import (
+                get_16a4w_qnn_ptq_config,
+                get_default_16bit_qnn_ptq_config,
+                QnnQuantizer,
+                QuantDtype,
+            )
 
             # reset quantizers and pt2e_quant_params from xnnpack backend
             pt2e_quant_params = None
@@ -645,10 +657,36 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
                 "Please install the Qualcomm backend follwing https://pytorch.org/executorch/main/build-run-qualcomm.html"
             )
 
+        backend, quant_config = args.pt2e_quantize.split("_")
+        assert (
+            backend == "qnn"
+        ), f"The quantization config is for backend {backend} instead of qnn."
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer = QnnQuantizer()
         # more custom quantization are supported including 16a4w etc. default to 8bit quantized
         custom_annotations = ()
+        if quant_config == "8a8w":
+            quant_dtype = QuantDtype.use_8a8w
+            pass
+        elif quant_config == "16a16w":
+            quant_dtype = QuantDtype.use_16a16w
+            qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS)
+            qnn_quantizer.set_bit16_op_quant_config(get_default_16bit_qnn_ptq_config())
+        elif quant_config == "16a4w":
+            quant_dtype = QuantDtype.use_16a4w
+            qnn_quantizer.add_16bit_quant_ops(qnn_quantizer.SUPPORTED_OPS)
+            qnn_quantizer.set_bit16_op_quant_config(get_16a4w_qnn_ptq_config())
+            qnn_quantizer.set_per_channel_weight_dtype(
+                weight_dtype_for_16bit_act="int4"
+            )
+        else:
+            raise AssertionError(
+                f"No support for quant type {quant_config}. Support 8a8w, 16a16w and 16a4w."
+            )
+
+        assert (
+            args.quantization_mode is None
+        ), "Currently qnn backend only supports QnnQuantizer via pt2e flow"
         qnn_quantizer.add_custom_quant_annotations(custom_annotations)
         quantizers.append(qnn_quantizer)
 
```
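For reference, a quantizer configured this way plugs into the standard pt2e calibrate-and-convert flow before export to edge. A hedged sketch with a toy module; the exact capture API varies across torch versions, and the `QnnQuantizer` import (taken from this diff) requires the Qualcomm backend to be built:

```python
import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

# Import path taken from this diff; needs the Qualcomm backend installed.
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer

quantizer = QnnQuantizer()  # defaults correspond to the 8a8w branch above

model = torch.nn.Linear(16, 16).eval()
example_inputs = (torch.randn(1, 16),)

# Older torch versions used torch._export.capture_pre_autograd_graph here.
graph = torch.export.export(model, example_inputs).module()
prepared = prepare_pt2e(graph, quantizer)
prepared(*example_inputs)           # calibration pass to collect ranges
quantized = convert_pt2e(prepared)  # folds observers into quantize/dequantize ops
```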

```diff
@@ -763,24 +801,37 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
         )
 
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        use_fp16 = True
+        skip_node_op_set = {}
+        if args.pt2e_quantize:
+            use_fp16 = False
+            # TODO: fix the lowering error without skipping nodes
+            if quant_dtype == QuantDtype.use_8a8w:
+                skip_node_op_set = {
+                    "aten.unsqueeze_copy.default",
+                    "aten.permute_copy.default",
+                }
+            elif quant_dtype == QuantDtype.use_16a16w:
+                raise NotImplementedError("16a16w for llama is still under development")
+            elif quant_dtype == QuantDtype.use_16a4w:
+                raise NotImplementedError("16a4w for llama is still under development")
         partitioners.append(
             # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
             QnnPartitioner(
                 # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
                 generate_qnn_executorch_compiler_spec(
                     # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
                     soc_model=QcomChipset.SM8650,  # default to SM8650
-                    backend_options=backend_options,
+                    backend_options=generate_htp_compiler_spec(use_fp16=use_fp16),
                     debug=False,
                     saver=False,
                 ),
                 skip_node_id_set={},
-                skip_node_op_set={},
+                skip_node_op_set=skip_node_op_set,
             )
         )
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`
-        _transform(builder_exported_to_edge.export_program())
+        _transform(builder_exported_to_edge.edge_manager.exported_program())
 
         if args.generate_etrecord:
             if not builder_exported_to_edge.edge_manager:
```
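Two behaviors in this last hunk are worth noting: `use_fp16` now defaults to True, so a plain `--qnn` export without `--pt2e_quantize` lowers the model as fp16 on HTP, matching the fp16 command in the commit message; and the 8a8w skip list is an acknowledged stopgap for lowering errors, per the TODO. The `_transform` call is also fixed to fetch the exported program through `edge_manager`.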
