Qualcomm AI Engine Direct - HardSigmoid follow up for FP16 / Test cases complement

chuntl · chuntl · commit fd88d97db892 · 2024-04-01T20:26:12.000+08:00
Summary:
- make HardSigmoid more compact in FP16
- add online prepare utility test case
- add test case for export_example.py
diff --git a/backends/qualcomm/builders/op_dequantize.py b/backends/qualcomm/builders/op_dequantize.py
@@ -54,20 +54,16 @@ def define_node(
 
 
 @register_node_visitor
-class PerTensorDequantizeDefault(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_tensor.default"]
+class PerTensorDequantize(DequantizeOpBase):
+    target = [
+        "quantized_decomposed.dequantize_per_tensor.default",
+        "quantized_decomposed.dequantize_per_tensor.tensor",
+    ]
 
 
 @register_node_visitor
-class PerTensorDequantizeTensor(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_tensor.tensor"]
-
-
-@register_node_visitor
-class PerChannelDequantizeDefault(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_channel.default"]
-
-
-@register_node_visitor
-class PerChannelDequantizeTensor(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_channel.tensor"]
+class PerChannelDequantize(DequantizeOpBase):
+    target = [
+        "quantized_decomposed.dequantize_per_channel.default",
+        "quantized_decomposed.dequantize_per_channel.tensor",
+    ]
diff --git a/backends/qualcomm/passes/convert_hardsigmoid.py b/backends/qualcomm/passes/convert_hardsigmoid.py
@@ -25,6 +25,10 @@ def call(self, graph_module: torch.fx.GraphModule):
         partitions = get_source_partitions(graph, [torch.nn.Hardsigmoid])
         for _, src_partitions in partitions.items():
             for src_partition in src_partitions:
+                if exir_ops.edge.aten.hardswish.default in [
+                    node.target for node in src_partition.nodes
+                ]:
+                    continue
                 if self.quantization_capture:
                     # only one hardsigmoid op will be seen
                     input_nodes = src_partition.input_nodes
@@ -34,8 +38,6 @@ def call(self, graph_module: torch.fx.GraphModule):
                 else:
                     in_ops_target = exir_ops.edge.aten.add.Tensor
                     out_ops_target = exir_ops.edge.aten.div.Tensor
-                    # see the reverse engineering logic hardswish
-                    # https://shorturl.at/pACEL
                     input_nodes = [
                         n for n in src_partition.nodes if n.target is in_ops_target
                     ]
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -6,8 +6,10 @@
 import json
 import subprocess
 import sys
+import tempfile
 import unittest
 from multiprocessing.connection import Listener
+from pathlib import Path
 
 import torch
 from executorch.backends.qualcomm.tests.utils import (
@@ -1099,6 +1101,19 @@ def test_qnn_backend_shared_buffer(self):
             expected_partitions=1,
         )
 
+    def test_qnn_backend_online_prepare(self):
+        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+            debug=False,
+            saver=False,
+            online_prepare=True,
+        )
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        self.lower_module_and_test_output(module, sample_input)
+
 
 class TestQNNQuantizedUtils(TestQNN):
     # TODO: refactor to support different backends
@@ -1220,6 +1235,20 @@ def test_qnn_backend_shared_buffer(self):
             expected_partitions=1,
         )
 
+    def test_qnn_backend_online_prepare(self):
+        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+            debug=False,
+            saver=False,
+            online_prepare=True,
+        )
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
 
 class TestExampleScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
@@ -1586,6 +1615,29 @@ def test_ptq_mobilebert(self):
             for k, v in cpu.items():
                 self.assertLessEqual(abs(v[0] - htp[k][0]), 5)
 
+    def test_export_example(self):
+        if not self.required_envs([self.model_name]):
+            self.skipTest("missing required envs")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            cmds = [
+                "python",
+                "qualcomm/scripts/export_example.py",
+                "--model_name",
+                self.model_name,
+                "--output_folder",
+                "{}/".format(tmp_dir),
+                "--generate_etrecord",
+            ]
+
+            p = subprocess.Popen(
+                cmds, stdout=subprocess.DEVNULL, cwd=f"{self.executorch_root}/examples"
+            )
+            p.communicate()
+            self.assertTrue(
+                Path("{0}/{1}.pte".format(tmp_dir, self.model_name)).exists()
+            )
+
 
 def setup_environment():
     parser = setup_common_args_and_variables()
@@ -1615,6 +1667,12 @@ def setup_environment():
         default="",
         type=str,
     )
+    parser.add_argument(
+        "-n",
+        "--model_name",
+        help="Input the model to export",
+        type=str,
+    )
     parser.add_argument(
         "-o",
         "--online_prepare",
@@ -1643,6 +1701,7 @@ def setup_environment():
     TestQNN.artifact_dir = args.artifact_dir
     TestQNN.image_dataset = args.image_dataset
     TestQNN.pretrained_weight = args.pretrained_weight
+    TestQNN.model_name = args.model_name
     TestQNN.online_prepare = args.online_prepare
     TestQNN.enable_profile = args.enable_profile
     TestQNN.error_only = args.error_only
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
@@ -19,6 +19,7 @@
     ConvertBinaryOpsWithScalar,
 )
 from executorch.backends.qualcomm.passes.convert_bmm_to_matmul import ConvertBmmToMatmul
+from executorch.backends.qualcomm.passes.convert_hardsigmoid import ConvertHardsigmoid
 from executorch.backends.qualcomm.passes.convert_interpolate_with_upsample2d import (
     ConvertInterpolateWithUpsample2D,
 )
@@ -104,6 +105,7 @@ def _transform(edge_program: ExportedProgram) -> None:
     graph_module = edge_program.graph_module
     RemoveClone()(graph_module)
     ConvertToLinear()(graph_module)
+    ConvertHardsigmoid()(graph_module)
     ConvertBmmToMatmul()(graph_module)
     ConvertInterpolateWithUpsample2D()(graph_module)
     I64toI32(edge_program)(graph_module)
diff --git a/examples/qualcomm/scripts/export_example.py b/examples/qualcomm/scripts/export_example.py
@@ -40,6 +40,14 @@
         help="Generate ETRecord metadata to link with runtime results (used for profiling)",
     )
 
+    parser.add_argument(
+        "-f",
+        "--output_folder",
+        type=str,
+        default="",
+        help="The folder to store the exported program",
+    )
+
     args = parser.parse_args()
 
     if args.model_name not in MODEL_NAME_TO_MODEL:
@@ -92,7 +100,7 @@
     )
 
     if args.generate_etrecord:
-        etrecord_path = "etrecord.bin"
+        etrecord_path = args.output_folder + "etrecord.bin"
         generate_etrecord(etrecord_path, edge_copy, executorch_program)
 
-    save_pte_program(executorch_program, args.model_name)
+    save_pte_program(executorch_program, args.model_name, args.output_folder)