Qualcomm AI Engine Direct - GA FocalNet

winskuo-quic · winskuo-quic · commit 1a4c77c39a67 · 2025-05-22T11:04:25.000+08:00
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
@@ -253,7 +253,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
 
   pybind11_extension(PyQnnManagerAdaptor)
   pybind11_extension(PyQnnWrapperAdaptor)
-  if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo)
+  if(NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES RelWithDebInfo)
     # Strip unnecessary sections of the binary
     pybind11_strip(PyQnnManagerAdaptor)
     pybind11_strip(PyQnnWrapperAdaptor)
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from .annotate_adaptive_avg_pool1d import AnnotateAdaptiveAvgPool1D
 from .annotate_quant_attrs import AnnotateQuantAttrs
 from .annotate_stack import AnnotateStack
 from .annotate_unbind import AnnotateUnbind
@@ -38,6 +39,7 @@
 
 
 __all__ = [
+    AnnotateAdaptiveAvgPool1D,
     AnnotateQuantAttrs,
     AnnotateStack,
     AnnotateUnbind,
diff --git a/backends/qualcomm/_passes/annotate_adaptive_avg_pool1d.py b/backends/qualcomm/_passes/annotate_adaptive_avg_pool1d.py
@@ -0,0 +1,43 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from executorch.backends.qualcomm.builders.node_visitor import q_ops
+from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+
+from .utils import get_quant_attrs
+
+
+class AnnotateAdaptiveAvgPool1D(ExportPass):
+    """
+    Add "quant_attrs" to graph nodes' meta from the QDQ information
+    generated after the quantization process.
+    adaptive_avg_pool1d is decomposed into unsqueeze -> adaptive_avg_pool2d -> squeeze
+    """
+
+    def __init__(self, edge_program: torch.export.ExportedProgram):
+        super(AnnotateAdaptiveAvgPool1D, self).__init__()
+        self.edge_program = edge_program
+
+    def _annotate_adaptive_avg_pool1d(self, graph_module: torch.fx.GraphModule):
+        partitions = get_source_partitions(
+            graph_module.graph, [torch.ops.aten.adaptive_avg_pool1d.default]
+        )
+        for src_partitions in partitions.values():
+            for src_partition in src_partitions:
+                output = src_partition.output_nodes[0]
+                if (list(output.users)[0].target) in q_ops:
+                    quant_attrs = get_quant_attrs(
+                        self.edge_program, list(output.users)[0]
+                    )
+                    for n in src_partition.nodes:
+                        n.meta[QCOM_QUANT_ATTRS] = quant_attrs.copy()
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        self._annotate_adaptive_avg_pool1d(graph_module)
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/qualcomm/_passes/annotate_quant_attrs.py b/backends/qualcomm/_passes/annotate_quant_attrs.py
@@ -7,6 +7,7 @@
 from typing import Any, Dict
 
 import torch
+from executorch.backends.qualcomm.builders.node_visitor import dq_ops, q_ops
 from executorch.backends.qualcomm.builders.utils import get_parameter
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_DTYPE,
@@ -20,7 +21,7 @@
 )
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import dq_ops, get_quant_attrs, q_ops
+from .utils import get_quant_attrs
 
 
 class AnnotateQuantAttrs(ExportPass):
diff --git a/backends/qualcomm/_passes/annotate_stack.py b/backends/qualcomm/_passes/annotate_stack.py
@@ -4,11 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import torch
+from executorch.backends.qualcomm.builders.node_visitor import q_ops
 from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
 
-from .utils import get_quant_attrs, q_ops
+from .utils import get_quant_attrs
 
 
 class AnnotateStack(ExportPass):
@@ -27,7 +28,7 @@ def _annotate_stack(self, graph_module: torch.fx.GraphModule):
         partitions = get_source_partitions(
             graph_module.graph, [torch.stack, torch.ops.aten.stack.default, "stack"]
         )
-        for _, src_partitions in partitions.items():
+        for src_partitions in partitions.values():
             for src_partition in src_partitions:
                 output = src_partition.output_nodes[0]
                 if (list(output.users)[0].target) in q_ops:
diff --git a/backends/qualcomm/_passes/annotate_unbind.py b/backends/qualcomm/_passes/annotate_unbind.py
@@ -4,11 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import torch
+
+from executorch.backends.qualcomm.builders.node_visitor import dq_ops
 from executorch.backends.qualcomm.utils.constants import QCOM_QUANT_ATTRS
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
 
-from .utils import dq_ops, get_quant_attrs
+from .utils import get_quant_attrs
 
 
 class AnnotateUnbind(ExportPass):
@@ -27,7 +29,7 @@ def _annotate_unbind(self, graph_module: torch.fx.GraphModule):
         partitions = get_source_partitions(
             graph_module.graph, [torch.unbind, torch.ops.aten.unbind.int, "unbind"]
         )
-        for _, src_partitions in partitions.items():
+        for src_partitions in partitions.values():
             for src_partition in src_partitions:
                 if src_partition.input_nodes[0].target in dq_ops:
                     q_node = src_partition.input_nodes[0].args[0]
diff --git a/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py b/backends/qualcomm/_passes/expand_broadcast_tensor_shape.py
@@ -5,12 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
+
+from executorch.backends.qualcomm.builders.node_visitor import dq_ops
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from executorch.exir.passes import dead_code_elimination_pass
 
-from .utils import dq_ops
-
 
 class ExpandBroadcastTensorShape(ExportPass):
     """
diff --git a/backends/qualcomm/_passes/fold_qdq.py b/backends/qualcomm/_passes/fold_qdq.py
@@ -4,14 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import torch
+
+from executorch.backends.qualcomm.builders.node_visitor import dq_ops, q_ops
 from executorch.backends.qualcomm.builders.utils import is_parameter
 from executorch.backends.qualcomm.utils.constants import QCOM_BYPASS_NODE
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from executorch.exir.passes import dead_code_elimination_pass
 
-from .utils import dq_ops, q_ops
-
 
 class FoldQDQ(ExportPass):
     """
diff --git a/backends/qualcomm/_passes/insert_io_qdq.py b/backends/qualcomm/_passes/insert_io_qdq.py
@@ -7,6 +7,8 @@
 
 import torch
 
+from executorch.backends.qualcomm.builders.node_visitor import q_ops
+
 from executorch.backends.qualcomm.builders.utils import is_parameter
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_ENCODING,
@@ -16,8 +18,6 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 
-from .utils import q_ops
-
 
 class InsertIOQDQ(ExportPass):
     """
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -9,6 +9,7 @@
 from typing import Dict
 
 from executorch.backends.qualcomm._passes import (
+    AnnotateAdaptiveAvgPool1D,
     AnnotateQuantAttrs,
     AnnotateStack,
     AnnotateUnbind,
@@ -73,6 +74,7 @@ def get_capture_program_passes():
     # The second value in each tuple in `default_passes_and_setting` indicates whether the corresponding pass is activated by default.
     # If a pass is activated, it will be executed by default.
     default_passes_and_setting = [
+        (AnnotateAdaptiveAvgPool1D, True),
         (AnnotateQuantAttrs, True),
         (AnnotateStack, True),
         (AnnotateUnbind, True),
@@ -128,11 +130,11 @@ def get_to_edge_transform_passes(
         dep_table: Dict = None,
     ):
         # TODO: remove this workaround when target could be correctly detected
-        from executorch.backends.qualcomm._passes import utils
+        from executorch.backends.qualcomm.builders import node_visitor
         from executorch.exir.dialects._ops import ops as exir_ops
 
-        utils.q_ops.add(exir_ops.edge.pt2e_quant.quantize_affine.default)
-        utils.dq_ops.add(exir_ops.edge.pt2e_quant.dequantize_affine.default)
+        node_visitor.q_ops.add(exir_ops.edge.pt2e_quant.quantize_affine.default)
+        node_visitor.dq_ops.add(exir_ops.edge.pt2e_quant.dequantize_affine.default)
 
         passes_job = (
             passes_job if passes_job is not None else get_capture_program_passes()
diff --git a/backends/qualcomm/_passes/recompose_rms_norm.py b/backends/qualcomm/_passes/recompose_rms_norm.py
@@ -4,13 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import torch
+
+from executorch.backends.qualcomm.builders.node_visitor import dq_ops
 from executorch.backends.qualcomm.builders.utils import get_parameter, is_parameter
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
 
-from .utils import dq_ops
-
 
 class RecomposeRmsNorm(ExportPass):
     """
diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py
@@ -13,19 +13,6 @@
 from torch._subclasses import FakeTensor
 
 
-q_ops = {
-    exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
-    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-    exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
-}
-
-dq_ops = {
-    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
-    exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
-}
-
-
 def copy_meta(meta: Dict, callback=None):
     copied = {}
     for k, v in meta.items():
@@ -73,6 +60,7 @@ def get_passes_dependency_for_capture_program():
         dict: A dictionary mapping each pass to its corresponding list of dependencies.
     """
     from executorch.backends.qualcomm._passes import (
+        AnnotateAdaptiveAvgPool1D,
         AnnotateQuantAttrs,
         AnnotateStack,
         AnnotateUnbind,
@@ -94,6 +82,7 @@ def get_passes_dependency_for_capture_program():
     )
 
     return {
+        AnnotateAdaptiveAvgPool1D: [RemoveRedundancy],
         AnnotateQuantAttrs: [
             RecomposePixelUnshuffle,
             ConvertBmmToMatmul,
diff --git a/backends/qualcomm/builders/node_visitor.py b/backends/qualcomm/builders/node_visitor.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import torch
-from executorch.backends.qualcomm._passes.utils import dq_ops
 from executorch.backends.qualcomm.utils.constants import (
     QCOM_AXIS,
     QCOM_AXIS_ORDER,
@@ -79,6 +78,18 @@
     exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
 }
 
+q_ops = {
+    exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
+}
+
+dq_ops = {
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+    exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
+    exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
+}
+
 
 class NodeVisitor:
     """
diff --git a/backends/qualcomm/quantizer/annotators.py b/backends/qualcomm/quantizer/annotators.py
@@ -462,8 +462,13 @@ def annotate_neg(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_single_in_single_out(node, quantization_config)
 
 
-@register_annotator([torch.ops.aten.adaptive_avg_pool2d.default])
-def annotate_adaptive_avgpool2d(
+@register_annotator(
+    [
+        torch.ops.aten.adaptive_avg_pool1d.default,
+        torch.ops.aten.adaptive_avg_pool2d.default,
+    ]
+)
+def annotate_adaptive_avg_pool(
     node: Node, quantization_config: QuantizationConfig
 ) -> None:
     annotate_single_in_single_out(node, quantization_config)
@@ -1170,7 +1175,13 @@ def annotate_unbind(node: Node, quantization_config: QuantizationConfig) -> None
         )
 
 
-@register_annotator([torch.ops.aten.split.Tensor, torch.ops.aten.chunk.default])
+@register_annotator(
+    [
+        torch.ops.aten.split_with_sizes.default,
+        torch.ops.aten.split.Tensor,
+        torch.ops.aten.chunk.default,
+    ]
+)
 def annotate_chunk(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]):
         return
diff --git a/backends/qualcomm/scripts/build.sh b/backends/qualcomm/scripts/build.sh
@@ -30,7 +30,7 @@ CMAKE_X86_64="build-x86"
 BUILD_AARCH64="true"
 CMAKE_AARCH64="build-android"
 CLEAN="true"
-BUILD_TYPE="Debug"
+BUILD_TYPE="RelWithDebInfo"
 BUILD_JOB_NUMBER="16"
 
 if [ -z PYTHON_EXECUTABLE ]; then
@@ -71,7 +71,7 @@ if [ "$BUILD_AARCH64" = true ]; then
         rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
     else
         # Force rebuild flatccrt for the correct platform
-        cd $BUILD_ROOT/devtools && make clean
+        cd $BUILD_ROOT/third-party/flatcc && make clean
     fi
 
     cd $BUILD_ROOT
@@ -116,7 +116,7 @@ if [ "$BUILD_X86_64" = true ]; then
         rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
     else
         # Force rebuild flatccrt for the correct platform
-        cd $BUILD_ROOT/devtools && make clean
+        cd $BUILD_ROOT/third-party/flatcc && make clean
     fi
 
     cd $BUILD_ROOT
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -3868,6 +3868,44 @@ def test_fbnet(self):
                 self.assertGreaterEqual(msg["top_1"], 60)
                 self.assertGreaterEqual(msg["top_5"], 90)
 
+    def test_focalnet(self):
+        if not self.required_envs([self.image_dataset]):
+            self.skipTest("missing required envs")
+
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/focalnet.py",
+            "--dataset",
+            self.image_dataset,
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--device",
+            self.device,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+        ]
+        if self.host:
+            cmds.extend(["--host", self.host])
+        if self.shared_buffer:
+            cmds.extend(["--shared_buffer"])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertGreaterEqual(msg["top_1"], 55)
+                self.assertGreaterEqual(msg["top_5"], 80)
+
     def test_gMLP(self):
         if not self.required_envs([self.image_dataset]):
             self.skipTest("missing required envs")
diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py
diff --git a/examples/qualcomm/CMakeLists.txt b/examples/qualcomm/CMakeLists.txt
diff --git a/examples/qualcomm/oss_scripts/focalnet.py b/examples/qualcomm/oss_scripts/focalnet.py