pytorch · freddan80 · Jan 14, 2025 · Jan 10, 2025 · Jan 9, 2025 · Dec 17, 2024
@@ -119,7 +119,7 @@ backends/arm/test/setup_testing.sh
 Then you can run the tests with
 
 ```
-pytest -c /dev/null -v -n auto backends/arm/test --arm_quantize_io --arm_run_corstoneFVP
+pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
 ```
 
 ### Code coverage

@@ -178,6 +178,20 @@ def is_tosa(compile_spec: List[CompileSpec]) -> bool:
     return False
 
 
+def is_quantize_io(compile_specs: List[CompileSpec]) -> bool:
+    """Report whether IO quantization is switched on in the compile specs.
+
+    Returns True when at least one entry has the key "quantize_io" and a
+    value that decodes to the string "True"; otherwise returns False.
+    """
+    return any(
+        entry.key == "quantize_io" and entry.value.decode() == "True"
+        for entry in compile_specs
+    )
+
+
+def get_tosa_version(compile_spec: List[CompileSpec]) -> TosaSpecification:
+    """Build the TosaSpecification named by the "tosa_version" compile spec.
+
+    The first entry whose key is "tosa_version" is used; its value is decoded
+    and handed to TosaSpecification.create_from_string.
+
+    Raises:
+        RuntimeError: If no entry with the key "tosa_version" is present.
+    """
+    version_entry = next(
+        (entry for entry in compile_spec if entry.key == "tosa_version"),
+        None,
+    )
+    if version_entry is None:
+        raise RuntimeError("Could not find TOSA version in CompileSpec")
+    return TosaSpecification.create_from_string(version_entry.value.decode())
+
+
 def get_intermediate_path(compile_spec: List[CompileSpec]) -> Optional[str]:
     for spec in compile_spec:
         if spec.key == "debug_artifact_path":

@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -10,8 +10,10 @@
 from typing import Callable, final, List, Optional, Tuple
 
 import torch
-from executorch.backends.arm.arm_backend import ArmBackend  # usort: skip
-from executorch.backends.arm._passes.tag_io_quant_pass import TagIOQuantPass
+from executorch.backends.arm.arm_backend import (
+    ArmBackend,
+    is_quantize_io,
+)  # usort: skip
 from executorch.backends.arm.operator_support.tosa_supported_operators import (
     TOSASupportedOperators,
 )
@@ -23,7 +25,7 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data
-from executorch.exir.passes import PassManager
+from executorch.exir.dialects._ops import ops as exir_ops
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
 
@@ -35,6 +37,22 @@
     logger.setLevel(logging.INFO)
 
 
+def is_quant_node(node: torch.fx.node.Node) -> bool:
+    """Report whether `node` targets a quantized_decomposed quantize op.
+
+    Matches the per-channel default overload and the per-tensor default and
+    tensor overloads.
+    """
+    decomposed_ops = exir_ops.edge.quantized_decomposed
+    quantize_targets = {
+        decomposed_ops.quantize_per_channel.default,
+        decomposed_ops.quantize_per_tensor.default,
+        decomposed_ops.quantize_per_tensor.tensor,
+    }
+    return node.target in quantize_targets
+
+
+def is_dequant_node(node: torch.fx.node.Node) -> bool:
+    """Report whether `node` targets a quantized_decomposed dequantize op.
+
+    Matches the per-channel default overload and the per-tensor default and
+    tensor overloads.
+    """
+    decomposed_ops = exir_ops.edge.quantized_decomposed
+    dequantize_targets = {
+        decomposed_ops.dequantize_per_channel.default,
+        decomposed_ops.dequantize_per_tensor.default,
+        decomposed_ops.dequantize_per_tensor.tensor,
+    }
+    return node.target in dequantize_targets
+
+
 @final
 class ArmPartitioner(Partitioner):
     def __init__(self, compile_spec: List[CompileSpec]) -> None:
@@ -43,6 +61,7 @@ def __init__(self, compile_spec: List[CompileSpec]) -> None:
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         # Run the CapabilityBasedPartitioner to return the largest possible
         # subgraphs containing the nodes with the tags
+
         logger.info("ArmPartitioner::partition")
         partition_tags = {}
 
@@ -52,28 +71,42 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
 
         logger.info(f"Partitioning for {tosa_spec}")
 
-        for spec in self.delegation_spec.compile_specs:
-            if spec.key == "quantize_io" and spec.value.decode() == "True":
-                # Exclude IO quantization from the partition
-                passes = PassManager(
-                    passes=[
-                        TagIOQuantPass(),
-                    ]
-                )
-                passes(exported_program.graph_module)
-
         capability_partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
             TOSASupportedOperators(tosa_spec),
             allows_single_node_partition=True,
         )
         partition_list = capability_partitioner.propose_partitions()
         for partition in partition_list:
+            tag = f"tag{partition.id}"
+
+            def is_partitioned(node: torch.fx.Node, tag=tag) -> bool:
+                return (
+                    "delegation_tag" in node.meta and node.meta["delegation_tag"] == tag
+                )
+
             for node in partition.nodes:
-                tag = f"tag{partition.id}"
                 node.meta["delegation_tag"] = tag
                 partition_tags[tag] = self.delegation_spec
 
+            if not is_quantize_io(self.delegation_spec.compile_specs):
+                continue
+
+            # De-tag outermost q-nodes upwards and dq-nodes downwards.
+            # De-tag if at least one input/output is not part of the partition.
+            for node in partition.nodes:
+                if is_quant_node(node):
+                    for input in node.all_input_nodes:
+                        if not is_partitioned(input):
+                            del node.meta["delegation_tag"]
+                            break
+
+                if is_dequant_node(node):
+                    for user in node.users:
+                        if not is_partitioned(user):
+                            del node.meta["delegation_tag"]
+                            break
+
         tag_constant_data(exported_program)
 
         return PartitionResult(

@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Configures and builds the quantized_ops_aot_lib target into
+# ./cmake-out-aot-lib so the quant ops can be registered with the AoT flow.
+#
+# Needs to be run from executorch root.
+# Optional parameter: 1: build_type= "Release" | "Debug" | "RelWithDebInfo"
+
+# Stop at the first failing command (and on unset variables) so that a failed
+# configure step is never followed by a build of a stale or missing tree.
+set -eu
+
+build_type="${1:-Release}"
+
+# distutils was removed in Python 3.12; sysconfig reports the same
+# site-packages ("purelib") directory on all supported versions.
+SITE_PACKAGES="$(python3 -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')"
+CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+
+echo "--------------------------------------------------------------------------------"
+echo "Build .so library to register quant ops with AoT flow ${build_type} into '$(pwd)/cmake-out-aot-lib'"
+echo "--------------------------------------------------------------------------------"
+
+# Since we only want to build the quantized_aot lib in the specified folder,
+# we want exactly the configuration set below and deleting the cache is OK.
+rm -f cmake-out-aot-lib/CMakeCache.txt
+
+cmake \
+    -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH"    \
+    -DCMAKE_BUILD_TYPE="${build_type}"          \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON     \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
+    -Bcmake-out-aot-lib                         \
+    .
+
+cmake --build cmake-out-aot-lib --parallel -- quantized_ops_aot_lib
@@ -12,8 +12,6 @@
 from pathlib import Path
 
 from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder
-
-from executorch.backends.arm.test.conftest import is_option_enabled
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 
@@ -50,7 +48,6 @@ def maybe_get_tosa_collate_path() -> str | None:
             tosa_test_base = os.path.join(tosa_test_base, "tosa-mi")
         else:
             tosa_test_base = os.path.join(tosa_test_base, "other")
-
         return os.path.join(tosa_test_base, test_class, test_name)
 
     return None
@@ -78,13 +75,14 @@ def get_tosa_compile_spec_unbuilt(
         ArmCompileSpecBuilder()
         .tosa_compile_spec(tosa_version)
         .dump_intermediate_artifacts_to(custom_path)
+        .set_quantize_io(True)
     )
 
     return compile_spec_builder
 
 
 def get_u55_compile_spec(
-    quantize_io=False,
+    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> list[CompileSpec]:
@@ -99,7 +97,7 @@ def get_u55_compile_spec(
 
 
 def get_u85_compile_spec(
-    quantize_io=False,
+    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> list[CompileSpec]:
@@ -114,7 +112,7 @@ def get_u85_compile_spec(
 
 
 def get_u55_compile_spec_unbuilt(
-    quantize_io=False,
+    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> ArmCompileSpecBuilder:
@@ -132,15 +130,15 @@ def get_u55_compile_spec_unbuilt(
             memory_mode="Shared_Sram",
             extra_flags="--debug-force-regor --output-format=raw",
         )
-        .set_quantize_io(is_option_enabled("quantize_io") or quantize_io)
+        .set_quantize_io(quantize_io)
         .dump_intermediate_artifacts_to(artifact_path)
         .set_input_order(reorder_inputs)
     )
     return compile_spec
 
 
 def get_u85_compile_spec_unbuilt(
-    quantize_io=False,
+    quantize_io=True,
     custom_path=None,
     reorder_inputs=None,
 ) -> list[CompileSpec]:
@@ -156,7 +154,7 @@ def get_u85_compile_spec_unbuilt(
             memory_mode="Shared_Sram",
             extra_flags="--output-format=raw",
         )
-        .set_quantize_io(is_option_enabled("quantize_io") or quantize_io)
+        .set_quantize_io(quantize_io)
         .dump_intermediate_artifacts_to(artifact_path)
         .set_input_order(reorder_inputs)
     )