
Commit f680897

mcremon-meta authored and facebook-github-bot committed
Migrate the quantizer to use aten ops directly (#4195)
Summary: This major change allows much more flexibility in the quantizer and reduces its dependency on the decomposition/graph-tracing tools. The motivation is that some of those tools do not preserve or propagate `source_fn_stack` information, resulting in quantization misses. SDPA is one example, where the underlying `bmm` ops cannot be quantized with `source_fn_stack` information alone; MHA is another, since it can hide its SDPA component and sometimes even its `linear` ops depending on the model (see ViT for an example). Also note that in most cases we match single nodes anyway, with a 1-1 mapping between the op (either nn.Module or nn.functional) and the aten op, so using the aten op directly is simply easier.

Summary of the changes:
- change the quantizer to match aten ops directly, through `node.target`
- propagate the required changes to the `QuantFusion` pass
- update/remove existing patterns

Reviewed By: dulinriley

Differential Revision: D59552606
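As a minimal illustration of what matching "aten ops directly, through `node.target`" means, here is a hedged sketch; the `aten.linear` op and the `matches` helper are examples chosen for this note, not code from the diff:

import torch
from torch.fx import Node

# Assumed example: a pattern now names the aten overloads it covers.
WANTED_ATEN_OPS = [torch.ops.aten.linear.default]

def matches(node: Node) -> bool:
    # In an exported aten-level graph, call_function nodes carry the aten
    # OpOverload in node.target, so no source_fn_stack metadata is needed.
    return node.op == "call_function" and node.target in WANTED_ATEN_OPS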
1 parent aa50879 commit f680897

File tree

5 files changed: +127 −29 lines changed


backends/cadence/aot/compiler.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@
 )
 from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion
 from executorch.backends.cadence.aot.quantizer.quantizer import (
-    CadenceGenericQuantizer,
+    CadenceAtenQuantizer,
     CadenceQuantizer,
 )
 from executorch.backends.cadence.aot.utils import model_is_quantized
@@ -58,7 +58,7 @@ def quantize_pt2(

     # Get patterns and apply fusion of dq -> op -> q to qop
     patterns = [
-        assert_is_instance(q, CadenceGenericQuantizer).pattern
+        assert_is_instance(q, CadenceAtenQuantizer).pattern
         for q in quantizer.quantizers
     ]
     QuantFusion(patterns)(converted_model)

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 5 additions & 10 deletions
@@ -14,21 +14,19 @@
     BmmPattern,
     Conv1dPattern,
     Conv2dPattern,
-    LayerNormFunctionalPattern,
     LayerNormPattern,
-    LinearFunctionalPattern,
     LinearPattern,
     MatmulPattern,
     ReluPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
     create_zero_bias_int32,
+    find_sequential_partitions_aten,
     get_conv_args,
     quantize_tensor_multiplier,
 )
 from executorch.exir.pass_base import ExportPass
 from torch import fx
-from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
 from torch.fx.passes.utils.fuser_utils import legalize_graph
@@ -310,14 +308,15 @@ def __init__(self, patterns) -> None:

     def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
         for pattern in self.patterns:
-            fused_partitions = find_sequential_partitions(
+            fused_partitions = find_sequential_partitions_aten(
                 graph_module,
                 pattern.partition_types(),
             )
             for fused_partition in fused_partitions:
                 anchors = pattern.get_anchors(graph_module, fused_partition)
                 if not anchors:
                     continue
+                # pyre-ignore[16]: Undefined attribute
                 if any(self.is_fused(p.nodes) for p in fused_partition):
                     continue

@@ -373,9 +372,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                         quant_node,
                         op_node,
                     )
-            elif isinstance(pattern, LinearPattern) or isinstance(
-                pattern, LinearFunctionalPattern
-            ):
+            elif isinstance(pattern, LinearPattern):
                 args, kwargs = get_args_and_kwargs_linear(
                     graph_module,
                     inputs_inputs,
@@ -385,9 +382,7 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                     bias_inputs,
                     quant_node,
                 )
-            elif isinstance(pattern, LayerNormPattern) or isinstance(
-                pattern, LayerNormFunctionalPattern
-            ):
+            elif isinstance(pattern, LayerNormPattern):
                 args, kwargs = get_args_and_kwargs_layer_norm(
                     graph_module,
                     inputs_inputs,

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 11 additions & 15 deletions
@@ -14,15 +14,14 @@
     BmmPattern,
     Conv1dPattern,
     Conv2dPattern,
-    LayerNormFunctionalPattern,
     LayerNormPattern,
-    LinearFunctionalPattern,
     LinearPattern,
     MatmulPattern,
     QuantizationPattern,
     ReluPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
+    find_sequential_partitions_aten,
     is_annotated,
     no_outside_users,
 )
@@ -31,7 +30,6 @@
 from torch import fx

 from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
-from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
 from torch.ao.quantization.quantizer import DerivedQuantizationSpec, Quantizer
 from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
 from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
@@ -63,7 +61,7 @@
     bias_qspec: Optional[QuantizationSpec] = None


-class CadenceGenericQuantizer(Quantizer):
+class CadenceAtenQuantizer(Quantizer):
     def __init__(
         self, pattern: QuantizationPattern, quantization_config: QuantizationConfig
     ) -> None:
@@ -72,7 +70,7 @@ def __init__(
         self.quantization_config = quantization_config

     def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
-        fused_partitions = find_sequential_partitions(
+        fused_partitions = find_sequential_partitions_aten(
             model,
             self.pattern.partition_types(),
         )
@@ -154,15 +152,13 @@ def __init__(self) -> None:
         )
         super().__init__(
             [
-                CadenceGenericQuantizer(AddmmPattern(), static_qconfig),
-                CadenceGenericQuantizer(BmmPattern(), static_qconfig),
-                CadenceGenericQuantizer(Conv1dPattern(), static_qconfig),
-                CadenceGenericQuantizer(Conv2dPattern(), static_qconfig),
-                CadenceGenericQuantizer(LayerNormPattern(), static_qconfig),
-                CadenceGenericQuantizer(LayerNormFunctionalPattern(), static_qconfig),
-                CadenceGenericQuantizer(LinearPattern(), static_qconfig),
-                CadenceGenericQuantizer(LinearFunctionalPattern(), static_qconfig),
-                CadenceGenericQuantizer(MatmulPattern(), static_qconfig),
-                CadenceGenericQuantizer(ReluPattern(), static_qconfig),
+                CadenceAtenQuantizer(AddmmPattern(), static_qconfig),
+                CadenceAtenQuantizer(BmmPattern(), static_qconfig),
+                CadenceAtenQuantizer(Conv1dPattern(), static_qconfig),
+                CadenceAtenQuantizer(Conv2dPattern(), static_qconfig),
+                CadenceAtenQuantizer(LayerNormPattern(), static_qconfig),
+                CadenceAtenQuantizer(LinearPattern(), static_qconfig),
+                CadenceAtenQuantizer(MatmulPattern(), static_qconfig),
+                CadenceAtenQuantizer(ReluPattern(), static_qconfig),
             ]
         )
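For context, a hedged usage sketch of the renamed quantizer in the standard pt2e flow; the toy model, the calibration input, and the use of torch.export for graph capture are assumptions for illustration, not part of this diff:

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer

# Placeholder model and input, for illustration only.
model = torch.nn.Linear(16, 16).eval()
inputs = (torch.randn(1, 16),)

# Capture an aten-level graph so node.target holds aten overloads
# (graph-capture API assumed here; the Cadence flow may use its own entry point).
exported = torch.export.export(model, inputs).module()

quantizer = CadenceQuantizer()
prepared = prepare_pt2e(exported, quantizer)
prepared(*inputs)  # calibrate observers on representative data
converted = convert_pt2e(prepared)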

backends/cadence/aot/quantizer/utils.py

Lines changed: 106 additions & 1 deletion
@@ -4,14 +4,21 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import itertools
+from collections import OrderedDict
 from math import frexp, isclose, trunc
-from typing import List, Tuple
+from typing import Any, Dict, List, Tuple, Type

 import torch
 from torch import fx
+from torch._ops import OpOverload
 from torch.ao.quantization import ObserverOrFakeQuantize

 from torch.fx import GraphModule
+from torch.fx.passes.utils.source_matcher_utils import (
+    check_subgraphs_connected,
+    SourcePartition,
+)


 def quantize_tensor_multiplier(
@@ -127,3 +134,101 @@ def get_bias_qparams(

 def get_conv_args(arg, first_val: int) -> List[fx.Node]:
     return arg if len(arg) == 2 else [first_val, arg[0]]
+
+
+def get_aten_node_target_partitions(
+    graph: torch.fx.Graph,
+    wanted_original_aten_op: List[OpOverload],
+) -> Dict[Any, List[SourcePartition]]:
+    """
+    Args:
+        graph: The graph we want to partition
+        wanted_original_aten_op: List of original aten ops (OpOverload)
+
+    Returns:
+        Dictionary mapping aten ops that were given to a list of SourcePartitions
+        that correspond to the list of nodes that were decomposed from the given
+        aten ops.
+    """
+    modules: Dict[Type, Dict[str, List[torch.fx.Node]]] = {}
+
+    for node in graph.nodes:
+        # The metadata source_fn should contain a tuple of a unique name for the
+        # source, and the source function if the node is decomposed from a
+        # function, or the type of module if the node is decomposed from a leaf
+        # module
+        # TODO(matthiascremon): look into ways to avoid using source_fn_stack
+        if (source_fn_st := node.meta.get("source_fn_stack")) is None:
+            continue
+
+        source_fn = source_fn_st[-1]
+        if node.target not in wanted_original_aten_op:
+            continue
+
+        diff_modules = modules.setdefault(source_fn[1], {})
+        partition = diff_modules.setdefault(node.name, [])
+        partition.append(node)
+
+    def make_partition(
+        nodes: List[torch.fx.Node], module_type: Type
+    ) -> SourcePartition:
+        input_nodes = set()
+        output_nodes = set()
+        params = set()
+        for node in nodes:
+            for arg in node.args:
+                if isinstance(arg, torch.fx.Node) and arg not in nodes:
+                    input_nodes.add(arg)
+
+            if node.op == "get_attr":
+                params.add(node)
+
+            for user in node.users.keys():
+                if user not in nodes:
+                    output_nodes.add(node)
+
+        return SourcePartition(
+            nodes,
+            module_type,
+            list(input_nodes),
+            list(output_nodes),
+            list(params),  # type: ignore[arg-type]
+        )
+
+    ret: Dict[Type[Any], List[SourcePartition]] = {}
+
+    for k, v in modules.items():
+        ret[k] = [make_partition(partition, k) for partition in v.values()]
+
+    return ret
+
+
+def _partitions_sequential(partitions: Tuple[SourcePartition]) -> bool:
+    prev_partition = None
+    for partition in partitions:
+        if prev_partition is not None and not check_subgraphs_connected(
+            prev_partition, partition
+        ):
+            return False
+        prev_partition = partition
+    return True
+
+
+def find_sequential_partitions_aten(
+    gm: torch.fx.GraphModule,
+    partition_types: List[Any],
+) -> List[SourcePartition]:
+    typed_partitions: OrderedDict[Any, List[SourcePartition]] = OrderedDict()
+    for partition_type in partition_types:
+        partitions = get_aten_node_target_partitions(gm.graph, [partition_type])
+        typed_partitions[partition_type] = list(
+            itertools.chain.from_iterable(partitions.values())
+        )
+
+    typed_partitions_list = list(typed_partitions.values())
+    fusion_candidates = itertools.product(*typed_partitions_list)
+    fused_partitions = []
+    for candidate in fusion_candidates:
+        if _partitions_sequential(candidate):
+            fused_partitions.append(candidate)
+    return fused_partitions
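A short usage sketch for the new helper; the conv/relu op pair is an assumed example (in this diff the call sites pass pattern.partition_types() instead):

import torch
from executorch.backends.cadence.aot.quantizer.utils import (
    find_sequential_partitions_aten,
)

def find_conv_relu_chains(gm: torch.fx.GraphModule):
    # Returns a list of tuples of SourcePartitions, one partition per
    # requested aten op, where consecutive partitions are connected.
    # Note: nodes still need source_fn_stack metadata to be grouped
    # (see the TODO in get_aten_node_target_partitions above).
    return find_sequential_partitions_aten(
        gm,
        [torch.ops.aten.convolution.default, torch.ops.aten.relu.default],
    )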

backends/cadence/aot/utils.py

Lines changed: 3 additions & 1 deletion
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# pyre-strict
+
 import logging
 import operator
 from typing import Dict, List, Tuple
@@ -116,7 +118,7 @@ def get_ops_count(graph_module: torch.fx.GraphModule) -> Dict[str, int]:
 def print_ops_info(
     to_edge_gm: torch.fx.GraphModule,
     jarvis_gm: torch.fx.GraphModule,
-):
+) -> None:
     to_edge_ops_count = get_ops_count(to_edge_gm)
     jarvis_ops_count = get_ops_count(jarvis_gm)
