
Commit 78b7bf9

mcremon-meta authored and facebook-github-bot committed
Use symmetric weights for convs and int8 in the default quantizer
Summary: As titled. int8 should give better performance with the Cadence kernels, since uint8 is no longer being improved there. The upcoming (quantized) convolution kernel needs symmetric weights, so we make that change as well.

Differential Revision: D69405797
1 parent 8665a50 commit 78b7bf9

File tree

2 files changed: +41 additions, -61 deletions
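For context on what the switch to symmetric int8 weights means in PyTorch observer terms, here is a minimal sketch. It is not part of this commit, and it assumes a PyTorch version whose observers accept torch.int8 (as the specs in this diff do): with qscheme=torch.per_tensor_symmetric the zero point is pinned to 0, which is what a kernel expecting symmetric weights relies on.

# Minimal sketch, not part of this commit: symmetric vs. affine int8 weight
# quantization params as produced by MinMaxObserver. The random tensor is a
# purely illustrative stand-in for a conv weight.
import torch
from torch.ao.quantization.observer import MinMaxObserver

w = torch.randn(16, 8)

affine = MinMaxObserver(
    dtype=torch.int8, qscheme=torch.per_tensor_affine, quant_min=-128, quant_max=127
)
symmetric = MinMaxObserver(
    dtype=torch.int8, qscheme=torch.per_tensor_symmetric, quant_min=-128, quant_max=127
)
affine(w)      # record min/max statistics
symmetric(w)

print(affine.calculate_qparams())     # (scale, zero_point): zero_point generally != 0
print(symmetric.calculate_qparams())  # (scale, zero_point): zero_point is 0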

backends/cadence/aot/export_example.py

Lines changed: 1 addition & 33 deletions
@@ -9,8 +9,6 @@
 import logging
 import tempfile
 
-import torch
-
 from executorch.backends.cadence.aot.ops_registrations import *  # noqa
 from typing import Any, Tuple
 
@@ -23,38 +21,15 @@
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
 from executorch.backends.cadence.runtime import runtime
 from executorch.backends.cadence.runtime.executor import BundledProgramManager
-from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
-    QuantizationConfig,
-    QuantizationSpec,
-)
 from executorch.exir import ExecutorchProgramManager
 from torch import nn
-from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
 
 from .utils import save_bpte_program, save_pte_program
 
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 
-act_qspec = QuantizationSpec(
-    dtype=torch.int8,
-    quant_min=-128,
-    quant_max=127,
-    qscheme=torch.per_tensor_affine,
-    is_dynamic=False,
-    observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
-)
-
-wgt_qspec = QuantizationSpec(
-    dtype=torch.int8,
-    quant_min=-128,
-    quant_max=127,
-    qscheme=torch.per_tensor_affine,
-    is_dynamic=False,
-    observer_or_fake_quant_ctr=MinMaxObserver,
-)
-
 
 def export_model(
     model: nn.Module,
@@ -66,15 +41,8 @@ def export_model(
     working_dir = tempfile.mkdtemp(dir="/tmp")
     logging.debug(f"Created work directory {working_dir}")
 
-    qconfig = QuantizationConfig(
-        act_qspec,
-        act_qspec,
-        wgt_qspec,
-        None,
-    )
-
     # Instantiate the quantizer
-    quantizer = CadenceDefaultQuantizer(qconfig)
+    quantizer = CadenceDefaultQuantizer()
 
     # Convert the model
     converted_model = convert_pt2(model, example_inputs, quantizer)
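With the qspec/qconfig boilerplate gone, the export example relies entirely on the quantizer's defaults. A hedged sketch of the resulting call site (model and example_inputs stand in for whatever the caller already has in scope):

# Call-site sketch after this change: the default quantizer now carries the
# int8 / symmetric-conv-weight configuration internally.
quantizer = CadenceDefaultQuantizer()
converted_model = convert_pt2(model, example_inputs, quantizer)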

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 40 additions & 28 deletions
@@ -40,33 +40,48 @@
 from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
 
 
-act_qspec = QuantizationSpec(
-    dtype=torch.uint8,
-    quant_min=0,
-    quant_max=255,
+act_qspec_asym8u = QuantizationSpec(
+    dtype=torch.int8,
+    quant_min=-128,
+    quant_max=127,
     qscheme=torch.per_tensor_affine,
     is_dynamic=False,
     observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
 )
 
-wgt_qspec = QuantizationSpec(
-    dtype=torch.uint8,
-    quant_min=0,
-    quant_max=255,
+wgt_qspec_asym8u = QuantizationSpec(
+    dtype=torch.int8,
+    quant_min=-128,
+    quant_max=127,
     qscheme=torch.per_tensor_affine,
     is_dynamic=False,
     observer_or_fake_quant_ctr=MinMaxObserver,
 )
 
+wgt_qspec_asym8s = QuantizationSpec(
+    dtype=torch.int8,
+    quant_min=-128,
+    quant_max=127,
+    qscheme=torch.per_tensor_symmetric,
+    is_dynamic=False,
+    observer_or_fake_quant_ctr=MinMaxObserver,
+)
+
 bias_qspec: Optional[QuantizationSpec] = None
 
-_default_qconfig = QuantizationConfig(
-    act_qspec,
-    act_qspec,
-    wgt_qspec,
+qconfig_A8uW8u = QuantizationConfig(
+    act_qspec_asym8u,
+    act_qspec_asym8u,
+    wgt_qspec_asym8u,
     None,
 )
 
+qconfig_A8uW8s = QuantizationConfig(
+    act_qspec_asym8u,
+    act_qspec_asym8u,
+    wgt_qspec_asym8s,
+    None,
+)
 
 class CadenceAtenQuantizer(Quantizer):
     def __init__(
@@ -147,19 +162,17 @@ def get_supported_operators(cls) -> List[OperatorConfig]:
         return []
 
 
-def get_cadence_default_quantizer_list_with_config(
-    quantization_config: QuantizationConfig,
-) -> List[Quantizer]:
+def get_cadence_default_quantizers() -> List[Quantizer]:
     return [
-        CadenceAtenQuantizer(AddmmPattern(), quantization_config),
-        CadenceAtenQuantizer(BmmPattern(), quantization_config),
-        CadenceAtenQuantizer(Conv1dPattern(), quantization_config),
-        CadenceAtenQuantizer(Conv2dPattern(), quantization_config),
-        CadenceAtenQuantizer(LayerNormPattern(), quantization_config),
-        CadenceAtenQuantizer(LinearPattern(), quantization_config),
-        CadenceAtenQuantizer(MatmulPattern(), quantization_config),
-        CadenceAtenQuantizer(ReluPattern0(), quantization_config),
-        CadenceAtenQuantizer(ReluPattern1(), quantization_config),
+        CadenceAtenQuantizer(AddmmPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(BmmPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s),
+        CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s),
+        CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(MatmulPattern(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(ReluPattern0(), qconfig_A8uW8u),
+        CadenceAtenQuantizer(ReluPattern1(), qconfig_A8uW8u),
     ]
 
 
@@ -178,10 +191,9 @@ class CadenceDefaultQuantizer(CadenceQuantizer):
     Default quantizer for Cadence backend.
     """
 
-    def __init__(self, qconfig: Optional[QuantizationConfig] = None) -> None:
-        if qconfig is None:
-            qconfig = _default_qconfig
-        quantizers = get_cadence_default_quantizer_list_with_config(qconfig)
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = get_cadence_default_quantizers()
         super().__init__(quantizers)
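Since the constructor now takes an optional list of quantizers instead of a QuantizationConfig, callers that previously passed a custom qconfig would build the quantizer list themselves. A hedged sketch of that usage follows; the pattern import path is an assumption, while the class and qconfig names are the ones appearing in this diff.

# Sketch only: customizing the new API by supplying an explicit quantizer list.
# The patterns import path is assumed; qconfig_A8uW8s and the classes below are
# the names introduced or used in this diff.
from executorch.backends.cadence.aot.quantizer.patterns import (
    Conv1dPattern,
    Conv2dPattern,
    LinearPattern,
)
from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceAtenQuantizer,
    CadenceDefaultQuantizer,
    qconfig_A8uW8s,
)

# e.g. quantize only convs and linear layers, all with symmetric int8 weights
quantizers = [
    CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s),
    CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s),
    CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8s),
]
quantizer = CadenceDefaultQuantizer(quantizers=quantizers)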
