Skip to content

Use symmetric weights for convs and int8 in the default quantizer #8344

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 3 additions & 33 deletions backends/cadence/aot/export_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@

# Example script for exporting simple models to flatbuffer

# pyre-unsafe

import logging
import tempfile

import torch

from executorch.backends.cadence.aot.ops_registrations import * # noqa
from typing import Any, Tuple

Expand All @@ -23,38 +23,15 @@
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
from executorch.backends.cadence.runtime import runtime
from executorch.backends.cadence.runtime.executor import BundledProgramManager
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
QuantizationConfig,
QuantizationSpec,
)
from executorch.exir import ExecutorchProgramManager
from torch import nn
from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver

from .utils import save_bpte_program, save_pte_program


FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

act_qspec = QuantizationSpec(
dtype=torch.int8,
quant_min=-128,
quant_max=127,
qscheme=torch.per_tensor_affine,
is_dynamic=False,
observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
)

wgt_qspec = QuantizationSpec(
dtype=torch.int8,
quant_min=-128,
quant_max=127,
qscheme=torch.per_tensor_affine,
is_dynamic=False,
observer_or_fake_quant_ctr=MinMaxObserver,
)


def export_model(
model: nn.Module,
Expand All @@ -66,15 +43,8 @@ def export_model(
working_dir = tempfile.mkdtemp(dir="/tmp")
logging.debug(f"Created work directory {working_dir}")

qconfig = QuantizationConfig(
act_qspec,
act_qspec,
wgt_qspec,
None,
)

# Instantiate the quantizer
quantizer = CadenceDefaultQuantizer(qconfig)
quantizer = CadenceDefaultQuantizer()

# Convert the model
converted_model = convert_pt2(model, example_inputs, quantizer)
Expand Down
2 changes: 1 addition & 1 deletion backends/cadence/aot/ops_registrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,7 @@ def quantized_relu_per_tensor_meta(
out_multiplier: int,
out_shift: int,
) -> torch.Tensor:
return input.new_empty(input.size(), dtype=torch.uint8)
return input.new_empty(input.size(), dtype=input.dtype)


@register_fake("cadence::fully_connected")
Expand Down
69 changes: 41 additions & 28 deletions backends/cadence/aot/quantizer/quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,30 +40,46 @@
from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer


act_qspec = QuantizationSpec(
dtype=torch.uint8,
quant_min=0,
quant_max=255,
act_qspec_asym8u = QuantizationSpec(
dtype=torch.int8,
quant_min=-128,
quant_max=127,
qscheme=torch.per_tensor_affine,
is_dynamic=False,
observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
)

wgt_qspec = QuantizationSpec(
dtype=torch.uint8,
quant_min=0,
quant_max=255,
wgt_qspec_asym8u = QuantizationSpec(
dtype=torch.int8,
quant_min=-128,
quant_max=127,
qscheme=torch.per_tensor_affine,
is_dynamic=False,
observer_or_fake_quant_ctr=MinMaxObserver,
)

wgt_qspec_asym8s = QuantizationSpec(
dtype=torch.int8,
quant_min=-128,
quant_max=127,
qscheme=torch.per_tensor_symmetric,
is_dynamic=False,
observer_or_fake_quant_ctr=MinMaxObserver,
)

bias_qspec: Optional[QuantizationSpec] = None

_default_qconfig = QuantizationConfig(
act_qspec,
act_qspec,
wgt_qspec,
qconfig_A8uW8u = QuantizationConfig(
act_qspec_asym8u,
act_qspec_asym8u,
wgt_qspec_asym8u,
None,
)

qconfig_A8uW8s = QuantizationConfig(
act_qspec_asym8u,
act_qspec_asym8u,
wgt_qspec_asym8s,
None,
)

Expand Down Expand Up @@ -147,19 +163,17 @@ def get_supported_operators(cls) -> List[OperatorConfig]:
return []


def get_cadence_default_quantizer_list_with_config(
quantization_config: QuantizationConfig,
) -> List[Quantizer]:
def get_cadence_default_quantizers() -> List[Quantizer]:
return [
CadenceAtenQuantizer(AddmmPattern(), quantization_config),
CadenceAtenQuantizer(BmmPattern(), quantization_config),
CadenceAtenQuantizer(Conv1dPattern(), quantization_config),
CadenceAtenQuantizer(Conv2dPattern(), quantization_config),
CadenceAtenQuantizer(LayerNormPattern(), quantization_config),
CadenceAtenQuantizer(LinearPattern(), quantization_config),
CadenceAtenQuantizer(MatmulPattern(), quantization_config),
CadenceAtenQuantizer(ReluPattern0(), quantization_config),
CadenceAtenQuantizer(ReluPattern1(), quantization_config),
CadenceAtenQuantizer(AddmmPattern(), qconfig_A8uW8u),
CadenceAtenQuantizer(BmmPattern(), qconfig_A8uW8u),
CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s),
CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s),
CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8uW8u),
CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u),
CadenceAtenQuantizer(MatmulPattern(), qconfig_A8uW8u),
CadenceAtenQuantizer(ReluPattern0(), qconfig_A8uW8u),
CadenceAtenQuantizer(ReluPattern1(), qconfig_A8uW8u),
]


Expand All @@ -178,10 +192,9 @@ class CadenceDefaultQuantizer(CadenceQuantizer):
Default quantizer for Cadence backend.
"""

def __init__(self, qconfig: Optional[QuantizationConfig] = None) -> None:
if qconfig is None:
qconfig = _default_qconfig
quantizers = get_cadence_default_quantizer_list_with_config(qconfig)
def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
if quantizers is None:
quantizers = get_cadence_default_quantizers()
super().__init__(quantizers)


Expand Down
35 changes: 3 additions & 32 deletions backends/cadence/hifi/operators/op_quantized_relu_out.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,33 +18,6 @@ namespace impl {
namespace HiFi {
namespace native {

template <typename T>
void quantized_relu_(
const Tensor& input,
const Tensor& in_zero_point,
const int64_t out_zero_point,
const Tensor& out_multiplier,
const Tensor& out_shift,
Tensor& output) {
T q_zero_point = in_zero_point.const_data_ptr<T>()[0];
const T* __restrict__ in = input.const_data_ptr<T>();
T* __restrict__ out = output.mutable_data_ptr<T>();

const int32_t* __restrict__ out_multiplier_data =
out_multiplier.const_data_ptr<int32_t>();
const int32_t* __restrict__ out_shift_data =
out_shift.const_data_ptr<int32_t>();

// Compute the out_scale from out_multiplier and out_shift
const float out_scale =
-out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]);

for (size_t i = 0, e = input.numel(); i < e; ++i) {
float temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0;
out[i] = kernels::quantize<T>(temp, out_scale, (int32_t)out_zero_point);
}
}

void quantized_relu_per_tensor_out(
KernelRuntimeContext& ctx,
const Tensor& input,
Expand All @@ -68,7 +41,7 @@ void quantized_relu_per_tensor_out(
_out_multiplier,
_out_shift,
_out_zero_point,
_out_zero_point,
0,
255,
input.numel());

Expand All @@ -85,7 +58,7 @@ void quantized_relu_per_tensor_out(
_out_multiplier,
_out_shift,
_out_zero_point,
_out_zero_point,
-128,
127,
input.numel());

Expand All @@ -107,9 +80,7 @@ void quantized_relu_per_tensor_out(
const Tensor& out_multiplier,
const Tensor& out_shift,
Tensor& output) {
const uint8_t* p_in = input.const_data_ptr<uint8_t>();
uint8_t* p_out = output.mutable_data_ptr<uint8_t>();
uint8_t _in_zero_point = in_zero_point.const_data_ptr<uint8_t>()[0];
int8_t _in_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
int32_t _out_multiplier = out_multiplier.const_data_ptr<int32_t>()[0];
int32_t _out_shift = out_shift.const_data_ptr<int32_t>()[0];

Expand Down
Loading