Skip to content

Commit a9652e8

Browse files
authored
Merge branch 'main' into jz/export-llama-logging
2 parents 662b517 + 1308d4d commit a9652e8

File tree

18 files changed: 177 additions and 131 deletions (+177 / -131 lines changed)

.github/workflows/android-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -222,6 +222,7 @@ jobs:
222222
--preq_mode 8da4w_output_8da8w \
223223
--preq_group_size 32 \
224224
--max_seq_length 2048 \
225+
--max_context_length 2048 \
225226
--output_name "${OUT_ET_MODEL_NAME}.pte" \
226227
-kv \
227228
-d fp32 \
@@ -253,6 +254,7 @@ jobs:
253254
--xnnpack-extended-ops \
254255
-d fp32 \
255256
--max_seq_length 2048 \
257+
--max_context_length 2048 \
256258
--output_name "${OUT_ET_MODEL_NAME}.pte" \
257259
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
258260
ls -lh "${OUT_ET_MODEL_NAME}.pte"

.github/workflows/apple-perf.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -233,6 +233,7 @@ jobs:
233233
--preq_mode 8da4w_output_8da8w \
234234
--preq_group_size 32 \
235235
--max_seq_length 2048 \
236+
--max_context_length 2048 \
236237
--output_name "${OUT_ET_MODEL_NAME}.pte" \
237238
-kv \
238239
-d fp32 \
@@ -264,6 +265,7 @@ jobs:
264265
--xnnpack-extended-ops \
265266
-d fp32 \
266267
--max_seq_length 2048 \
268+
--max_context_length 2048 \
267269
--output_name "${OUT_ET_MODEL_NAME}.pte" \
268270
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
269271
ls -lh "${OUT_ET_MODEL_NAME}.pte"

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -596,7 +596,7 @@ endif()
596596
# any backends.
597597
#
598598
add_library(executorch ${_executorch__srcs})
599-
target_link_libraries(executorch PRIVATE executorch_core)
599+
target_link_libraries(executorch PUBLIC executorch_core)
600600
target_include_directories(executorch PUBLIC ${_common_include_directories})
601601
target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
602602
target_compile_options(executorch PUBLIC ${_common_compile_options})
@@ -750,7 +750,9 @@ if(EXECUTORCH_BUILD_PYBIND)
750750
endif()
751751

752752
# find pytorch lib, to allow pybind to take at::Tensor as input/output
753-
find_package(Torch CONFIG REQUIRED)
753+
if(NOT TARGET torch)
754+
find_package(Torch CONFIG REQUIRED)
755+
endif()
754756
find_library(
755757
TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib"
756758
)
Lines changed: 92 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,92 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
import unittest
7+
8+
import pytest
9+
10+
from executorch.backends.arm.test import common, conftest
11+
12+
from executorch.backends.arm.test.tester.arm_tester import ArmTester
13+
from executorch.examples.models import deeplab_v3
14+
15+
16+
class TestDl3(unittest.TestCase):
17+
"""Tests DeepLabv3."""
18+
19+
dl3 = deeplab_v3.DeepLabV3ResNet50Model()
20+
model_inputs = dl3.get_example_inputs()
21+
dl3 = dl3.get_eager_model()
22+
23+
@unittest.expectedFailure
24+
def test_dl3_tosa_MI(self):
25+
(
26+
ArmTester(
27+
self.dl3,
28+
example_inputs=self.model_inputs,
29+
compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
30+
)
31+
.export()
32+
.to_edge_transform_and_lower()
33+
.to_executorch()
34+
.run_method_and_compare_outputs(self.model_inputs)
35+
)
36+
37+
@unittest.expectedFailure
38+
def test_dl3_tosa_BI(self):
39+
(
40+
ArmTester(
41+
self.dl3,
42+
example_inputs=self.model_inputs,
43+
compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
44+
)
45+
.quantize()
46+
.export()
47+
.to_edge_transform_and_lower()
48+
.to_executorch()
49+
.run_method_and_compare_outputs(atol=1.0, qtol=1, inputs=self.model_inputs)
50+
)
51+
52+
@pytest.mark.slow
53+
@pytest.mark.corstone_fvp
54+
@unittest.skip
55+
def test_dl3_u55_BI(self):
56+
tester = (
57+
ArmTester(
58+
self.dl3,
59+
example_inputs=self.model_inputs,
60+
compile_spec=common.get_u55_compile_spec(),
61+
)
62+
.quantize()
63+
.export()
64+
.to_edge_transform_and_lower()
65+
.to_executorch()
66+
.serialize()
67+
)
68+
if conftest.is_option_enabled("corstone_fvp"):
69+
tester.run_method_and_compare_outputs(
70+
atol=1.0, qtol=1, inputs=self.model_inputs
71+
)
72+
73+
@pytest.mark.slow
74+
@pytest.mark.corstone_fvp
75+
@unittest.skip
76+
def test_dl3_u85_BI(self):
77+
tester = (
78+
ArmTester(
79+
self.dl3,
80+
example_inputs=self.model_inputs,
81+
compile_spec=common.get_u85_compile_spec(),
82+
)
83+
.quantize()
84+
.export()
85+
.to_edge_transform_and_lower()
86+
.to_executorch()
87+
.serialize()
88+
)
89+
if conftest.is_option_enabled("corstone_fvp"):
90+
tester.run_method_and_compare_outputs(
91+
atol=1.0, qtol=1, inputs=self.model_inputs
92+
)

backends/cadence/aot/export_example.py

Lines changed: 3 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -6,11 +6,11 @@
66

77
# Example script for exporting simple models to flatbuffer
88

9+
#pyre-unsafe
10+
911
import logging
1012
import tempfile
1113

12-
import torch
13-
1414
from executorch.backends.cadence.aot.ops_registrations import * # noqa
1515
from typing import Any, Tuple
1616

@@ -23,38 +23,15 @@
2323
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
2424
from executorch.backends.cadence.runtime import runtime
2525
from executorch.backends.cadence.runtime.executor import BundledProgramManager
26-
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
27-
QuantizationConfig,
28-
QuantizationSpec,
29-
)
3026
from executorch.exir import ExecutorchProgramManager
3127
from torch import nn
32-
from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
3328

3429
from .utils import save_bpte_program, save_pte_program
3530

3631

3732
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
3833
logging.basicConfig(level=logging.INFO, format=FORMAT)
3934

40-
act_qspec = QuantizationSpec(
41-
dtype=torch.int8,
42-
quant_min=-128,
43-
quant_max=127,
44-
qscheme=torch.per_tensor_affine,
45-
is_dynamic=False,
46-
observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
47-
)
48-
49-
wgt_qspec = QuantizationSpec(
50-
dtype=torch.int8,
51-
quant_min=-128,
52-
quant_max=127,
53-
qscheme=torch.per_tensor_affine,
54-
is_dynamic=False,
55-
observer_or_fake_quant_ctr=MinMaxObserver,
56-
)
57-
5835

5936
def export_model(
6037
model: nn.Module,
@@ -66,15 +43,8 @@ def export_model(
6643
working_dir = tempfile.mkdtemp(dir="/tmp")
6744
logging.debug(f"Created work directory {working_dir}")
6845

69-
qconfig = QuantizationConfig(
70-
act_qspec,
71-
act_qspec,
72-
wgt_qspec,
73-
None,
74-
)
75-
7646
# Instantiate the quantizer
77-
quantizer = CadenceDefaultQuantizer(qconfig)
47+
quantizer = CadenceDefaultQuantizer()
7848

7949
# Convert the model
8050
converted_model = convert_pt2(model, example_inputs, quantizer)

backends/cadence/aot/ops_registrations.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -576,7 +576,7 @@ def quantized_relu_per_tensor_meta(
576576
out_multiplier: int,
577577
out_shift: int,
578578
) -> torch.Tensor:
579-
return input.new_empty(input.size(), dtype=torch.uint8)
579+
return input.new_empty(input.size(), dtype=input.dtype)
580580

581581

582582
@register_fake("cadence::fully_connected")

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 41 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -40,30 +40,46 @@
4040
from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
4141

4242

43-
act_qspec = QuantizationSpec(
44-
dtype=torch.uint8,
45-
quant_min=0,
46-
quant_max=255,
43+
act_qspec_asym8u = QuantizationSpec(
44+
dtype=torch.int8,
45+
quant_min=-128,
46+
quant_max=127,
4747
qscheme=torch.per_tensor_affine,
4848
is_dynamic=False,
4949
observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
5050
)
5151

52-
wgt_qspec = QuantizationSpec(
53-
dtype=torch.uint8,
54-
quant_min=0,
55-
quant_max=255,
52+
wgt_qspec_asym8u = QuantizationSpec(
53+
dtype=torch.int8,
54+
quant_min=-128,
55+
quant_max=127,
5656
qscheme=torch.per_tensor_affine,
5757
is_dynamic=False,
5858
observer_or_fake_quant_ctr=MinMaxObserver,
5959
)
6060

61+
wgt_qspec_asym8s = QuantizationSpec(
62+
dtype=torch.int8,
63+
quant_min=-128,
64+
quant_max=127,
65+
qscheme=torch.per_tensor_symmetric,
66+
is_dynamic=False,
67+
observer_or_fake_quant_ctr=MinMaxObserver,
68+
)
69+
6170
bias_qspec: Optional[QuantizationSpec] = None
6271

63-
_default_qconfig = QuantizationConfig(
64-
act_qspec,
65-
act_qspec,
66-
wgt_qspec,
72+
qconfig_A8uW8u = QuantizationConfig(
73+
act_qspec_asym8u,
74+
act_qspec_asym8u,
75+
wgt_qspec_asym8u,
76+
None,
77+
)
78+
79+
qconfig_A8uW8s = QuantizationConfig(
80+
act_qspec_asym8u,
81+
act_qspec_asym8u,
82+
wgt_qspec_asym8s,
6783
None,
6884
)
6985

@@ -147,19 +163,17 @@ def get_supported_operators(cls) -> List[OperatorConfig]:
147163
return []
148164

149165

150-
def get_cadence_default_quantizer_list_with_config(
151-
quantization_config: QuantizationConfig,
152-
) -> List[Quantizer]:
166+
def get_cadence_default_quantizers() -> List[Quantizer]:
153167
return [
154-
CadenceAtenQuantizer(AddmmPattern(), quantization_config),
155-
CadenceAtenQuantizer(BmmPattern(), quantization_config),
156-
CadenceAtenQuantizer(Conv1dPattern(), quantization_config),
157-
CadenceAtenQuantizer(Conv2dPattern(), quantization_config),
158-
CadenceAtenQuantizer(LayerNormPattern(), quantization_config),
159-
CadenceAtenQuantizer(LinearPattern(), quantization_config),
160-
CadenceAtenQuantizer(MatmulPattern(), quantization_config),
161-
CadenceAtenQuantizer(ReluPattern0(), quantization_config),
162-
CadenceAtenQuantizer(ReluPattern1(), quantization_config),
168+
CadenceAtenQuantizer(AddmmPattern(), qconfig_A8uW8u),
169+
CadenceAtenQuantizer(BmmPattern(), qconfig_A8uW8u),
170+
CadenceAtenQuantizer(Conv1dPattern(), qconfig_A8uW8s),
171+
CadenceAtenQuantizer(Conv2dPattern(), qconfig_A8uW8s),
172+
CadenceAtenQuantizer(LayerNormPattern(), qconfig_A8uW8u),
173+
CadenceAtenQuantizer(LinearPattern(), qconfig_A8uW8u),
174+
CadenceAtenQuantizer(MatmulPattern(), qconfig_A8uW8u),
175+
CadenceAtenQuantizer(ReluPattern0(), qconfig_A8uW8u),
176+
CadenceAtenQuantizer(ReluPattern1(), qconfig_A8uW8u),
163177
]
164178

165179

@@ -178,10 +192,9 @@ class CadenceDefaultQuantizer(CadenceQuantizer):
178192
Default quantizer for Cadence backend.
179193
"""
180194

181-
def __init__(self, qconfig: Optional[QuantizationConfig] = None) -> None:
182-
if qconfig is None:
183-
qconfig = _default_qconfig
184-
quantizers = get_cadence_default_quantizer_list_with_config(qconfig)
195+
def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
196+
if quantizers is None:
197+
quantizers = get_cadence_default_quantizers()
185198
super().__init__(quantizers)
186199

187200

backends/cadence/hifi/operators/op_quantized_relu_out.cpp

Lines changed: 3 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -18,33 +18,6 @@ namespace impl {
1818
namespace HiFi {
1919
namespace native {
2020

21-
template <typename T>
22-
void quantized_relu_(
23-
const Tensor& input,
24-
const Tensor& in_zero_point,
25-
const int64_t out_zero_point,
26-
const Tensor& out_multiplier,
27-
const Tensor& out_shift,
28-
Tensor& output) {
29-
T q_zero_point = in_zero_point.const_data_ptr<T>()[0];
30-
const T* __restrict__ in = input.const_data_ptr<T>();
31-
T* __restrict__ out = output.mutable_data_ptr<T>();
32-
33-
const int32_t* __restrict__ out_multiplier_data =
34-
out_multiplier.const_data_ptr<int32_t>();
35-
const int32_t* __restrict__ out_shift_data =
36-
out_shift.const_data_ptr<int32_t>();
37-
38-
// Compute the out_scale from out_multiplier and out_shift
39-
const float out_scale =
40-
-out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]);
41-
42-
for (size_t i = 0, e = input.numel(); i < e; ++i) {
43-
float temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0;
44-
out[i] = kernels::quantize<T>(temp, out_scale, (int32_t)out_zero_point);
45-
}
46-
}
47-
4821
void quantized_relu_per_tensor_out(
4922
KernelRuntimeContext& ctx,
5023
const Tensor& input,
@@ -68,7 +41,7 @@ void quantized_relu_per_tensor_out(
6841
_out_multiplier,
6942
_out_shift,
7043
_out_zero_point,
71-
_out_zero_point,
44+
0,
7245
255,
7346
input.numel());
7447

@@ -85,7 +58,7 @@ void quantized_relu_per_tensor_out(
8558
_out_multiplier,
8659
_out_shift,
8760
_out_zero_point,
88-
_out_zero_point,
61+
-128,
8962
127,
9063
input.numel());
9164

@@ -107,9 +80,7 @@ void quantized_relu_per_tensor_out(
10780
const Tensor& out_multiplier,
10881
const Tensor& out_shift,
10982
Tensor& output) {
110-
const uint8_t* p_in = input.const_data_ptr<uint8_t>();
111-
uint8_t* p_out = output.mutable_data_ptr<uint8_t>();
112-
uint8_t _in_zero_point = in_zero_point.const_data_ptr<uint8_t>()[0];
83+
int8_t _in_zero_point = in_zero_point.const_data_ptr<int8_t>()[0];
11384
int32_t _out_multiplier = out_multiplier.const_data_ptr<int32_t>()[0];
11485
int32_t _out_shift = out_shift.const_data_ptr<int32_t>()[0];
11586

0 commit comments

Comments
 (0)