Skip to content

Commit 455cea4

Browse files
author
yifan_shen3
committed
add coreml quantize
1 parent 370f304 commit 455cea4

File tree

3 files changed

+82
-5
lines changed

3 files changed

+82
-5
lines changed

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# CoreML backend for delegating an EdgeProgram to CoreML.
44

55
import json
6+
import logging
67

78
import shutil
89
import uuid
@@ -14,6 +15,7 @@
1415
from typing import Any, Dict, final, List, Optional, Tuple
1516

1617
import coremltools as ct
18+
import coremltools.optimize as cto
1719
import executorchcoreml
1820

1921
from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
2325
)
2426
from executorch.exir.backend.compile_spec_schema import CompileSpec
2527

28+
logger = logging.getLogger(__name__)
29+
logger.setLevel(logging.WARNING)
30+
2631

2732
class COMPILE_SPEC_KEYS(Enum):
2833
COMPUTE_UNITS = "compute_units"
2934
MODEL_TYPE = "model_type"
3035
MIN_DEPLOYMENT_TARGET = "min_deployment_target"
3136
MODEL_COMPUTE_PRECISION = "model_compute_precision"
37+
OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"
3238

3339

3440
class MODEL_PATHS(Enum):
@@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
169175
compute_unit.name.lower().encode("utf-8"),
170176
)
171177

178+
@staticmethod
179+
def generate_op_linear_quantizer_config_compile_spec(
180+
op_linear_quantizer_config: Dict,
181+
) -> CompileSpec:
182+
"""
183+
Returns the compile spec representing the model post conversion quantization,
184+
which is a dict that will construct cto.coreml.OpLinearQuantizerConfig
185+
"""
186+
str_representation = json.dumps(op_linear_quantizer_config)
187+
byte_representation = str_representation.encode("utf-8")
188+
return CompileSpec(
189+
COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
190+
byte_representation,
191+
)
192+
193+
@staticmethod
194+
def op_linear_quantizer_config_from_compile_specs(
195+
compile_specs: List[CompileSpec],
196+
) -> cto.coreml.OpLinearQuantizerConfig:
197+
"""
198+
Returns the model's post conversion quantization by parsing the list of compile specs.
199+
"""
200+
for compile_spec in compile_specs:
201+
if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
202+
config_dict_str = compile_spec.value.decode("utf-8")
203+
config_dict = json.loads(config_dict_str)
204+
config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
205+
return config
206+
207+
return None
208+
172209
@staticmethod
173210
def generate_compile_specs(
174211
compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
175212
minimum_deployment_target: ct.target = ct.target.iOS15,
176213
compute_precision: ct.precision = ct.precision.FLOAT16,
177214
model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
215+
op_linear_quantizer_config: Optional[Dict] = None,
178216
) -> List[CompileSpec]:
179217
"""
180218
Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +230,12 @@ def generate_compile_specs(
192230
CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
193231
)
194232
compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
233+
if op_linear_quantizer_config is not None:
234+
compile_specs.append(
235+
CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
236+
op_linear_quantizer_config
237+
)
238+
)
195239

196240
return compile_specs
197241

@@ -368,18 +412,18 @@ def preprocess(
368412
compile_specs,
369413
)
370414
)
371-
372415
model_compute_precision: ct.precision = (
373416
CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
374417
)
375-
376418
minimum_deployment_target: ct.target = (
377419
CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
378420
)
379-
380421
compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
381422
compile_specs
382423
)
424+
op_linear_quantizer_config = (
425+
CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
426+
)
383427

384428
mlmodel = ct.convert(
385429
model=edge_program,
@@ -392,4 +436,15 @@ def preprocess(
392436
compute_units=compute_units,
393437
)
394438

439+
if op_linear_quantizer_config is not None:
440+
logger.warning(
441+
"Core ML Backend op_linear_quantizer_config API is experimental"
442+
)
443+
config = cto.coreml.OptimizationConfig(
444+
global_config=op_linear_quantizer_config,
445+
# skip embedding
446+
op_type_configs={"gather": None},
447+
)
448+
mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
449+
395450
return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)

examples/models/llama2/export_llama_lib.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,12 @@ def build_args_parser() -> argparse.ArgumentParser:
293293
action="store_true",
294294
help="This option is only for coreml, and is only supported for MacOS15+/iOS18+",
295295
)
296+
parser.add_argument(
297+
"--coreml-quantize",
298+
default=None,
299+
choices=["b4w"],
300+
help="This option is only for coreml: Use coreml quantization",
301+
)
296302
parser.add_argument(
297303
"--qnn",
298304
action="store_true",
@@ -539,6 +545,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
539545
args.use_kv_cache and args.coreml_enable_state,
540546
args.embedding_quantize,
541547
args.pt2e_quantize,
548+
args.coreml_quantize,
542549
)
543550
partitioners.append(coreml_partitioner)
544551
modelname = f"coreml_{modelname}"

extension/llm/export/partitioner_lib.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
import json
78
from typing import Optional
89

910

@@ -59,6 +60,7 @@ def get_coreml_partitioner(
5960
enable_state: bool = False,
6061
embedding_quantize: Optional[str] = None,
6162
pt2e_quantize: Optional[str] = None,
63+
coreml_quantize: Optional[str] = None,
6264
):
6365
try:
6466
import coremltools as ct
@@ -87,16 +89,29 @@ def get_coreml_partitioner(
8789
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
8890
# In Core ML, 4-bit weight compression is introduced in iOS 18
8991
if (
90-
embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4
91-
) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
92+
(embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4)
93+
or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
94+
or coreml_quantize == "b4w"
95+
):
9296
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)
9397

98+
op_linear_quantizer_config = None
99+
if coreml_quantize == "b4w":
100+
op_linear_quantizer_config = {
101+
"mode": "linear_symmetric",
102+
"dtype": "int4",
103+
"granularity": "per_block",
104+
"block_size": 32,
105+
"weight_threshold": 512,
106+
}
107+
94108
compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16]
95109
minimum_deployment_target=minimum_deployment_target,
96110
compute_precision=ct.precision(ct.precision.FLOAT16.value),
97111
# using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
98112
compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
99113
model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16]
114+
op_linear_quantizer_config=op_linear_quantizer_config,
100115
)
101116
return CoreMLPartitioner( # pyre-fixme[16]
102117
compile_specs=compile_specs,

0 commit comments

Comments (0)