
Commit 3bc734f: add coreml quantize

Author: yifan_shen3
Parent: 370f304

File tree

3 files changed: +82 -5 lines changed


backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 57 additions & 3 deletions
@@ -3,6 +3,7 @@
 # CoreML backend for delegating a EdgeProgram to CoreML.
 
 import json
+import logging
 
 import shutil
 import uuid
@@ -14,6 +15,7 @@
 from typing import Any, Dict, final, List, Optional, Tuple
 
 import coremltools as ct
+import coremltools.optimize as cto
 import executorchcoreml
 
 from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
 
 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
     MODEL_TYPE = "model_type"
     MIN_DEPLOYMENT_TARGET = "min_deployment_target"
     MODEL_COMPUTE_PRECISION = "model_compute_precision"
+    OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"
 
 
 class MODEL_PATHS(Enum):
@@ -169,12 +175,39 @@ def generate_compute_unit_compile_spec(
             compute_unit.name.lower().encode("utf-8"),
         )
 
+    @staticmethod
+    def generate_op_linear_quantizer_config_compile_spec(
+        op_linear_quantizer_config: str,
+    ) -> CompileSpec:
+        """
+        Returns the compile spec representing the model's post-conversion quantization:
+        a JSON dict string from which cto.coreml.OpLinearQuantizerConfig is constructed.
+        """
+        return CompileSpec(
+            COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
+            op_linear_quantizer_config.encode("utf-8"),
+        )
+
+    @staticmethod
+    def op_linear_quantizer_config_from_compile_specs(
+        compile_specs: List[CompileSpec],
+    ) -> Optional[str]:
+        """
+        Returns the model's post-conversion quantization config by parsing the list of compile specs.
+        """
+        for compile_spec in compile_specs:
+            if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
+                return compile_spec.value.decode("utf-8")
+
+        return None
+
     @staticmethod
     def generate_compile_specs(
         compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
         minimum_deployment_target: ct.target = ct.target.iOS15,
         compute_precision: ct.precision = ct.precision.FLOAT16,
         model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
+        op_linear_quantizer_config: Optional[str] = None,
     ) -> List[CompileSpec]:
         """
         Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +225,12 @@ def generate_compile_specs(
             CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
         )
         compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
+        if op_linear_quantizer_config is not None:
+            compile_specs.append(
+                CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
+                    op_linear_quantizer_config
+                )
+            )
 
         return compile_specs
 
@@ -368,18 +407,18 @@ def preprocess(
                 compile_specs,
             )
         )
-
         model_compute_precision: ct.precision = (
             CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
         )
-
         minimum_deployment_target: ct.target = (
             CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
         )
-
         compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
             compile_specs
         )
+        op_linear_quantizer_config = (
+            CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
+        )
 
         mlmodel = ct.convert(
             model=edge_program,
@@ -392,4 +431,19 @@ def preprocess(
             compute_units=compute_units,
         )
 
+        if op_linear_quantizer_config is not None:
+            logger.warning(
+                "Core ML Backend op_linear_quantizer_config API is experimental"
+            )
+            config_dict = json.loads(op_linear_quantizer_config)
+            global_op_config = cto.coreml.OpLinearQuantizerConfig._from_dict(
+                config_dict
+            )
+            config = cto.coreml.OptimizationConfig(
+                global_config=global_op_config,
+                # skip embedding
+                op_type_configs={"gather": None},
+            )
+            mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
+
         return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)
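
Taken together, the new spec is just a JSON string that travels through the compile-spec list and is decoded again in preprocess(). A minimal round-trip sketch follows; the CoreMLBackend import path is an assumption inferred from this file's location, not shown in the diff:

```python
import json

# Assumed import path, matching backends/apple/coreml/compiler/.
from executorch.backends.apple.coreml.compiler import CoreMLBackend

# Any dict accepted by cto.coreml.OpLinearQuantizerConfig._from_dict works here.
config_str = json.dumps({"mode": "linear_symmetric", "dtype": "int4"})

specs = CoreMLBackend.generate_compile_specs(op_linear_quantizer_config=config_str)

# preprocess() recovers the same string from the spec list and only then
# parses it with json.loads before building the coremltools config.
assert CoreMLBackend.op_linear_quantizer_config_from_compile_specs(specs) == config_str
```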

examples/models/llama2/export_llama_lib.py

Lines changed: 7 additions & 0 deletions
@@ -293,6 +293,12 @@ def build_args_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="This option is only for coreml, and is only supported for MacOS15+/iOS18+",
     )
+    parser.add_argument(
+        "--coreml-quantize",
+        default=None,
+        choices=["b4w"],
+        help="This option is only for coreml: Use coreml quantization",
+    )
     parser.add_argument(
         "--qnn",
         action="store_true",
@@ -539,6 +545,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
             args.use_kv_cache and args.coreml_enable_state,
             args.embedding_quantize,
             args.pt2e_quantize,
+            args.coreml_quantize,
         )
         partitioners.append(coreml_partitioner)
         modelname = f"coreml_{modelname}"
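
On the CLI side this is a plain argparse choice that flows straight into get_coreml_partitioner. A self-contained sketch of the new argument (a standalone parser for illustration, not the repo's build_args_parser):

```python
import argparse

# Standalone replica of the new --coreml-quantize flag for illustration.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--coreml-quantize",
    default=None,
    choices=["b4w"],  # "b4w": blockwise 4-bit weight quantization
    help="This option is only for coreml: Use coreml quantization",
)

args = parser.parse_args(["--coreml-quantize", "b4w"])
assert args.coreml_quantize == "b4w"  # passed through to get_coreml_partitioner
```

Passing `--coreml-quantize b4w` alongside the Core ML backend flag selects the 4-bit block-wise config built in partitioner_lib.py below.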

extension/llm/export/partitioner_lib.py

Lines changed: 18 additions & 2 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import json
 from typing import Optional
 
 
@@ -59,6 +60,7 @@ def get_coreml_partitioner(
     enable_state: bool = False,
     embedding_quantize: Optional[str] = None,
     pt2e_quantize: Optional[str] = None,
+    coreml_quantize: Optional[str] = None,
 ):
     try:
         import coremltools as ct
@@ -87,16 +89,30 @@ def get_coreml_partitioner(
         minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
     # In Core ML, 4-bit weight compression is introduced in iOS 18
     if (
-        embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4
-    ) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
+        (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4)
+        or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
+        or coreml_quantize == "b4w"
+    ):
         minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)
 
+    op_linear_quantizer_config = None
+    if coreml_quantize == "b4w":
+        op_linear_quantizer_config_dict = {
+            "mode": "linear_symmetric",
+            "dtype": "int4",
+            "granularity": "per_block",
+            "block_size": 32,
+            "weight_threshold": 512,
+        }
+        op_linear_quantizer_config = json.dumps(op_linear_quantizer_config_dict)
+
     compile_specs = CoreMLBackend.generate_compile_specs(  # pyre-fixme[16]
         minimum_deployment_target=minimum_deployment_target,
         compute_precision=ct.precision(ct.precision.FLOAT16.value),
         # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
         compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
         model_type=CoreMLBackend.MODEL_TYPE.MODEL,  # pyre-fixme[16]
+        op_linear_quantizer_config=op_linear_quantizer_config,
     )
     return CoreMLPartitioner(  # pyre-fixme[16]
         compile_specs=compile_specs,
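
For reference, the "b4w" dict above ultimately builds the following coremltools objects inside preprocess(). This sketch uses the public OpLinearQuantizerConfig constructor rather than the private _from_dict called in the backend, and assumes a coremltools version with int4/iOS 18 support:

```python
import coremltools.optimize as cto

# Public-constructor equivalent of the "b4w" JSON config.
op_config = cto.coreml.OpLinearQuantizerConfig(
    mode="linear_symmetric",  # symmetric quantization, no zero point
    dtype="int4",             # 4-bit weights, hence the iOS18 minimum target
    granularity="per_block",  # one scale per block of weights
    block_size=32,            # 32 weights along the input axis share a scale
    weight_threshold=512,     # ops with few weight elements stay in float
)
config = cto.coreml.OptimizationConfig(
    global_config=op_config,
    op_type_configs={"gather": None},  # skip embedding, as in preprocess()
)
# Applied post-conversion:
# mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
```

With per_block granularity and block_size 32, each run of 32 consecutive weights along the input axis shares one scale; a 4096x4096 linear weight, for example, would carry roughly 4096x128 scales rather than a single per-tensor scale.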
