
Commit 4da3c5d

yifan_shen3 authored and facebook-github-bot committed
Add CoreML Quantize (#5228)
Summary:

## Motivation

Short term: TorchAO int4 quantization yields a float zero point, but CoreML does not yet support that well. We will need CoreML's own int4 quantization for now.

Intermediate term: Until torch implements all CoreML-supported quantizations (e.g. palettization, sparsification, joint compression, ...), it will be useful to have a way to use and experiment with those CoreML quantizations.

## Solution

In CoreML preprocess, we add the CoreML quantization config as a compile spec.

Pull Request resolved: #5228

Reviewed By: kirklandsign

Differential Revision: D62468184

Pulled By: cccclai

fbshipit-source-id: 9f4987d19a01eaf5e2814c9ff8089324174644f8
1 parent 3171ede commit 4da3c5d
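
For illustration, a minimal sketch (not part of this commit) of how the new compile spec could be used when lowering a model. The CoreMLBackend/CoreMLPartitioner import paths are assumed from this repo's layout, and the config dict mirrors the b4w example in this PR:

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner

# Dict keys follow cto.coreml.OpLinearQuantizerConfig; the dict is serialized
# to JSON bytes inside generate_op_linear_quantizer_config_compile_spec.
compile_specs = CoreMLBackend.generate_compile_specs(
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
        "weight_threshold": 512,
    }
)
partitioner = CoreMLPartitioner(compile_specs=compile_specs)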

File tree

3 files changed: +81 -5 lines changed

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 58 additions & 3 deletions
@@ -3,6 +3,7 @@
 # CoreML backend for delegating a EdgeProgram to CoreML.

 import json
+import logging

 import shutil
 import uuid
@@ -14,6 +15,7 @@
 from typing import Any, Dict, final, List, Optional, Tuple

 import coremltools as ct
+import coremltools.optimize as cto
 import executorchcoreml

 from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec

+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+

 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
     MODEL_TYPE = "model_type"
     MIN_DEPLOYMENT_TARGET = "min_deployment_target"
     MODEL_COMPUTE_PRECISION = "model_compute_precision"
+    OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"


 class MODEL_PATHS(Enum):
@@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
             compute_unit.name.lower().encode("utf-8"),
         )

+    @staticmethod
+    def generate_op_linear_quantizer_config_compile_spec(
+        op_linear_quantizer_config: Dict,
+    ) -> CompileSpec:
+        """
+        Returns the compile spec representing the model post conversion quantization,
+        which is a dict that will construct cto.coreml.OpLinearQuantizerConfig
+        """
+        str_representation = json.dumps(op_linear_quantizer_config)
+        byte_representation = str_representation.encode("utf-8")
+        return CompileSpec(
+            COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
+            byte_representation,
+        )
+
+    @staticmethod
+    def op_linear_quantizer_config_from_compile_specs(
+        compile_specs: List[CompileSpec],
+    ) -> cto.coreml.OpLinearQuantizerConfig:
+        """
+        Returns the model's post conversion quantization by parsing the list of compile specs.
+        """
+        for compile_spec in compile_specs:
+            if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
+                config_dict_str = compile_spec.value.decode("utf-8")
+                config_dict = json.loads(config_dict_str)
+                config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
+                return config
+
+        return None
+
     @staticmethod
     def generate_compile_specs(
         compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
         minimum_deployment_target: ct.target = ct.target.iOS15,
         compute_precision: ct.precision = ct.precision.FLOAT16,
         model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
+        op_linear_quantizer_config: Optional[Dict] = None,
     ) -> List[CompileSpec]:
         """
         Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +230,12 @@ def generate_compile_specs(
             CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
         )
         compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
+        if op_linear_quantizer_config is not None:
+            compile_specs.append(
+                CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
+                    op_linear_quantizer_config
+                )
+            )

         return compile_specs

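As a hedged sketch, the two helpers added above round-trip the config through a compile spec (names are from this diff; the parser returns None when no such spec is present):

spec = CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
    {"mode": "linear_symmetric", "dtype": "int4", "granularity": "per_block", "block_size": 32}
)
config = CoreMLBackend.op_linear_quantizer_config_from_compile_specs([spec])
# config is a cto.coreml.OpLinearQuantizerConfig reconstructed from the JSON bytes
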
@@ -368,18 +412,18 @@ def preprocess(
                 compile_specs,
             )
         )
-
         model_compute_precision: ct.precision = (
             CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
         )
-
         minimum_deployment_target: ct.target = (
             CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
         )
-
         compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
             compile_specs
         )
+        op_linear_quantizer_config = (
+            CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
+        )

         mlmodel = ct.convert(
             model=edge_program,
@@ -392,4 +436,15 @@ def preprocess(
             compute_units=compute_units,
         )

+        if op_linear_quantizer_config is not None:
+            logger.warning(
+                "Core ML Backend op_linear_quantizer_config API is experimental"
+            )
+            config = cto.coreml.OptimizationConfig(
+                global_config=op_linear_quantizer_config,
+                # skip embedding
+                op_type_configs={"gather": None},
+            )
+            mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
+
         return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)
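
For reference, what the new preprocess step effectively runs, shown standalone with coremltools (a sketch assuming a coremltools version with int4 support and an already converted `mlmodel` targeting iOS18+; only the config values are taken from this PR):

import coremltools.optimize as cto

op_config = cto.coreml.OpLinearQuantizerConfig._from_dict(
    {
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
        "weight_threshold": 512,
    }
)
config = cto.coreml.OptimizationConfig(
    global_config=op_config,
    op_type_configs={"gather": None},  # skip embedding, as in the diff above
)
mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)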

examples/models/llama2/export_llama_lib.py

Lines changed: 7 additions & 0 deletions
@@ -304,6 +304,12 @@ def build_args_parser() -> argparse.ArgumentParser:
        action="store_true",
        help="This option is only for coreml, and is only supported for MacOS15+/iOS18+",
    )
+    parser.add_argument(
+        "--coreml-quantize",
+        default=None,
+        choices=["b4w"],
+        help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)",
+    )
    parser.add_argument(
        "--qnn",
        action="store_true",
@@ -523,6 +529,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
            args.use_kv_cache and args.coreml_enable_state,
            args.embedding_quantize,
            args.pt2e_quantize,
+            args.coreml_quantize,
        )
        partitioners.append(coreml_partitioner)
        modelname = f"coreml_{modelname}"
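
A self-contained sketch of the new flag's argparse behavior (parser setup abbreviated; only --coreml-quantize comes from this PR):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--coreml-quantize",
    default=None,
    choices=["b4w"],
    help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)",
)

# Passing the flag selects blockwise 4-bit weight quantization downstream.
args = parser.parse_args(["--coreml-quantize", "b4w"])
assert args.coreml_quantize == "b4w"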

extension/llm/export/partitioner_lib.py

Lines changed: 16 additions & 2 deletions
@@ -59,6 +59,7 @@ def get_coreml_partitioner(
    enable_state: bool = False,
    embedding_quantize: Optional[str] = None,
    pt2e_quantize: Optional[str] = None,
+    coreml_quantize: Optional[str] = None,
):
    try:
        import coremltools as ct
@@ -87,16 +88,29 @@ def get_coreml_partitioner(
        minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
    # In Core ML, 4-bit weight compression is introduced in iOS 18
    if (
-        embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4
-    ) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
+        (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4)
+        or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
+        or coreml_quantize == "b4w"
+    ):
        minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)

+    op_linear_quantizer_config = None
+    if coreml_quantize == "b4w":
+        op_linear_quantizer_config = {
+            "mode": "linear_symmetric",
+            "dtype": "int4",
+            "granularity": "per_block",
+            "block_size": 32,
+            "weight_threshold": 512,
+        }
+
    compile_specs = CoreMLBackend.generate_compile_specs(  # pyre-fixme[16]
        minimum_deployment_target=minimum_deployment_target,
        compute_precision=ct.precision(ct.precision.FLOAT16.value),
        # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
        compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
        model_type=CoreMLBackend.MODEL_TYPE.MODEL,  # pyre-fixme[16]
+        op_linear_quantizer_config=op_linear_quantizer_config,
    )
    return CoreMLPartitioner(  # pyre-fixme[16]
        compile_specs=compile_specs,
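
Putting it together, a hedged sketch of calling the updated helper directly (import path assumed from this repo's layout; keyword arguments are used since parameters before enable_state are elided in this hunk):

from extension.llm.export.partitioner_lib import get_coreml_partitioner

# coreml_quantize="b4w" expands to the int4 per-block config shown above and
# bumps the minimum deployment target to iOS18.
partitioner = get_coreml_partitioner(
    enable_state=False,
    embedding_quantize=None,
    pt2e_quantize=None,
    coreml_quantize="b4w",
)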
