Skip to content

Add CoreML Quantize #5228

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 58 additions & 3 deletions backends/apple/coreml/compiler/coreml_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# CoreML backend for delegating an EdgeProgram to CoreML.

import json
import logging

import shutil
import uuid
Expand All @@ -14,6 +15,7 @@
from typing import Any, Dict, final, List, Optional, Tuple

import coremltools as ct
import coremltools.optimize as cto
import executorchcoreml

from executorch.exir.backend.backend_details import (
Expand All @@ -23,12 +25,16 @@
)
from executorch.exir.backend.compile_spec_schema import CompileSpec

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


class COMPILE_SPEC_KEYS(Enum):
    """Keys identifying each CompileSpec entry consumed by the CoreML backend."""

    COMPUTE_UNITS = "compute_units"
    MODEL_TYPE = "model_type"
    MIN_DEPLOYMENT_TARGET = "min_deployment_target"
    MODEL_COMPUTE_PRECISION = "model_compute_precision"
    # Post-conversion weight quantization settings; the value is a JSON-encoded
    # dict used to build a cto.coreml.OpLinearQuantizerConfig.
    OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"


class MODEL_PATHS(Enum):
Expand Down Expand Up @@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
compute_unit.name.lower().encode("utf-8"),
)

@staticmethod
def generate_op_linear_quantizer_config_compile_spec(
op_linear_quantizer_config: Dict,
) -> CompileSpec:
"""
Returns the compile spec representing the model post conversion quantization,
which is a dict that will construct cto.coreml.OpLinearQuantizerConfig
"""
str_representation = json.dumps(op_linear_quantizer_config)
byte_representation = str_representation.encode("utf-8")
return CompileSpec(
COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
byte_representation,
)

@staticmethod
def op_linear_quantizer_config_from_compile_specs(
compile_specs: List[CompileSpec],
) -> cto.coreml.OpLinearQuantizerConfig:
"""
Returns the model's post conversion quantization by parsing the list of compile specs.
"""
for compile_spec in compile_specs:
if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
config_dict_str = compile_spec.value.decode("utf-8")
config_dict = json.loads(config_dict_str)
config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
return config

return None

@staticmethod
def generate_compile_specs(
compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
minimum_deployment_target: ct.target = ct.target.iOS15,
compute_precision: ct.precision = ct.precision.FLOAT16,
model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
op_linear_quantizer_config: Optional[Dict] = None,
) -> List[CompileSpec]:
"""
Returns the list of compile specs that's used by CoreMLBackend to lower the module.
Expand All @@ -192,6 +230,12 @@ def generate_compile_specs(
CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
)
compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
if op_linear_quantizer_config is not None:
compile_specs.append(
CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
op_linear_quantizer_config
)
)

return compile_specs

Expand Down Expand Up @@ -368,18 +412,18 @@ def preprocess(
compile_specs,
)
)

model_compute_precision: ct.precision = (
CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
)

minimum_deployment_target: ct.target = (
CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
)

compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
compile_specs
)
op_linear_quantizer_config = (
CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
)

mlmodel = ct.convert(
model=edge_program,
Expand All @@ -392,4 +436,15 @@ def preprocess(
compute_units=compute_units,
)

if op_linear_quantizer_config is not None:
logger.warning(
"Core ML Backend op_linear_quantizer_config API is experimental"
)
config = cto.coreml.OptimizationConfig(
global_config=op_linear_quantizer_config,
# skip embedding
op_type_configs={"gather": None},
)
mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)

return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)
7 changes: 7 additions & 0 deletions examples/models/llama2/export_llama_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,12 @@ def build_args_parser() -> argparse.ArgumentParser:
action="store_true",
help="This option is only for coreml, and is only supported for MacOS15+/iOS18+",
)
parser.add_argument(
"--coreml-quantize",
default=None,
choices=["b4w"],
help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)",
)
parser.add_argument(
"--qnn",
action="store_true",
Expand Down Expand Up @@ -523,6 +529,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
args.use_kv_cache and args.coreml_enable_state,
args.embedding_quantize,
args.pt2e_quantize,
args.coreml_quantize,
)
partitioners.append(coreml_partitioner)
modelname = f"coreml_{modelname}"
Expand Down
18 changes: 16 additions & 2 deletions extension/llm/export/partitioner_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def get_coreml_partitioner(
enable_state: bool = False,
embedding_quantize: Optional[str] = None,
pt2e_quantize: Optional[str] = None,
coreml_quantize: Optional[str] = None,
):
try:
import coremltools as ct
Expand Down Expand Up @@ -87,16 +88,29 @@ def get_coreml_partitioner(
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
# In Core ML, 4-bit weight compression is introduced in iOS 18
if (
embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4
) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
(embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4)
or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
or coreml_quantize == "b4w"
):
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)

op_linear_quantizer_config = None
if coreml_quantize == "b4w":
op_linear_quantizer_config = {
"mode": "linear_symmetric",
"dtype": "int4",
"granularity": "per_block",
"block_size": 32,
"weight_threshold": 512,
}

compile_specs = CoreMLBackend.generate_compile_specs( # pyre-fixme[16]
minimum_deployment_target=minimum_deployment_target,
compute_precision=ct.precision(ct.precision.FLOAT16.value),
# using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
model_type=CoreMLBackend.MODEL_TYPE.MODEL, # pyre-fixme[16]
op_linear_quantizer_config=op_linear_quantizer_config,
)
return CoreMLPartitioner( # pyre-fixme[16]
compile_specs=compile_specs,
Expand Down
Loading