
Commit 4da3c5d

yifan_shen3 authored and facebook-github-bot committed
Add CoreML Quantize (#5228)
Summary:

## Motivation

Short term: TorchAO int4 quantization yields a float zero point, but CoreML does not yet support that well. We will need CoreML's own int4 quantization for now.

Intermediate term: Until torch implements all CoreML-supported quantizations (e.g. palettization, sparsification, joint compression, ...), it will be useful to have a way to use and experiment with those CoreML quantizations.

## Solution

In CoreML preprocess, we add the CoreML quantization config as a compile spec.

Pull Request resolved: #5228

Reviewed By: kirklandsign

Differential Revision: D62468184

Pulled By: cccclai

fbshipit-source-id: 9f4987d19a01eaf5e2814c9ff8089324174644f8
1 parent 3171ede commit 4da3c5d
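
For illustration, a minimal sketch (not part of this commit) of how the new compile spec could be used when lowering a model. The CoreMLBackend/CoreMLPartitioner import paths are assumed from this repo's layout, and the config dict mirrors the b4w example in this PR:

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner

# Dict keys follow cto.coreml.OpLinearQuantizerConfig; the dict is serialized
# to JSON bytes inside generate_op_linear_quantizer_config_compile_spec.
compile_specs = CoreMLBackend.generate_compile_specs(
    op_linear_quantizer_config={
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
        "weight_threshold": 512,
    }
)
partitioner = CoreMLPartitioner(compile_specs=compile_specs)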

File tree

3 files changed: +81 -5 lines changed

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 58 additions & 3 deletions
@@ -3,6 +3,7 @@
 # CoreML backend for delegating a EdgeProgram to CoreML.

 import json
+import logging

 import shutil
 import uuid
@@ -14,6 +15,7 @@
 from typing import Any, Dict, final, List, Optional, Tuple

 import coremltools as ct
+import coremltools.optimize as cto
 import executorchcoreml

 from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec

+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+

 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
     MODEL_TYPE = "model_type"
     MIN_DEPLOYMENT_TARGET = "min_deployment_target"
     MODEL_COMPUTE_PRECISION = "model_compute_precision"
+    OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"


 class MODEL_PATHS(Enum):
@@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
             compute_unit.name.lower().encode("utf-8"),
         )

+    @staticmethod
+    def generate_op_linear_quantizer_config_compile_spec(
+        op_linear_quantizer_config: Dict,
+    ) -> CompileSpec:
+        """
+        Returns the compile spec representing the model post conversion quantization,
+        which is a dict that will construct cto.coreml.OpLinearQuantizerConfig
+        """
+        str_representation = json.dumps(op_linear_quantizer_config)
+        byte_representation = str_representation.encode("utf-8")
+        return CompileSpec(
+            COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
+            byte_representation,
+        )
+
+    @staticmethod
+    def op_linear_quantizer_config_from_compile_specs(
+        compile_specs: List[CompileSpec],
+    ) -> cto.coreml.OpLinearQuantizerConfig:
+        """
+        Returns the model's post conversion quantization by parsing the list of compile specs.
+        """
+        for compile_spec in compile_specs:
+            if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
+                config_dict_str = compile_spec.value.decode("utf-8")
+                config_dict = json.loads(config_dict_str)
+                config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
+                return config
+
+        return None
+
     @staticmethod
     def generate_compile_specs(
         compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
         minimum_deployment_target: ct.target = ct.target.iOS15,
         compute_precision: ct.precision = ct.precision.FLOAT16,
         model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
+        op_linear_quantizer_config: Optional[Dict] = None,
     ) -> List[CompileSpec]:
         """
         Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +230,12 @@ def generate_compile_specs(
             CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
         )
         compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
+        if op_linear_quantizer_config is not None:
+            compile_specs.append(
+                CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
+                    op_linear_quantizer_config
+                )
+            )

         return compile_specs

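As a hedged sketch, the two helpers added above round-trip the config through a compile spec (names are from this diff; the parser returns None when no such spec is present):

spec = CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
    {"mode": "linear_symmetric", "dtype": "int4", "granularity": "per_block", "block_size": 32}
)
config = CoreMLBackend.op_linear_quantizer_config_from_compile_specs([spec])
# config is a cto.coreml.OpLinearQuantizerConfig reconstructed from the JSON bytes
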
@@ -368,18 +412,18 @@ def preprocess(
                 compile_specs,
             )
         )
-
         model_compute_precision: ct.precision = (
             CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
         )
-
         minimum_deployment_target: ct.target = (
             CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
         )
-
         compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
             compile_specs
         )
+        op_linear_quantizer_config = (
+            CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
+        )

         mlmodel = ct.convert(
             model=edge_program,
@@ -392,4 +436,15 @@ def preprocess(
             compute_units=compute_units,
         )

+        if op_linear_quantizer_config is not None:
+            logger.warning(
+                "Core ML Backend op_linear_quantizer_config API is experimental"
+            )
+            config = cto.coreml.OptimizationConfig(
+                global_config=op_linear_quantizer_config,
+                # skip embedding
+                op_type_configs={"gather": None},
+            )
+            mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
+
         return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)
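
For reference, what the new preprocess step effectively runs, shown standalone with coremltools (a sketch assuming a coremltools version with int4 support and an already converted `mlmodel` targeting iOS18+; only the config values are taken from this PR):

import coremltools.optimize as cto

op_config = cto.coreml.OpLinearQuantizerConfig._from_dict(
    {
        "mode": "linear_symmetric",
        "dtype": "int4",
        "granularity": "per_block",
        "block_size": 32,
        "weight_threshold": 512,
    }
)
config = cto.coreml.OptimizationConfig(
    global_config=op_config,
    op_type_configs={"gather": None},  # skip embedding, as in the diff above
)
mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)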

examples/models/llama2/export_llama_lib.py

Lines changed: 7 additions & 0 deletions
@@ -304,6 +304,12 @@ def build_args_parser() -> argparse.ArgumentParser:
        action="store_true",
        help="This option is only for coreml, and is only supported for MacOS15+/iOS18+",
    )
+    parser.add_argument(
+        "--coreml-quantize",
+        default=None,
+        choices=["b4w"],
+        help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)",
+    )
    parser.add_argument(
        "--qnn",
        action="store_true",
@@ -523,6 +529,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
            args.use_kv_cache and args.coreml_enable_state,
            args.embedding_quantize,
            args.pt2e_quantize,
+            args.coreml_quantize,
        )
        partitioners.append(coreml_partitioner)
        modelname = f"coreml_{modelname}"
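
A self-contained sketch of the new flag's argparse behavior (parser setup abbreviated; only --coreml-quantize comes from this PR):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--coreml-quantize",
    default=None,
    choices=["b4w"],
    help="This option is only for coreml: Use coreml quantization, e.g. b4w (for blockwise 4 bit weight)",
)

# Passing the flag selects blockwise 4-bit weight quantization downstream.
args = parser.parse_args(["--coreml-quantize", "b4w"])
assert args.coreml_quantize == "b4w"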

extension/llm/export/partitioner_lib.py

Lines changed: 16 additions & 2 deletions
@@ -59,6 +59,7 @@ def get_coreml_partitioner(
    enable_state: bool = False,
    embedding_quantize: Optional[str] = None,
    pt2e_quantize: Optional[str] = None,
+    coreml_quantize: Optional[str] = None,
):
    try:
        import coremltools as ct
@@ -87,16 +88,29 @@ def get_coreml_partitioner(
        minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
    # In Core ML, 4-bit weight compression is introduced in iOS 18
    if (
-        embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4
-    ) or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
+        (embedding_quantize is not None and int(embedding_quantize.split(",")[0]) == 4)
+        or pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
+        or coreml_quantize == "b4w"
+    ):
        minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)

+    op_linear_quantizer_config = None
+    if coreml_quantize == "b4w":
+        op_linear_quantizer_config = {
+            "mode": "linear_symmetric",
+            "dtype": "int4",
+            "granularity": "per_block",
+            "block_size": 32,
+            "weight_threshold": 512,
+        }
+
    compile_specs = CoreMLBackend.generate_compile_specs(  # pyre-fixme[16]
        minimum_deployment_target=minimum_deployment_target,
        compute_precision=ct.precision(ct.precision.FLOAT16.value),
        # using `ComputeUnit.ALL` can increase the model load time, default to `ComputeUnit.CPU_AND_GPU`
        compute_unit=ct.ComputeUnit[ct.ComputeUnit.CPU_AND_GPU.name.upper()],
        model_type=CoreMLBackend.MODEL_TYPE.MODEL,  # pyre-fixme[16]
+        op_linear_quantizer_config=op_linear_quantizer_config,
    )
    return CoreMLPartitioner(  # pyre-fixme[16]
        compile_specs=compile_specs,
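
Putting it together, a hedged sketch of calling the updated helper directly (import path assumed from this repo's layout; keyword arguments are used since parameters before enable_state are elided in this hunk):

from extension.llm.export.partitioner_lib import get_coreml_partitioner

# coreml_quantize="b4w" expands to the int4 per-block config shown above and
# bumps the minimum deployment target to iOS18.
partitioner = get_coreml_partitioner(
    enable_state=False,
    embedding_quantize=None,
    pt2e_quantize=None,
    coreml_quantize="b4w",
)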
