Skip to content

Commit ac94cd9

Browse files
author
yifan_shen3
committed
add 4-bit groupwise weight-only quantization for coreml
1 parent d3da92d commit ac94cd9

File tree

4 files changed

+17
-5
lines changed

4 files changed

+17
-5
lines changed

examples/models/llama2/export_llama_lib.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def build_args_parser() -> argparse.ArgumentParser:
144144
"--quantization_mode",
145145
type=str,
146146
default=None,
147-
choices=["int8", "8da4w", "8da4w-gptq"],
147+
choices=["int8", "8da4w", "8da4w-gptq", "coreml_g4w"],
148148
help="type of quantization",
149149
)
150150

@@ -487,7 +487,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
487487

488488
if args.coreml:
489489
coreml_partitioner = get_coreml_partitioner(
490-
args.use_kv_cache, args.pt2e_quantize
490+
args.use_kv_cache, args.pt2e_quantize, args.quantization_mode
491491
)
492492
partitioners.append(coreml_partitioner)
493493
modelname = f"coreml_{modelname}"

examples/models/llama2/install_requirements.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# Install snakeviz for cProfile flamegraph
99
# Install sentencepiece for llama tokenizer
1010
pip install snakeviz sentencepiece
11-
pip install torchao==0.1
11+
pip install torchao==0.4.0
1212

1313
# Install lm-eval for Model Evaluation with lm-evaluation-harness
1414
# Install tiktoken for tokenizer

examples/models/llama2/source_transformation/quantize.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,15 @@ def quantize(
130130
group_size,
131131
)
132132
model = gptq_quantizer.quantize(model, inputs)
133+
return model
134+
elif qmode == "coreml_g4w":
135+
from torchao.quantization.quant_api import Int4WeightOnlyQuantizer
136+
137+
quantizer = Int4WeightOnlyQuantizer(
138+
precision=torch.float32, groupsize=32, inner_k_tiles=2, device=torch.device("cpu")
139+
)
140+
model = quantizer.quantize(model)
141+
133142
return model
134143
else:
135144
raise Exception(f"Unrecognized quantize mode: {qmode}")

extension/llm/export/partitioner_lib.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def get_mps_partitioner(use_kv_cache: bool = False):
5656

5757

5858
def get_coreml_partitioner(
59-
use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None
59+
use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None, quantization_mode: Optional[str] = None
6060
):
6161
assert (
6262
use_kv_cache is True
@@ -82,7 +82,10 @@ def get_coreml_partitioner(
8282
if pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"):
8383
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
8484
# In Core ML, 4-bit weight compression is introduced in iOS 18
85-
if pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
85+
if (
86+
pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
87+
or quantization_mode == "coreml_g4w"
88+
):
8689
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)
8790
# In Core ML, stateful execution is introduced in iOS 18
8891
# TODO (https://github.com/pytorch/executorch/issues/4209)

0 commit comments

Comments
 (0)