Skip to content

Commit ac94cd9

Browse files
author
yifan_shen3
committed
add 4-bit groupwise weight-only quantization for coreml
1 parent d3da92d commit ac94cd9

File tree

4 files changed

+17
-5
lines changed

4 files changed

+17
-5
lines changed

examples/models/llama2/export_llama_lib.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def build_args_parser() -> argparse.ArgumentParser:
144144
"--quantization_mode",
145145
type=str,
146146
default=None,
147-
choices=["int8", "8da4w", "8da4w-gptq"],
147+
choices=["int8", "8da4w", "8da4w-gptq", "coreml_g4w"],
148148
help="type of quantization",
149149
)
150150

@@ -487,7 +487,7 @@ def _export_llama(modelname, args) -> LLMEdgeManager: # noqa: C901
487487

488488
if args.coreml:
489489
coreml_partitioner = get_coreml_partitioner(
490-
args.use_kv_cache, args.pt2e_quantize
490+
args.use_kv_cache, args.pt2e_quantize, args.quantization_mode
491491
)
492492
partitioners.append(coreml_partitioner)
493493
modelname = f"coreml_{modelname}"

examples/models/llama2/install_requirements.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# Install snakeviz for cProfile flamegraph
99
# Install sentencepiece for llama tokenizer
1010
pip install snakeviz sentencepiece
11-
pip install torchao==0.1
11+
pip install torchao==0.4.0
1212

1313
# Install lm-eval for Model Evaluation with lm-evaluation-harness
1414
# Install tiktoken for tokenizer

examples/models/llama2/source_transformation/quantize.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,15 @@ def quantize(
130130
group_size,
131131
)
132132
model = gptq_quantizer.quantize(model, inputs)
133+
return model
134+
elif qmode == "coreml_g4w":
135+
from torchao.quantization.quant_api import Int4WeightOnlyQuantizer
136+
137+
quantizer = Int4WeightOnlyQuantizer(
138+
precision=torch.float32, groupsize=32, inner_k_tiles=2, device=torch.device("cpu")
139+
)
140+
model = quantizer.quantize(model)
141+
133142
return model
134143
else:
135144
raise Exception(f"Unrecognized quantize mode: {qmode}")

extension/llm/export/partitioner_lib.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def get_mps_partitioner(use_kv_cache: bool = False):
5656

5757

5858
def get_coreml_partitioner(
59-
use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None
59+
use_kv_cache: bool = False, pt2e_quantize: Optional[str] = None, quantization_mode: Optional[str] = None
6060
):
6161
assert (
6262
use_kv_cache is True
@@ -82,7 +82,10 @@ def get_coreml_partitioner(
8282
if pt2e_quantize in ("coreml_8a_c8w", "coreml_baseline_8a_c8w"):
8383
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS17)
8484
# In Core ML, 4-bit weight compression is introduced in iOS 18
85-
if pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w"):
85+
if (
86+
pt2e_quantize in ("coreml_c4w", "coreml_8a_c4w", "coreml_baseline_8a_c4w")
87+
or quantization_mode == "coreml_g4w"
88+
):
8689
minimum_deployment_target = max(minimum_deployment_target, ct.target.iOS18)
8790
# In Core ML, stateful execution is introduced in iOS 18
8891
# TODO (https://github.com/pytorch/executorch/issues/4209)

0 commit comments

Comments
 (0)