
Commit a6e51cf: Enable GPTQ in executorch
Summary: Previously we only added the GPTQ code path but didn't test it. This PR also tests GPTQ locally to make sure we can produce a model using GPTQ from the torchao repo. Currently blocked on XNNPACK lowering.

Test Plan: python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w-gptq -X

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 39c93aa commit a6e51cf

2 files changed: 8 additions, 11 deletions

examples/models/llama2/export_llama_lib.py (6 additions, 9 deletions)
@@ -37,7 +37,6 @@
 
 from .quantize import (
     EmbeddingOnlyInt8QuantHandler,
-    Int8DynActInt4WeightGPTQQuantHandler,
     Int8DynActInt4WeightQuantHandler,
     WeightOnlyInt8QuantHandler,
 )
@@ -183,7 +182,7 @@ def quantize(
     groupsize: int = 128,
     # following arguments only used for GPTQ
     calibration_tasks: Optional[list] = None,
-    calibration_limit: int = 1000,
+    calibration_limit: int = 5,
     calibration_seq_length: int = 100,
     pad_calibration_inputs: bool = False,
     percdamp: float = 0.01,
@@ -206,7 +205,7 @@ def quantize(
         checkpoint_path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
 
     if calibration_tasks is None:
-        calibration_tasks = ["hellaswag"]
+        calibration_tasks = ["wikitext"]
 
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
@@ -219,15 +218,14 @@ def quantize(
         print("quantized model:", model_int4)
         return model_int4
     elif qmode == "8da4w-gptq":
-        gptq_quant_handler = Int8DynActInt4WeightGPTQQuantHandler(
-            precision=torch_dtype,
-        )
+        from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
+
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
         assert tokenizer_path.is_file(), tokenizer_path
         tokenizer = SentencePieceProcessor(  # pyre-ignore[28]
             model_file=str(tokenizer_path)
         )
-        model_updated_state_dict = gptq_quant_handler.create_quantized_state_dict(
+        gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
             tokenizer,
             blocksize,
             percdamp,
@@ -237,8 +235,7 @@ def quantize(
             calibration_seq_length,
             pad_calibration_inputs,
         )
-        model = gptq_quant_handler.convert_for_runtime(model)
-        model.load_state_dict(model_updated_state_dict)
+        model = gptq_quantizer.quantize(model)
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")

examples/models/llama2/quantize.py (2 additions, 2 deletions)
@@ -13,6 +13,7 @@
 import torch.nn.functional as F
 from .ops.quantized_ops import *  # noqa
 
+# TODO: move to correct place
 from torchao.quantization.quant_primitives import (
     get_group_qparams_symmetric,
     group_quantize_tensor_symmetric,
@@ -652,7 +653,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
             self.scales,
             self.zeros,
             self.out_features,
-            self.groupsize,
+            self.group_size,
             self.precision,
         )
 
@@ -737,7 +738,6 @@ class GPTQQuantHandler(QuantHandler):
     """
 
     def __init__(self):
-        assert self.mod is not None
         assert self.get_qparams_func is not None
         assert self.quantize_func is not None
         assert self.dequantize_func is not None
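
The last hunk drops assert self.mod is not None from GPTQQuantHandler.__init__, presumably because the model is now handed to the quantizer's quantize() call rather than stored on the handler; the constructor keeps only the checks on the functional hooks a subclass has to supply. Below is a rough, illustrative skeleton of that contract, limited to the hooks visible in this hunk; the class name, hook roles, and comments are assumptions, not code from the repo.

# Illustrative only: shows what the remaining asserts in GPTQQuantHandler.__init__
# expect a subclass to provide. Assumes GPTQQuantHandler from
# examples/models/llama2/quantize.py is in scope.
class ExampleGPTQHandler(GPTQQuantHandler):  # hypothetical subclass
    def __init__(self):
        # Set the hooks before delegating to the base __init__, which asserts
        # that each of them is not None. The bodies here are placeholders.
        self.get_qparams_func = ...   # weight tensor -> per-group qparams (assumed role)
        self.quantize_func = ...      # (weight, qparams) -> quantized weight (assumed role)
        self.dequantize_func = ...    # inverse mapping used during GPTQ (assumed role)
        super().__init__()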
