Enable GPTQ in executorch

jerryzh168 · jerryzh168 · commit 9b6c568631c1 · 2024-03-14T15:19:48.000-07:00
Summary:
Previously we just added the code but didn't test it, this PR also tests gptq locally to make
sure we can produce a model using gptq from torchao repo

Currently blocked on xnnpack lowering

Test Plan:
python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w-gptq -X

Reviewers:

Subscribers:

Tasks:

Tags:
diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py
@@ -35,7 +35,6 @@
 
 from .quantize import (
     EmbeddingOnlyInt8QuantHandler,
-    Int8DynActInt4WeightGPTQQuantHandler,
     Int8DynActInt4WeightQuantHandler,
     WeightOnlyInt8QuantHandler,
 )
@@ -181,7 +180,7 @@ def quantize(
     groupsize: int = 128,
     # following arguments only used for GPTQ
     calibration_tasks: Optional[list] = None,
-    calibration_limit: int = 1000,
+    calibration_limit: int = 5,
     calibration_seq_length: int = 100,
     pad_calibration_inputs: bool = False,
     percdamp: float = 0.01,
@@ -204,7 +203,7 @@ def quantize(
         checkpoint_path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
 
     if calibration_tasks is None:
-        calibration_tasks = ["hellaswag"]
+        calibration_tasks = ["wikitext"]
 
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
@@ -217,15 +216,14 @@ def quantize(
         print("quantized model:", model_int4)
         return model_int4
     elif qmode == "8da4w-gptq":
-        gptq_quant_handler = Int8DynActInt4WeightGPTQQuantHandler(
-            precision=torch_dtype,
-        )
+        from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
+
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
         assert tokenizer_path.is_file(), tokenizer_path
         tokenizer = SentencePieceProcessor(  # pyre-ignore[28]
             model_file=str(tokenizer_path)
         )
-        model_updated_state_dict = gptq_quant_handler.create_quantized_state_dict(
+        gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
             tokenizer,
             blocksize,
             percdamp,
@@ -235,8 +233,7 @@ def quantize(
             calibration_seq_length,
             pad_calibration_inputs,
         )
-        model = gptq_quant_handler.convert_for_runtime(model)
-        model.load_state_dict(model_updated_state_dict)
+        model = gptq_quantizer.quantize(model)
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py
@@ -211,7 +211,9 @@ def forward(
         bsz, seqlen, _ = x.shape
 
         # QKV
+        # TODO: re-enable
         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+        # xq, xk, xv = x, x, x
         # We need view_copy elimination
         xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
         xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
@@ -461,9 +463,9 @@ def forward(
             freqs_cos = self.freqs_cos[sp : sp + seqlen]
             freqs_sin = self.freqs_sin[sp : sp + seqlen]
         else:
-            assert (
-                start_pos is None and cache_k is None and cache_v is None
-            ), "Caches and start_pos are unused when use_kv_cache is False"
+            # assert (
+            #     start_pos is None and cache_k is None and cache_v is None
+            # ), "Caches and start_pos are unused when use_kv_cache is False"
             freqs_cos = self.freqs_cos[:seqlen]
             freqs_sin = self.freqs_sin[:seqlen]
 
diff --git a/examples/models/llama2/quantize.py b/examples/models/llama2/quantize.py