
Commit 2ff6842

Enable GPTQ in executorch
Summary: Previously we just added the code but didn't test it; this PR also runs GPTQ locally to make sure we can produce a model using GPTQ from the torchao repo.

Test Plan: python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w-gptq

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 3507412 commit 2ff6842


3 files changed: +8 -605 lines


examples/models/llama2/export_llama_lib.py

Lines changed: 5 additions & 6 deletions
@@ -35,7 +35,6 @@
 from .quantize import (
     EmbeddingOnlyInt8QuantHandler,
-    Int8DynActInt4WeightGPTQQuantHandler,
     Int8DynActInt4WeightQuantHandler,
     WeightOnlyInt8QuantHandler,
 )
@@ -217,15 +216,15 @@ def quantize(
         print("quantized model:", model_int4)
         return model_int4
     elif qmode == "8da4w-gptq":
-        gptq_quant_handler = Int8DynActInt4WeightGPTQQuantHandler(
-            precision=torch_dtype,
+        from torchao.quantization.quant_api import (
+            Int8DynActInt4WeightGPTQQuantizer,
         )
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
         assert tokenizer_path.is_file(), tokenizer_path
         tokenizer = SentencePieceProcessor(  # pyre-ignore[28]
             model_file=str(tokenizer_path)
         )
-        model_updated_state_dict = gptq_quant_handler.create_quantized_state_dict(
+        gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
             tokenizer,
             blocksize,
             percdamp,
@@ -235,8 +234,7 @@ def quantize(
             calibration_seq_length,
             pad_calibration_inputs,
         )
-        model = gptq_quant_handler.convert_for_runtime(model)
-        model.load_state_dict(model_updated_state_dict)
+        model = gptq_quantizer.quantize(model)
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
@@ -442,6 +440,7 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     # export_to_edge
     pt2e_quant_params = _get_pt2e_quantization_params(args)
     quantizers = get_pt2e_quantizers(pt2e_quant_params, args)
+    print("quantizers:", quantizers)

     # to_backend
     partitioners = {}
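
For reference, a minimal standalone sketch of the one-step torchao GPTQ flow that the quantize() change above switches to. This is not code from the commit: the helper name and the collapsed_args stand-in for the constructor arguments hidden in the collapsed diff lines (between percdamp and calibration_seq_length) are assumptions for illustration.

# Hypothetical sketch of the 8da4w-gptq branch above, not part of this commit.
from sentencepiece import SentencePieceProcessor
from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer

def quantize_with_torchao_gptq(
    model,
    tokenizer_path,
    blocksize,
    percdamp,
    collapsed_args,            # stands in for the arguments not shown in the diff
    calibration_seq_length,
    pad_calibration_inputs,
):
    tokenizer = SentencePieceProcessor(model_file=str(tokenizer_path))
    quantizer = Int8DynActInt4WeightGPTQQuantizer(
        tokenizer,
        blocksize,
        percdamp,
        *collapsed_args,       # splice in the collapsed arguments in order
        calibration_seq_length,
        pad_calibration_inputs,
    )
    # A single quantize() call replaces the old three-step sequence of
    # create_quantized_state_dict / convert_for_runtime / load_state_dict.
    return quantizer.quantize(model)

The end-to-end path is exercised by the Test Plan command above (-qmode 8da4w-gptq).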

examples/models/llama2/llama_transformer.py

Lines changed: 3 additions & 3 deletions
@@ -461,9 +461,9 @@ def forward(
             freqs_cos = self.freqs_cos[sp : sp + seqlen]
             freqs_sin = self.freqs_sin[sp : sp + seqlen]
         else:
-            assert (
-                start_pos is None and cache_k is None and cache_v is None
-            ), "Caches and start_pos are unused when use_kv_cache is False"
+            # assert (
+            #     start_pos is None and cache_k is None and cache_v is None
+            # ), "Caches and start_pos are unused when use_kv_cache is False"
             freqs_cos = self.freqs_cos[:seqlen]
             freqs_sin = self.freqs_sin[:seqlen]
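
The practical effect of commenting out that assert is that the non-KV-cache path now ignores any cache/start_pos arguments instead of raising. Below is a small illustrative sketch of the two rotary-frequency slicing behaviors the branch above selects between; the function name, table shapes, and values are placeholders, not code from this file.

import torch

def slice_rope_freqs(freqs_cos, freqs_sin, seqlen, start_pos=None, use_kv_cache=False):
    # Illustration only: mirrors the branch above on precomputed rotary tables.
    if use_kv_cache:
        sp = start_pos  # continue from the cached position
        return freqs_cos[sp : sp + seqlen], freqs_sin[sp : sp + seqlen]
    # With the assert commented out, stray cache/start_pos arguments are ignored here.
    return freqs_cos[:seqlen], freqs_sin[:seqlen]

# Example with placeholder tables of shape (max_seq_len, head_dim // 2).
freqs_cos = torch.randn(2048, 64)
freqs_sin = torch.randn(2048, 64)
cos, sin = slice_rope_freqs(freqs_cos, freqs_sin, seqlen=16, start_pos=5, use_kv_cache=True)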
