Commit 5a5935e
Enable GPTQ in executorch
Summary: Previously we only added the code but didn't test it. This PR also tests GPTQ locally to make sure we can produce a model using GPTQ from the torchao repo. Currently blocked on XNNPACK lowering.

Test Plan: python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w-gptq -X

Reviewers:

Subscribers:

Tasks:

Tags:
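The change swaps the local `Int8DynActInt4WeightGPTQQuantHandler` for torchao's `Int8DynActInt4WeightGPTQQuantizer`, which folds calibration, quantized-state-dict creation, and module swapping into a single `quantize()` call. Below is a minimal sketch of the new path using the defaults visible in the diff; note the blocksize value and the argument order past `percdamp` are assumptions, since the diff elides the middle of the constructor call:

```python
# Sketch of the new 8da4w-gptq path in export_llama_lib.py (not verbatim code).
# The blocksize value and the argument order past `percdamp` are assumptions.
from sentencepiece import SentencePieceProcessor
from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer

model = ...  # eager Llama nn.Module, e.g. loaded from stories110M.pt
tokenizer = SentencePieceProcessor(model_file="tokenizer.model")

gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
    tokenizer,
    128,           # blocksize (assumed)
    0.01,          # percdamp, default in quantize()
    128,           # groupsize, default in quantize()
    ["wikitext"],  # calibration_tasks, new default in this commit
    5,             # calibration_limit, lowered from 1000 in this commit
    100,           # calibration_seq_length
    False,         # pad_calibration_inputs
)
model = gptq_quantizer.quantize(model)  # returns the GPTQ-quantized module
```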
1 parent: 3507412

File tree

2 files changed: +13 −517 lines

examples/models/llama2/export_llama_lib.py

Lines changed: 6 additions & 9 deletions
```diff
@@ -35,7 +35,6 @@
 
 from .quantize import (
     EmbeddingOnlyInt8QuantHandler,
-    Int8DynActInt4WeightGPTQQuantHandler,
     Int8DynActInt4WeightQuantHandler,
     WeightOnlyInt8QuantHandler,
 )
@@ -181,7 +180,7 @@ def quantize(
     groupsize: int = 128,
     # following arguments only used for GPTQ
     calibration_tasks: Optional[list] = None,
-    calibration_limit: int = 1000,
+    calibration_limit: int = 5,
     calibration_seq_length: int = 100,
     pad_calibration_inputs: bool = False,
     percdamp: float = 0.01,
@@ -204,7 +203,7 @@ def quantize(
         checkpoint_path = Path("checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
 
     if calibration_tasks is None:
-        calibration_tasks = ["hellaswag"]
+        calibration_tasks = ["wikitext"]
 
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
@@ -217,15 +216,14 @@ def quantize(
         print("quantized model:", model_int4)
         return model_int4
     elif qmode == "8da4w-gptq":
-        gptq_quant_handler = Int8DynActInt4WeightGPTQQuantHandler(
-            precision=torch_dtype,
-        )
+        from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
+
         tokenizer_path = checkpoint_path.parent / "tokenizer.model"
         assert tokenizer_path.is_file(), tokenizer_path
         tokenizer = SentencePieceProcessor(  # pyre-ignore[28]
             model_file=str(tokenizer_path)
         )
-        model_updated_state_dict = gptq_quant_handler.create_quantized_state_dict(
+        gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
             tokenizer,
             blocksize,
             percdamp,
@@ -235,8 +233,7 @@ def quantize(
             calibration_seq_length,
             pad_calibration_inputs,
         )
-        model = gptq_quant_handler.convert_for_runtime(model)
-        model.load_state_dict(model_updated_state_dict)
+        model = gptq_quantizer.quantize(model)
         return model
     else:
         raise Exception(f"Unrecognized quantize mode: {qmode}")
```
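Net effect at the call site: the removed local-handler flow needed three steps (compute a quantized state dict, swap modules for runtime, reload the weights), while the torchao quantizer does all of that inside one call. A before/after comparison reconstructed from the hunks above, using the diff's own variable names and eliding the constructor arguments as the diff does:

```python
# Before this commit: local handler, three steps.
gptq_quant_handler = Int8DynActInt4WeightGPTQQuantHandler(precision=torch_dtype)
model_updated_state_dict = gptq_quant_handler.create_quantized_state_dict(
    tokenizer, blocksize, percdamp, ...,  # calibration args as in the diff
)
model = gptq_quant_handler.convert_for_runtime(model)
model.load_state_dict(model_updated_state_dict)

# After this commit: torchao quantizer, one step.
gptq_quantizer = Int8DynActInt4WeightGPTQQuantizer(
    tokenizer, blocksize, percdamp, ...,  # same calibration args
)
model = gptq_quantizer.quantize(model)
```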
