
Commit 80ad94b

[quant] Use Int8DynActInt4WeightQuantizer in torchao
Summary: att

Test Plan:
python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w -X -d fp32

Reviewers:
Subscribers:
Tasks:
Tags:

ghstack-source-id: 2fc1355
Pull Request resolved: #2551
1 parent: f9cad4e


examples/models/llama2/export_llama_lib.py

Lines changed: 6 additions & 8 deletions
@@ -212,11 +212,9 @@ def quantize(
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
         return WeightOnlyInt8QuantHandler(model).quantized_model()
-    elif qmode == "int4":
-        model_int4 = Int8DynActInt4WeightQuantHandler(
-            model,
-            precision=torch_dtype,
-        ).quantized_model()
+    elif qmode == "8da4w":
+        from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+        model_int4 = Int8DynActInt4WeightQuantizer(precision=torch_dtype).quantize(model)
         print("quantized model:", model_int4)
         return model_int4
     elif qmode == "8da4w-gptq":
@@ -287,7 +285,7 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--quantization_mode",
         type=str,
         default=None,
-        choices=["int8", "int4", "8da4w-gptq"],
+        choices=["int8", "8da4w", "8da4w-gptq"],
         help="type of quantization",
     )

@@ -430,7 +428,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     if args.dtype_override is not None:
         dtype_override = DType[args.dtype_override]
     else:
-        dtype_override = DType["fp16"] if args.quantization_mode == "int4" else None
+        dtype_override = DType["fp16"] if args.quantization_mode in ["8da4w", "8da4w-gptq"] else None

     # source transforms
     transforms = []
@@ -500,7 +498,7 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     if args.xnnpack:
         # Following changes due to.
         # 1. We need dynamically quantized partitioner for both pt2e_quantize options
-        # as well as "qmode int4" which is also dynamic quantizes linear layers.
+        # as well as "qmode 8da4w" which also dynamically quantizes linear layers.
         # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops.
         partitioners[XnnpackDynamicallyQuantizedPartitioner.__name__] = (
             XnnpackDynamicallyQuantizedPartitioner()
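For reference, the new 8da4w path (int8 dynamic activations, int4 weights) can be exercised outside the export flow. A minimal sketch, assuming torchao is installed; the toy Linear module is a hypothetical stand-in for the Llama model, not part of this commit, and fp32 precision mirrors the `-d fp32` flag in the test plan:

    import torch
    from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer

    # Toy module standing in for the Llama model (hypothetical, for illustration).
    model = torch.nn.Sequential(torch.nn.Linear(256, 256))

    # Int8 dynamic activations + int4 weights, i.e. what `-qmode 8da4w` selects.
    quantizer = Int8DynActInt4WeightQuantizer(precision=torch.float32)
    model_int4 = quantizer.quantize(model)
    print("quantized model:", model_int4)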
