Commit 6bef9e7

andrewor14 authored and facebook-github-bot committed
Rename int4 to 8da4w in llama2 quantization (#2573)
Summary:
Pull Request resolved: #2573

"int4" has been confused with "int4 weight only" before, when in reality it is "int4 weights + int8 dynamic activations". Renaming it to "8da4w" will reduce confusion and make it more consistent with "8da4w-gptq".

#accept2land

Reviewed By: jerryzh168

Differential Revision: D55215146

fbshipit-source-id: 435c9b3e70e2546c8e0afc2df848546d7eb2d208
1 parent 60eb1bb commit 6bef9e7
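
For context on the naming: "8da4w" stands for 8-bit dynamically quantized activations plus 4-bit weights. A minimal illustrative sketch of that distinction in plain PyTorch follows; it is not the Int8DynActInt4WeightQuantHandler implementation, and the helper names, group size, and shapes are made up for illustration.

```python
import torch


def quantize_weight_int4_groupwise(w: torch.Tensor, group_size: int = 32):
    """Illustrative symmetric 4-bit per-group weight quantization."""
    out_features, in_features = w.shape
    wg = w.reshape(out_features, in_features // group_size, group_size)
    scale = wg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 7  # int4 range [-8, 7]
    q = torch.clamp(torch.round(wg / scale), -8, 7)
    return q, scale


def quantize_activation_int8_dynamic(x: torch.Tensor):
    """Illustrative per-token dynamic 8-bit activation quantization."""
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 127
    q = torch.clamp(torch.round(x / scale), -128, 127)
    return q, scale


w = torch.randn(16, 64)   # linear weight
x = torch.randn(2, 64)    # activations for two tokens

# "8da4w": quantize weights to int4 (per group) AND activations to int8 (per token).
wq, w_scale = quantize_weight_int4_groupwise(w)
xq, x_scale = quantize_activation_int8_dynamic(x)
w_deq = (wq * w_scale).reshape(16, 64)   # dequantize; stands in for a real int8/int4 kernel
y_8da4w = (xq * x_scale) @ w_deq.t()

# "int4 weight only" would skip the activation step and keep x in floating point.
y_weight_only = x @ w_deq.t()
```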

File tree

2 files changed: +12 -10 lines changed


examples/models/llama2/export_llama_lib.py

Lines changed: 10 additions & 8 deletions
@@ -222,7 +222,7 @@ def quantize(
     Quantizes a model by converting all weights to int8.
     Args:
         model: A model to quantize.
-        qmode: quantization mode, e.g. int8, int4
+        qmode: quantization mode, e.g. int8, 8da4w, 8da4w-gptq
     Returns:
         A quantized model.
     """
@@ -240,13 +240,13 @@ def quantize(
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
         return WeightOnlyInt8QuantHandler(model).quantized_model()
-    elif qmode == "int4":
-        model_int4 = Int8DynActInt4WeightQuantHandler(
+    elif qmode == "8da4w":
+        model = Int8DynActInt4WeightQuantHandler(
             model,
             precision=torch_dtype,
         ).quantized_model()
-        print("quantized model:", model_int4)
-        return model_int4
+        print("quantized model:", model)
+        return model
     elif qmode == "8da4w-gptq":
         from torchao.quantization.quant_api import Int8DynActInt4WeightGPTQQuantizer
 
@@ -315,7 +315,7 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--quantization_mode",
         type=str,
         default=None,
-        choices=["int8", "int4", "8da4w-gptq"],
+        choices=["int8", "8da4w", "8da4w-gptq"],
         help="type of quantization",
     )
 
@@ -472,8 +472,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     # dtype override
     if args.dtype_override is not None:
         dtype_override = DType[args.dtype_override]
+    elif args.quantization_mode in ["8da4w", "8da4w-gptq"]:
+        dtype_override = DType["fp16"]
     else:
-        dtype_override = DType["fp16"] if args.quantization_mode == "int4" else None
+        dtype_override = None
 
     # source transforms
     transforms = []
@@ -547,7 +549,7 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     if args.xnnpack:
         # Following changes due to.
         # 1. We need dynamically quantized partitioner for both pt2e_quantize options
-        #    as well as "qmode int4" which is also dynamic quantizes linear layers.
+        #    as well as "qmode 8da4w" which is also dynamic quantizes linear layers.
         # 2. XNNPACK partitioner seems to result in seg fault for non dqlinear ops.
         partitioners[XnnpackDynamicallyQuantizedPartitioner.__name__] = (
             XnnpackDynamicallyQuantizedPartitioner()
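
One side effect worth noting in the dtype-override hunk above: before this change only "int4" implied an fp16 default, while "8da4w-gptq" fell through to no override; after it, both "8da4w" and "8da4w-gptq" default to fp16 unless --dtype_override is set explicitly. A minimal sketch of the resulting precedence, with DType replaced by a stand-in enum since the real class is not shown here:

```python
from enum import Enum


class DType(Enum):
    """Stand-in for the real DType used by the export flow."""
    fp16 = "fp16"
    fp32 = "fp32"


def resolve_dtype_override(dtype_override_arg, quantization_mode):
    """Mirror the precedence after this commit: an explicit --dtype_override
    wins, otherwise the 8-bit-dynamic-activation / 4-bit-weight modes default
    to fp16, otherwise there is no override."""
    if dtype_override_arg is not None:
        return DType[dtype_override_arg]
    elif quantization_mode in ["8da4w", "8da4w-gptq"]:
        return DType["fp16"]
    return None


assert resolve_dtype_override(None, "8da4w") is DType.fp16
assert resolve_dtype_override(None, "8da4w-gptq") is DType.fp16  # previously no override
assert resolve_dtype_override("fp32", "8da4w") is DType.fp32     # explicit flag wins
assert resolve_dtype_override(None, "int8") is None
```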

examples/models/llama2/model.py

Lines changed: 2 additions & 2 deletions
@@ -144,8 +144,8 @@ def __init__(self, **kwargs):
 
             simple_quantizer = WeightOnlyInt8QuantHandler(self.model_)
             self.model_ = simple_quantizer.convert_for_runtime()
-        elif "int4" in str(checkpoint_path):
-            print("Using int4 weight-only quantization!")
+        elif "8da4w" in str(checkpoint_path):
+            print("Using int4 weight and int8 dynamic activation quantization!")
             from .quantize import Int8DynActInt4WeightQuantHandler
 
             simple_quantizer = Int8DynActInt4WeightQuantHandler(self.model_)
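
Note that model.py selects the quantization handler from the checkpoint filename, so the renamed branch only fires when the exported checkpoint path contains "8da4w". A small sketch of that dispatch, where select_quant_handler and the example path are invented for illustration and the int8 condition is assumed to mirror the weight-only branch shown as context above:

```python
from pathlib import Path


def select_quant_handler(checkpoint_path: Path) -> str:
    """Pick the handler name from the checkpoint filename, mirroring the
    substring checks in model.py (the int8 condition is an assumption)."""
    if "int8" in str(checkpoint_path):
        return "WeightOnlyInt8QuantHandler"
    elif "8da4w" in str(checkpoint_path):
        return "Int8DynActInt4WeightQuantHandler"
    return "no quantization handler"


# A checkpoint exported with "--quantization_mode 8da4w" would need a matching name:
print(select_quant_handler(Path("llama2_8da4w_checkpoint.pth")))
# -> Int8DynActInt4WeightQuantHandler
```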
