Commit 2a3e6ab

Jack-Khuu authored and facebook-github-bot committed
Defer resolution of the default values of arguments used by quantize (#2738)
Summary: Quantize() (specifically GPTQ) is the sole user of many of these params, but their default values are introduced early and in multiple places, which is bug-prone and confusing.

* For example, the default value of calibration_tasks was previously [], which is not something `Int8DynActInt4WeightGPTQQuantizer` handles gracefully.

This diff defers default-value resolution to quantize(), since that is the direct call that uses them.

Reviewed By: jerryzh168

Differential Revision: D55458866
1 parent 01e259c commit 2a3e6ab
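
To make the pattern concrete, here is a minimal, self-contained sketch of the approach the summary describes: forward only the flags the user actually set, so everything left as None falls through to quantize()'s own signature defaults. The Namespace values and the qmode string below are illustrative, not taken from the repo.

```python
# Hypothetical sketch: defer default resolution to the callee by dropping
# None-valued CLI args before the call. Demo values are illustrative.
from argparse import Namespace
from typing import Optional


def quantize(
    qmode: str,
    group_size: int = 128,
    calibration_tasks: Optional[list] = None,
    calibration_limit: int = 100,
    calibration_seq_length: int = 2048,
) -> None:
    # Defaults live here, next to the code that consumes them.
    print(qmode, group_size, calibration_tasks, calibration_limit, calibration_seq_length)


def collect_set_args(args: Namespace) -> dict:
    # None means "flag not passed": omit it so quantize() uses its own default.
    names = [
        "group_size",
        "calibration_tasks",
        "calibration_limit",
        "calibration_seq_length",
    ]
    arg_dict = vars(args)
    return {name: val for name in names if (val := arg_dict.get(name)) is not None}


# All argparse defaults are now None; here the user only set --calibration_limit.
args = Namespace(
    group_size=None,
    calibration_tasks=None,
    calibration_limit=8,
    calibration_seq_length=None,
)
quantize(qmode="8da4w-gptq", **collect_set_args(args))
# -> 8da4w-gptq 128 None 8 2048
```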


examples/models/llama2/export_llama_lib.py

Lines changed: 21 additions & 9 deletions
```diff
@@ -218,8 +218,8 @@ def quantize(
     group_size: int = 128,
     # following arguments only used for GPTQ
     calibration_tasks: Optional[list] = None,
-    calibration_limit: int = 5,
-    calibration_seq_length: int = 100,
+    calibration_limit: int = 100,
+    calibration_seq_length: int = 2048,
     pad_calibration_inputs: bool = False,
     percdamp: float = 0.01,
     blocksize: int = 128,
@@ -342,19 +342,19 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--calibration_tasks",
         nargs="+",
         type=str,
-        default=[],
+        default=None,
         help="Tasks for GPTQ calibration",
     )
     parser.add_argument(
         "--calibration_limit",
         type=int,
-        default=5,
+        default=None,
         help="number of samples used for calibration",
     )
     parser.add_argument(
         "--calibration_seq_length",
         type=int,
-        default=2048,
+        default=None,
         help="Sequence length for GPTQ calibration",
     )
     parser.add_argument(
@@ -531,9 +531,25 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     transforms = []
     if args.quantization_mode:
         modelname = f"{modelname}_q"
+
+        # If these optional args are None, don't provide them to quantize()
+        quant_args_str = [
+            "group_size",
+            "calibration_tasks",
+            "calibration_limit",
+            "calibration_seq_length",
+        ]
+        arg_dict = vars(args)
+        quant_args = {
+            param: val
+            for param in quant_args_str
+            if (val := arg_dict.get(param)) is not None
+        }
+
         transforms.append(
             partial(
                 quantize,
+                **quant_args,
                 qmode=args.quantization_mode,
                 activation_dtype=dtype_override,
                 checkpoint_path=(
@@ -542,10 +558,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
             tokenizer_path=(
                 Path(path) if (path := args.tokenizer_path) is not None else None
             ),
-            group_size=args.group_size,
-            calibration_tasks=args.calibration_tasks,
-            calibration_limit=args.calibration_limit,
-            calibration_seq_length=args.calibration_seq_length,
         )
     )
```
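
A note on the mechanism in the last two hunks: functools.partial freezes the filtered keyword arguments when the transforms list is built, and the transform runs against the model later. A rough sketch of that flow, where the model object, the loop, and the demo values are stand-ins rather than repo code:

```python
# Sketch of partial() freezing filtered kwargs for a deferred transform call.
from functools import partial


def quantize(model, qmode, group_size=128, calibration_limit=100):
    # group_size falls back to its signature default because the user never
    # set it; calibration_limit was explicitly provided on the CLI.
    print(f"qmode={qmode} group_size={group_size} calibration_limit={calibration_limit}")
    return model


quant_args = {"calibration_limit": 8}  # survivors of the None-filtering above

transforms = [partial(quantize, **quant_args, qmode="8da4w-gptq")]

model = object()  # stand-in for the eager model being exported
for transform in transforms:
    model = transform(model)
# -> qmode=8da4w-gptq group_size=128 calibration_limit=8
```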
551563
