Skip to content

Commit 2d8fa1f

Browse files
jerryzh168 and facebook-github-bot
authored and committed
Add some gptq related args to quantize function (#2577)
Summary: Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #2577 Test Plan: Manually verified locally that the args passed through to quantize function `python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w-gptq -X -d fp32 -G 2568 --calibration_tasks wikitext fads --calibration_seq_length 1288 --calibration_limit 5123` Pull Request resolved: #2577 python3 -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -qmode 8da4w-gptq -X -d fp32 -G 2568 --calibration_tasks wikitext fads --calibration_seq_length 1288 --calibration_limit 5123 Reviewed By: Jack-Khuu Differential Revision: D55250463 Pulled By: jerryzh168 fbshipit-source-id: bdf1299952c1f1010a39849bcf70f398bddfce06
1 parent ba920e4 commit 2d8fa1f

File tree

1 file changed

+28
-3
lines changed

1 file changed

+28
-3
lines changed

examples/models/llama2/export_llama_lib.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def quantize(
204204
activation_dtype: Optional[DType],
205205
checkpoint_path: Optional[Path] = None,
206206
# following arguments only available when setting int4 quantization.
207-
groupsize: int = 128,
207+
group_size: int = 128,
208208
# following arguments only used for GPTQ
209209
calibration_tasks: Optional[list] = None,
210210
calibration_limit: int = 5,
@@ -255,7 +255,7 @@ def quantize(
255255
tokenizer,
256256
blocksize,
257257
percdamp,
258-
groupsize,
258+
group_size,
259259
calibration_tasks,
260260
calibration_limit,
261261
calibration_seq_length,
@@ -320,6 +320,25 @@ def build_args_parser() -> argparse.ArgumentParser:
320320
default=f"{ckpt_dir}/params/demo_rand_params.pth",
321321
help="checkpoint path",
322322
)
323+
parser.add_argument(
324+
"--calibration_tasks",
325+
nargs="+",
326+
type=str,
327+
default=[],
328+
help="Tasks for GPTQ calibration",
329+
)
330+
parser.add_argument(
331+
"--calibration_limit",
332+
type=int,
333+
default=5,
334+
help="number of samples used for calibration",
335+
)
336+
parser.add_argument(
337+
"--calibration_seq_length",
338+
type=int,
339+
default=2048,
340+
help="Sequence length for GPTQ calibration",
341+
)
323342
parser.add_argument(
324343
"-t",
325344
"--tokenizer_path",
@@ -370,7 +389,9 @@ def build_args_parser() -> argparse.ArgumentParser:
370389
default=None,
371390
help="Use cProfile to profile model export. Results saved to profile_path as a html file.",
372391
)
373-
parser.add_argument("-G", "--groupsize", default=None, help="specify the groupsize")
392+
parser.add_argument(
393+
"-G", "--group_size", default=None, help="group_size for weight quantization"
394+
)
374395

375396
parser.add_argument(
376397
"-d",
@@ -487,6 +508,10 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
487508
tokenizer_path=(
488509
Path(path) if (path := args.tokenizer_path) is not None else None
489510
),
511+
group_size=args.group_size,
512+
calibration_tasks=args.calibration_tasks,
513+
calibration_limit=args.calibration_limit,
514+
calibration_seq_length=args.calibration_seq_length,
490515
)
491516
)
492517

0 commit comments

Comments
 (0)