@@ -11,7 +11,6 @@
 import logging
 from enum import Enum
 from typing import Any, Callable, List, Optional
-from executorch.extension.llm.tokenizer.utils import get_tokenizer
 
 import torch
 from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
@@ -28,6 +27,7 @@
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
+from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 from torch.ao.quantization.quantizer import Quantizer
@@ -70,6 +70,7 @@ def __init__(
         calibration_tasks: Optional[List[str]] = None,
         calibration_limit: Optional[int] = None,
         calibration_seq_length: Optional[int] = None,
+        calibration_data: Optional[str] = None,
         tokenizer_path: Optional[str] = None,
         verbose: bool = False,
         metadata: Optional[dict] = None,
@@ -95,6 +96,7 @@ def __init__(
         self.calibration_tasks = calibration_tasks
         self.calibration_limit = calibration_limit
         self.calibration_seq_length = calibration_seq_length
+        self.calibration_data = calibration_data
         self.tokenizer_path = tokenizer_path
 
     def set_output_dir(self, output_dir: str) -> "LLMEdgeManager":
@@ -176,41 +178,51 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager":
             )
         return self
 
-
     def pt2e_calibrate(
         self,
         prepared_module,
         calibration_tasks,
         calibration_limit,
         calibration_seq_length,
+        calibration_data,
         tokenizer_path,
     ):
         logging.info("Run calibration...")
         try:
-            from executorch.examples.models.llama2.evaluate import EagerEvalWrapper, evaluate_model
+            from executorch.examples.models.llama2.evaluate import (
+                EagerEvalWrapper,
+                evaluate_model,
+            )
         except ImportError:
             raise ImportError(
                 "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh"
             )
 
         tokenizer = get_tokenizer(tokenizer_path)
 
-        def calibrate_template(module: torch.fx.GraphModule, tokenizer, string: str = "Once upon a time", max_len: int = 128):
-            # TODO: change criteria & support batch inputs if necessary
-            pos = torch.tensor(0, dtype=torch.int64)
-            token_list = [tokenizer.bos_id] + tokenizer.encode(string, bos=True, eos=False)
-
-            with torch.no_grad():
-                while token_list[-1] != tokenizer.eos_id and pos < max_len:
-                    logits = module(
-                        torch.full((1, 1), token_list[pos]),
-                        torch.tensor((pos, )),
-                    )
-                    pos += 1
-                    if pos >= len(token_list):
-                        token_list.append(torch.argmax(logits[:], dim=-1).item())
+        def calibrate_template(
+            module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int
+        ):
+            # TODO: change criteria & support batch inputs if necessary
+            pos = torch.tensor(0, dtype=torch.int64)
+            token_list = tokenizer.encode(prompts, bos=True, eos=False)
+
+            with torch.no_grad():
+                while token_list[-1] != tokenizer.eos_id and pos < max_len:
+                    logits = module(
+                        torch.full((1, 1), token_list[pos]),
+                        torch.tensor((pos,)),
+                    )
+                    pos += 1
+                    if pos >= len(token_list):
+                        token_list.append(torch.argmax(logits[:], dim=-1).item())
 
-        calibrate_template(prepared_module, tokenizer, string="Once upon a time", max_len=calibration_seq_length)
+        calibrate_template(
+            module=prepared_module,
+            tokenizer=tokenizer,
+            prompts=calibration_data,
+            max_len=calibration_seq_length,
+        )
 
         eval_wrapper = EagerEvalWrapper(
             model=prepared_module.to(device="cuda"),
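# Illustrative, self-contained sketch of the calibration pattern that the new
# calibrate_template above implements (a simplified stand-in, not the repository's
# code): walk the prompt one token at a time so the prepared module sees
# activations at every position, then keep greedy-decoding until EOS or max_len.
# TinyTokenizer and TinyLM are hypothetical stubs for the real tokenizer and the
# prepared torch.fx.GraphModule.
import torch


class TinyTokenizer:
    bos_id, eos_id = 1, 2

    def encode(self, text: str, bos: bool, eos: bool):
        ids = [ord(c) % 29 + 3 for c in text]  # toy ids in [3, 31]
        return ([self.bos_id] if bos else []) + ids + ([self.eos_id] if eos else [])


class TinyLM(torch.nn.Module):
    def __init__(self, vocab_size: int = 32):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, 16)
        self.head = torch.nn.Linear(16, vocab_size)

    def forward(self, token, pos):
        # token: (1, 1) current token id; pos: (1,) current position (ignored here)
        return self.head(self.emb(token))  # (1, 1, vocab_size) logits


def calibrate(module, tokenizer, prompt: str, max_len: int):
    pos = torch.tensor(0, dtype=torch.int64)
    token_list = tokenizer.encode(prompt, bos=True, eos=False)
    with torch.no_grad():
        while token_list[-1] != tokenizer.eos_id and pos < max_len:
            logits = module(torch.full((1, 1), token_list[pos]), torch.tensor((pos,)))
            pos += 1
            if pos >= len(token_list):  # past the prompt: append the greedy token
                token_list.append(torch.argmax(logits, dim=-1)[0, -1].item())
    return token_list


print(calibrate(TinyLM(), TinyTokenizer(), "Once upon a time", max_len=32))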
@@ -251,20 +263,26 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager":
                     self.pre_autograd_graph_module is not None
                 ), "Please run capture_pre_autograd_graph first"
                 m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer)
+                logging.info(
+                    f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}"
+                )
                 # Calibrate
-                logging.info(f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, seq_length: {self.calibration_seq_length}, tokenizer_path: {self.tokenizer_path}")
                 if (
                     self.calibration_tasks is not None
                     and self.calibration_limit is not None
                     and self.calibration_seq_length is not None
+                    and self.calibration_data is not None
                     and self.tokenizer_path is not None
                 ):
-                    logging.info(f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, seq_length: {self.calibration_seq_length}")
+                    logging.info(
+                        f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}"
+                    )
                     self.pt2e_calibrate(
                         prepared_module=m,
                         calibration_tasks=self.calibration_tasks,
                         calibration_limit=self.calibration_limit,
                         calibration_seq_length=self.calibration_seq_length,
+                        calibration_data=self.calibration_data,
                         tokenizer_path=self.tokenizer_path,
                     )
                 else:
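# Minimal usage sketch (assumed, not an example shipped with this change) showing
# how the new calibration_data field gates PT2E calibration: pt2e_quantize only
# calls pt2e_calibrate when calibration_tasks, calibration_limit,
# calibration_seq_length, calibration_data and tokenizer_path are all set.
def quantize_with_calibration(manager, quantizers):
    # `manager` is an already-constructed LLMEdgeManager; the values below are
    # illustrative placeholders (task name, prompt, sequence length, tokenizer path).
    manager.calibration_tasks = ["wikitext"]
    manager.calibration_limit = 1
    manager.calibration_seq_length = 128
    manager.calibration_data = "Once upon a time"  # prompt fed to calibrate_template
    manager.tokenizer_path = "/path/to/tokenizer.model"
    return manager.capture_pre_autograd_graph().pt2e_quantize(quantizers)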