
Commit 44f9526

Revert "Add proper pt2e calibration (#5095)"
This reverts commit 7122d31.
1 parent c83fd2e commit 44f9526

3 files changed: +12 additions, -183 deletions

examples/models/llama2/eval_llama_lib.py

Lines changed: 7 additions & 58 deletions
@@ -29,51 +29,6 @@
 )
 
 
-class GraphModuleEvalWrapper(EagerEvalWrapper):
-    """
-    A wrapper class for ExecuTorch py-binded integration with the
-    lm-evaluation-harness library.
-    """
-
-    def __init__(
-        self,
-        model: torch.fx.GraphModule,
-        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
-        max_seq_length: Optional[int] = None,
-        use_kv_cache: bool = False,
-        enable_dynamic_shape: bool = True,
-    ):
-        super().__init__(
-            model=model, tokenizer=tokenizer, max_seq_length=max_seq_length
-        )
-        self._model = model.to(self.device)
-        self._use_kv_cache = use_kv_cache
-        self._enable_dynamic_shape = enable_dynamic_shape
-
-    def _model_call(self, inps):
-        if self._use_kv_cache:
-            if not self._enable_dynamic_shape:
-                # graph module exported without dynamic shape won't work with a different shape.
-                # And we have to do single token prefill here.
-                result_logits = []
-                for pos in range(inps.shape[-1]):
-                    pos_tensor = torch.tensor([pos], dtype=torch.int64)
-                    logits = self._model(inps[:, pos : pos + 1], pos_tensor)
-                    result_logits.append(logits)
-                return torch.cat(result_logits, dim=1)
-            else:
-                pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device)
-                # Batch process the whole sequence.
-                logits = self._model(inps[:, : self._max_seq_length], pos_tensor)
-                return logits
-
-        else:
-            return self._model(inps)
-
-    def _model_generate(self, context, max_length, eos_token_id):
-        raise Exception("unimplemented")
-
-
 class ETPybindEvalWrapper(EagerEvalWrapper):
     """
     A wrapper class for ExecuTorch py-binded integration with the
@@ -193,13 +148,6 @@ def gen_eval_wrapper(
             if torch.cuda.is_available()
             else manager.pre_autograd_graph_module.to(device="cpu")
         )
-        return GraphModuleEvalWrapper(
-            model=model,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            use_kv_cache=args.use_kv_cache,
-            enable_dynamic_shape=args.enable_dynamic_shape,
-        )
     else:
         # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch
         # for quantizers. Currently capture_pre_autograd_graph only works with --kv_cache, but
@@ -209,12 +157,13 @@ def gen_eval_wrapper(
             if torch.cuda.is_available()
             else manager.model.eval().to(device="cpu")
         )
-        return EagerEvalWrapper(
-            model=model,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            use_kv_cache=args.use_kv_cache,
-        )
+
+    return EagerEvalWrapper(
+        model=model,
+        tokenizer=tokenizer,
+        max_seq_length=args.max_seq_length,
+        use_kv_cache=args.use_kv_cache,
+    )
 
 
 def build_args_parser() -> argparse.ArgumentParser:
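For context, the deleted GraphModuleEvalWrapper fed a KV-cache graph module one position at a time whenever dynamic shapes were disabled, because a module exported with a fixed shape rejects any other input shape. Below is a minimal sketch of that single-token prefill pattern, reconstructed from the removed lines; the free-standing prefill_one_token_at_a_time helper and its arguments are illustrative, not part of the codebase.

import torch

def prefill_one_token_at_a_time(model, inps: torch.Tensor) -> torch.Tensor:
    # A graph module exported without dynamic shapes only accepts the shape it
    # was traced with, so feed a single token per call and collect the logits.
    result_logits = []
    for pos in range(inps.shape[-1]):
        pos_tensor = torch.tensor([pos], dtype=torch.int64)
        logits = model(inps[:, pos : pos + 1], pos_tensor)
        result_logits.append(logits)
    return torch.cat(result_logits, dim=1)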

examples/models/llama2/export_llama_lib.py

Lines changed: 4 additions & 25 deletions
@@ -16,7 +16,7 @@
 from enum import Enum
 from json import JSONDecodeError
 from pathlib import Path
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 import pkg_resources
 
@@ -166,25 +166,19 @@ def build_args_parser() -> argparse.ArgumentParser:
         nargs="+",
         type=str,
         default=None,
-        help="Tasks for GPTQ calibration from lm_eval",
+        help="Tasks for GPTQ calibration",
     )
     parser.add_argument(
         "--calibration_limit",
         type=int,
         default=None,
-        help="number of samples used for calibration from lm_eval",
+        help="number of samples used for calibration",
     )
     parser.add_argument(
         "--calibration_seq_length",
         type=int,
         default=None,
-        help="Sequence length for GPTQ calibration from lm_eval",
-    )
-    parser.add_argument(
-        "--calibration_data",
-        type=str,
-        default="Once upon a time",
-        help="Calibration prompts from users",
+        help="Sequence length for GPTQ calibration",
     )
     parser.add_argument(
         "-t",
@@ -427,11 +421,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
         generate_full_logits=args.generate_full_logits,
         weight_type=weight_type,
         enable_dynamic_shape=args.enable_dynamic_shape,
-        calibration_tasks=args.calibration_tasks,
-        calibration_limit=args.calibration_limit,
-        calibration_seq_length=args.calibration_seq_length,
-        calibration_data=args.calibration_data,
-        tokenizer_path=args.tokenizer_path,
         verbose=args.verbose,
         max_seq_len=args.max_seq_length,
         metadata_str=args.metadata,
@@ -641,11 +630,6 @@ def _load_llama_model(
     generate_full_logits: bool = False,
     weight_type: WeightType = WeightType.LLAMA,
     enable_dynamic_shape: bool = False,
-    calibration_tasks: Optional[List[str]] = None,
-    calibration_limit: Optional[int] = None,
-    calibration_seq_length: Optional[int] = None,
-    calibration_data: Optional[str] = None,
-    tokenizer_path: Optional[str] = None,
     verbose: bool = False,
     max_seq_len: int = 128,
     metadata_str: Optional[str] = None,
@@ -701,11 +685,6 @@ def _load_llama_model(
         use_kv_cache=use_kv_cache,
         example_inputs=example_inputs,
         enable_dynamic_shape=enable_dynamic_shape,
-        calibration_tasks=calibration_tasks,
-        calibration_limit=calibration_limit,
-        calibration_seq_length=calibration_seq_length,
-        calibration_data=calibration_data,
-        tokenizer_path=tokenizer_path,
         verbose=verbose,
         metadata=_load_llama_model_metadata(
             weight_type,
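After the revert, build_args_parser keeps three GPTQ calibration flags, while --calibration_data is removed and the calibration_* and tokenizer_path keyword arguments no longer flow into _prepare_for_llama_export or _load_llama_model. A condensed sketch of the surviving argument definitions, assembled from the hunks above; the "--calibration_tasks" flag name is inferred from args.calibration_tasks usage elsewhere in the diff and should be treated as an assumption.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--calibration_tasks",  # flag name inferred from args.calibration_tasks usage
    nargs="+",
    type=str,
    default=None,
    help="Tasks for GPTQ calibration",
)
parser.add_argument(
    "--calibration_limit",
    type=int,
    default=None,
    help="number of samples used for calibration",
)
parser.add_argument(
    "--calibration_seq_length",
    type=int,
    default=None,
    help="Sequence length for GPTQ calibration",
)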

extension/llm/export/builder.py

Lines changed: 1 addition & 100 deletions
@@ -27,7 +27,6 @@
 from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
-from executorch.extension.llm.tokenizer.utils import get_tokenizer
 from torch._export import capture_pre_autograd_graph
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 from torch.ao.quantization.quantizer import Quantizer
@@ -67,11 +66,6 @@ def __init__(
         use_kv_cache,
         example_inputs,
         enable_dynamic_shape: bool = False,
-        calibration_tasks: Optional[List[str]] = None,
-        calibration_limit: Optional[int] = None,
-        calibration_seq_length: Optional[int] = None,
-        calibration_data: Optional[str] = None,
-        tokenizer_path: Optional[str] = None,
         verbose: bool = False,
         metadata: Optional[dict] = None,
         dynamic_shapes: Optional[Any] = None,
@@ -93,11 +87,6 @@ def __init__(
         self.output_dir = "."
         self.dynamic_shapes = dynamic_shapes
         self._saved_pte_filename = None
-        self.calibration_tasks = calibration_tasks
-        self.calibration_limit = calibration_limit
-        self.calibration_seq_length = calibration_seq_length
-        self.calibration_data = calibration_data
-        self.tokenizer_path = tokenizer_path
 
     def set_output_dir(self, output_dir: str) -> "LLMEdgeManager":
         """
@@ -178,69 +167,6 @@ def capture_pre_autograd_graph(self) -> "LLMEdgeManager":
             )
         return self
 
-    def pt2e_calibrate(
-        self,
-        prepared_module,
-        calibration_tasks,
-        calibration_limit,
-        calibration_seq_length,
-        calibration_data,
-        tokenizer_path,
-    ):
-        logging.info("Run calibration...")
-        try:
-            from executorch.examples.models.llama2.eval_llama_lib import (
-                GraphModuleEvalWrapper,
-            )
-            from executorch.examples.models.llama2.evaluate import evaluate_model
-        except ImportError:
-            raise ImportError(
-                "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh"
-            )
-
-        tokenizer = get_tokenizer(tokenizer_path)
-
-        def calibrate_template(
-            module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int
-        ):
-            # TODO: change criteria & support batch inputs if necessary
-            pos = torch.tensor(0, dtype=torch.int64)
-            token_list = tokenizer.encode(prompts, bos=True, eos=False)
-
-            with torch.no_grad():
-                while token_list[-1] != tokenizer.eos_id and pos < max_len:
-                    logits = module(
-                        torch.full((1, 1), token_list[pos]),
-                        torch.tensor((pos,)),
-                    )
-                    pos += 1
-                    if pos >= len(token_list):
-                        token_list.append(torch.argmax(logits[:], dim=-1).item())
-
-        calibrate_template(
-            module=prepared_module,
-            tokenizer=tokenizer,
-            prompts=calibration_data,
-            max_len=calibration_seq_length,
-        )
-
-        eval_wrapper = GraphModuleEvalWrapper(
-            model=prepared_module,
-            tokenizer=tokenizer,
-            max_seq_length=calibration_seq_length,
-            use_kv_cache=self.use_kv_cache,
-            enable_dynamic_shape=self.enable_dynamic_shape,
-        )
-        eval_results = evaluate_model(
-            eval_wrapper,
-            calibration_tasks,
-            calibration_limit,
-        )
-
-        for task, res in eval_results["results"].items():
-            print(f"{task}: {res}")
-        logging.info("Calibration finish...")
-
     def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager":
         """
         Quantize the model via pt2e flow and retrieve LLMEdgeManager including the quantized model.
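For reference, the removed pt2e_calibrate helper drove the prepared module through a greedy decode of the user prompt so that every observer saw realistic activations, then scored the module on lm_eval tasks via GraphModuleEvalWrapper and evaluate_model. A compact sketch of that decode loop, reconstructed from the deleted lines; the standalone greedy_calibrate name is illustrative, and the tokenizer is assumed to expose encode(..., bos=, eos=) and eos_id as in the original code.

import torch

def greedy_calibrate(module, tokenizer, prompt: str, max_len: int) -> None:
    # Run the prepared (observed) module token by token; once the prompt is
    # exhausted, extend the sequence with the argmax token of the last step.
    pos = 0
    token_list = tokenizer.encode(prompt, bos=True, eos=False)
    with torch.no_grad():
        while token_list[-1] != tokenizer.eos_id and pos < max_len:
            logits = module(
                torch.full((1, 1), token_list[pos]),
                torch.tensor((pos,)),
            )
            pos += 1
            if pos >= len(token_list):
                token_list.append(torch.argmax(logits[:], dim=-1).item())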
@@ -263,33 +189,8 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager":
                     self.pre_autograd_graph_module is not None
                 ), "Please run capture_pre_autograd_graph first"
                 m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer)
-                logging.info(
-                    f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}"
-                )
                 # Calibrate
-                if (
-                    self.calibration_tasks is not None
-                    and self.calibration_limit is not None
-                    and self.calibration_seq_length is not None
-                    and self.calibration_data is not None
-                    and self.tokenizer_path is not None
-                ):
-                    logging.info(
-                        f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}"
-                    )
-                    self.pt2e_calibrate(
-                        prepared_module=m,
-                        calibration_tasks=self.calibration_tasks,
-                        calibration_limit=self.calibration_limit,
-                        calibration_seq_length=self.calibration_seq_length,
-                        calibration_data=self.calibration_data,
-                        tokenizer_path=self.tokenizer_path,
-                    )
-                else:
-                    logging.info(
-                        "No calibration provided, using dummy input to calibrate..."
-                    )
-                    m(*self.example_inputs)
+                m(*self.example_inputs)
                 m = convert_pt2e(m)
                 DuplicateDynamicQuantChainPass()(m)
                 self.pre_autograd_graph_module = m
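With the calibration hooks reverted, pt2e_quantize calibrates solely by running the stored example inputs through the prepared module before conversion. A minimal sketch of the remaining flow under that assumption (single quantizer, manager attributes as used above); the real method also composes multiple quantizers and runs DuplicateDynamicQuantChainPass on the converted module.

from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

def pt2e_quantize_sketch(manager, quantizer):
    # Insert observers for the given quantizer into the captured graph module.
    m = prepare_pt2e(manager.pre_autograd_graph_module, quantizer)
    # Calibrate with the example inputs only; the lm_eval-driven path is gone.
    m(*manager.example_inputs)
    # Replace observers with quantize/dequantize ops.
    m = convert_pt2e(m)
    manager.pre_autograd_graph_module = m
    return manager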
