
Commit 0dc9488

cccclai authored and facebook-github-bot committed
Enable eval for pt2e quantization (#3652)
Summary: Pull Request resolved: #3652

Make the eval framework work with the pt2e quantizer, unblocking accuracy measurement for the QNN quantizer.

Reviewed By: Jack-Khuu

Differential Revision: D57316602

fbshipit-source-id: ca2b6f0dc6a6ebb4f2f323d09c27c20978cec569
1 parent 435ea9d commit 0dc9488
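
In practice, the commit makes the eval path reuse the same pt2e machinery as export. Below is a minimal sketch of the flow it enables, assembled only from functions touched in this diff; the `args` namespace stands in for the parsed export/eval CLI arguments and is an assumption, as is the "llama2" model name.

    from executorch.examples.models.llama2.export_llama_lib import (
        _prepare_for_llama_export,
        get_quantizer_and_quant_params,
    )

    # Build the quantizer list from the CLI flags (helper added in this commit).
    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)

    manager = _prepare_for_llama_export("llama2", args)
    if quantizers:
        # New two-step builder API: capture the pre-autograd graph, then pt2e-quantize it.
        manager = manager.capture_pre_autograd_graph().pt2e_quantize(quantizers)
        model = manager.pre_autograd_graph_module  # quantized graph module, used for eval
    else:
        model = manager.model.eval()  # fall back to the eager model when no quantizer is configured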

File tree

3 files changed: +45 -13 lines changed

examples/models/llama2/builder.py

Lines changed: 14 additions & 5 deletions
@@ -253,6 +253,16 @@ def _get_metadata(self):
         self.metadata = metadata
         return self.metadata
 
+    def capture_pre_autograd_graph(self) -> "LlamaEdgeManager":
+        dynamic_shape = self._get_dynamic_shape()
+        # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
+        # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
+        with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+            self.pre_autograd_graph_module = capture_pre_autograd_graph(
+                self.model, self.example_inputs, dynamic_shapes=dynamic_shape
+            )
+        return self
+
     def pt2e_quantize(
         self, quantizers: Optional[List[Quantizer]]
     ) -> "LlamaEdgeManager":
@@ -265,19 +275,18 @@ def pt2e_quantize(
             self.edge_manager is None
         ), "export_to_edge is already called, please call pt2e_quantize before export_to_edge"
         logging.info(f"Using pt2e {quantizers} to quantizing the model...")
-        dynamic_shape = self._get_dynamic_shape()
 
         # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
        # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
         if quantizers:
             with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
-                m = capture_pre_autograd_graph(
-                    self.model, self.example_inputs, dynamic_shapes=dynamic_shape
-                )
                 if self.verbose:
                     logging.info(f"Applied quantizers: {quantizers}")
                 composed_quantizer = ComposableQuantizer(quantizers)
-                m = prepare_pt2e(m, composed_quantizer)
+                assert (
+                    self.pre_autograd_graph_module is not None
+                ), "Please run capture_pre_autograd_graph first"
+                m = prepare_pt2e(self.pre_autograd_graph_module, composed_quantizer)
                 # Calibrate
                 m(*self.example_inputs)
                 m = convert_pt2e(m)
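
For readers less familiar with pt2e, the sequence the builder now performs (capture a pre-autograd graph, prepare with a quantizer, calibrate, convert) can be reproduced on a toy module. The sketch below is illustrative only: the toy model and the XNNPACK quantizer are assumptions and not part of this diff, but capture_pre_autograd_graph, prepare_pt2e and convert_pt2e are the same torch APIs the builder calls.

    import torch
    from torch._export import capture_pre_autograd_graph
    from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
    from torch.ao.quantization.quantizer.xnnpack_quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
    )

    # Toy stand-in for the Llama model (illustrative only).
    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(4, 4)

        def forward(self, x):
            return self.linear(x)

    example_inputs = (torch.randn(1, 4),)
    m = capture_pre_autograd_graph(Toy(), example_inputs)  # pre-autograd ATen graph
    quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
    m = prepare_pt2e(m, quantizer)  # insert observers
    m(*example_inputs)              # calibrate on example inputs
    m = convert_pt2e(m)             # fold observers into quant/dequant ops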

examples/models/llama2/eval_llama_lib.py

Lines changed: 22 additions & 6 deletions
@@ -11,7 +11,9 @@
 
 import lm_eval
 import torch
-
+from executorch.examples.models.llama2.export_llama_lib import (
+    get_quantizer_and_quant_params,
+)
 from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken
 from executorch.examples.models.llama2.tokenizer.tokenizer import (
     Tokenizer as SentencePieceTokenizer,
@@ -233,13 +235,27 @@ def gen_eval_wrapper(
         max_seq_length=args.max_seq_length - 1,
     )
 
+    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
     # GPTFastEvalWrapper: Create a wrapper around a pre-exported model
     manager: LlamaEdgeManager = _prepare_for_llama_export(model_name, args)
-    model = (
-        manager.model.eval().to(device="cuda")
-        if torch.cuda.is_available()
-        else manager.model.to(device="cpu")
-    )
+
+    if len(quantizers) != 0:
+        manager = manager.capture_pre_autograd_graph().pt2e_quantize(quantizers)
+        model = (
+            manager.pre_autograd_graph_module.to(device="cuda")
+            if torch.cuda.is_available()
+            else manager.pre_autograd_graph_module.to(device="cpu")
+        )
+    else:
+        # TODO: use manager.pre_autograd_graph_module for the eval to remove the if-else branch
+        # for quantizers. Currently capture_pre_autograd_graph only works with --kv_cache, but
+        # fails without the kv_cache mode
+        model = (
+            manager.model.eval().to(device="cuda")
+            if torch.cuda.is_available()
+            else manager.model.eval().to(device="cpu")
+        )
+
     return EagerEvalWrapper(
         model=model,
         tokenizer=tokenizer,
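
Downstream, the wrapper returned by gen_eval_wrapper plugs into the lm_eval harness exactly as before; the only difference is that the wrapped model may now be a pt2e-quantized graph module. A rough usage sketch follows; the simple_evaluate call and the task name are illustrative assumptions, and the real entry point in eval_llama_lib drives this for you.

    import lm_eval

    # args: parsed eval CLI arguments; "llama2" is an illustrative model name.
    eval_wrapper = gen_eval_wrapper("llama2", args)
    # Hypothetical harness invocation; the in-repo eval code wires this up itself.
    results = lm_eval.simple_evaluate(model=eval_wrapper, tasks=["wikitext"])
    print(results["results"])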

examples/models/llama2/export_llama_lib.py

Lines changed: 9 additions & 2 deletions
@@ -371,18 +371,25 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     )
 
 
-def _export_llama(modelname, args) -> str:  # noqa: C901
-    # export_to_edge
+def get_quantizer_and_quant_params(args):
     pt2e_quant_params = _get_pt2e_quantization_params(args)
     quantizers = get_pt2e_quantizers(pt2e_quant_params, args)
     quant_dtype = None
     if args.qnn and args.pt2e_quantize:
         assert len(quantizers) == 0, "Should not enable both xnnpack and qnn"
         qnn_quantizer, quant_dtype = get_qnn_quantizer(args)
         quantizers.append(qnn_quantizer)
+    logging.info(f"Applying quantizers: {quantizers}")
+    return pt2e_quant_params, quantizers, quant_dtype
+
 
+def _export_llama(modelname, args) -> str:  # noqa: C901
+    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
+
+    # export_to_edge
     builder_exported_to_edge = (
         _prepare_for_llama_export(modelname, args)
+        .capture_pre_autograd_graph()
         .pt2e_quantize(quantizers)
         .export_to_edge()
     )
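
With quantizer construction factored into get_quantizer_and_quant_params, the export path and the eval path now share the same setup, and the export pipeline reads as one explicit chain. Condensed from the diff above (final state, not verbatim repo code):

    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)

    builder_exported_to_edge = (
        _prepare_for_llama_export(modelname, args)
        .capture_pre_autograd_graph()   # new explicit step: trace to a pre-autograd graph module
        .pt2e_quantize(quantizers)      # quantizes the captured graph when quantizers are present
        .export_to_edge()
    )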
