
Commit e74a586

larryliu0820 authored and facebook-github-bot committed
Refactor export_llama_lib
Summary: Separate out "recipe" code from the actual "cooking" code. Introduces a new `LlamaBuilder` class that handles the internal logic of exporting. Takes source transforms, quantizers, and partitioners. Differential Revision: D54027081
1 parent 20714e7 commit e74a586
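As a quick illustration of the new API, here is a minimal usage sketch (method names come from builder.py in this diff; the checkpoint/params paths, the empty quantizer/partitioner lists, and the import path are illustrative placeholders rather than part of this commit):

from examples.models.llama2.builder import LlamaBuilder  # illustrative import path

# Placeholder inputs: a real recipe supplies its own quantizers and partitioners.
quantizers = []    # e.g. pt2e Quantizer instances
partitioners = []  # e.g. backend Partitioner instances

builder = (
    LlamaBuilder(verbose=True)
    .set_checkpoint("/path/to/consolidated.00.pth")  # placeholder path
    .set_params("/path/to/params.json")              # placeholder path
    .set_use_kv_cache(False)     # must be called before load_model()
    .load_model()
    .to_dtype("fp32")
    .source_transform([])        # list of Module -> Module callables
    .export_to_edge(quantizers)  # capture, optionally quantize, lower to Edge IR
    .to_backend(partitioners)    # delegate partitions to backends, if any
    .to_executorch()
)
builder.save("llama2")           # serializes the .pte program via save_pte_program

The "recipe" (which transforms, quantizers, and partitioners to apply) stays in export_llama_lib, while the "cooking" (capture, quantization, lowering, serialization) now lives in LlamaBuilder.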

File tree: 3 files changed, +306 −163 lines changed


examples/models/llama2/TARGETS

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ runtime.python_binary(
 runtime.python_library(
     name = "export_library",
     srcs = [
+        "builder.py",
         "export_llama.py",
         "export_llama_lib.py",
     ],

examples/models/llama2/builder.py

Lines changed: 261 additions & 0 deletions
@@ -0,0 +1,261 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


# Example script for exporting Llama2 to flatbuffer

import json
import logging
from json import JSONDecodeError
from typing import Callable, List, Optional

import pkg_resources
import torch
from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
    DuplicateDynamicQuantChainPass,
)
from executorch.exir.backend.partitioner import Partitioner
from executorch.exir.capture._config import EdgeCompileConfig, ExecutorchBackendConfig
from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer import Quantizer
from torch.ao.quantization.quantizer.composable_quantizer import ComposableQuantizer
from torch.nn.attention import SDPBackend

from ...portable.utils import export_to_edge, save_pte_program
from ..model_factory import EagerModelFactory


IS_FBCODE = True  # os.environ.get("FBCODE_PLATFORM", False)
FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

pkg_name = __name__


def canonical_path(path: str, *, dir: bool = False) -> str:
    print(f"creating canonical path for {path}")
    if not path.startswith("par:"):
        return path

    if not IS_FBCODE:
        print("not FBCODE")
        return path[4:]
    else:
        return_val = pkg_resources.resource_filename(pkg_name, path[4:])
        print(f"canonical name is: {return_val}")
        return return_val


class LlamaBuilder:
    """
    A builder class that builds a Llama2 model, applies source transformations and
    quantization, and exports the result to ExecuTorch.
    If you want to apply different quantization and source transformation schemes,
    consider using these existing hooks before modifying this file:
    * .source_transform()
    * .export_to_edge()
    * .to_backend()
    """

    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        self.checkpoint = None
        self.params = None
        self.output_dir = "."
        self.is_fairseq2 = False
        self.use_kv_cache = False
        self.dynamic_shape = None
        self.model = None
        self.example_inputs = None
        self.dtype = None
        self.applied_source_transforms = []
        self.edge_manager = None
        self.output_name = "llama2"
        self.edge_config = None
        self.metadata = None
        self.export_program = None

    def set_checkpoint(self, checkpoint: str, is_fairseq2=False):
        self.checkpoint = checkpoint
        self.is_fairseq2 = is_fairseq2
        return self

    def set_params(self, params: str):
        self.params = params
        return self

    def set_output_dir(self, output_dir: str):
        self.output_dir = output_dir
        return self

    def set_use_kv_cache(self, use_kv_cache: bool):
        assert self.model is None, (
            "To ensure consistency, set_use_kv_cache can't be called after load_model(). "
            f"Currently the model has use_kv_cache = {self.use_kv_cache}"
        )
        self.use_kv_cache = use_kv_cache
        return self

    def set_metadata(self, metadata: Optional[dict]):
        self.metadata = metadata
        return self

    def load_model(self):
        assert (
            self.checkpoint and self.params
        ), "Both checkpoint and params need to be set"
        if self.model:
            logging.info(f"Reloading model from {self.checkpoint} and {self.params}")
        checkpoint_path = canonical_path(self.checkpoint)
        params_path = canonical_path(self.params)
        logging.info(
            f"Loading model with checkpoint={checkpoint_path}, params={params_path}, use_kv_cache={self.use_kv_cache}, fairseq2={self.is_fairseq2}"
        )
        self.model, self.example_inputs, _ = EagerModelFactory.create_model(
            "llama2",
            "Llama2Model",
            checkpoint=checkpoint_path,
            params=params_path,
            use_kv_cache=self.use_kv_cache,
            fairseq2=self.is_fairseq2,
        )
        state_dict = self.model.state_dict()
        dtype = state_dict[next(iter(state_dict))].dtype
        assert dtype in [torch.float16, torch.float32], "Only support fp16 or fp32"
        logging.info(f"Loaded model with dtype={dtype}")
        self.dtype = "fp16" if dtype == torch.float16 else "fp32"

        return self

    def to_dtype(self, dtype_override: Optional[str]):
        assert self.model, "Need to run load_model() first"
        assert not dtype_override or dtype_override in [
            "fp16",
            "fp32",
        ], "Only support fp16 or fp32"

        if dtype_override == "fp16" and self.dtype != "fp16":
            logging.info("model.to torch.float16")
            self.model = self.model.to(dtype=torch.float16)
            self.dtype = dtype_override
        elif dtype_override == "fp32" and self.dtype != "fp32":
            logging.info("model.to torch.float32")
            self.model = self.model.to(dtype=torch.float32)
            self.dtype = dtype_override
        return self

    def source_transform(
        self, transforms: List[Callable[[torch.nn.Module], torch.nn.Module]]
    ):
        assert self.model, "Need to run load_model() first"
        for transform in transforms:
            self.model = transform(self.model)
        self.applied_source_transforms.extend(transforms)

        if self.verbose:
            logging.info(f"{self.output_name}:")
            logging.info(f"{self.model}")
        return self

    def _get_dynamic_shape(self):
        assert self.model, "Need to run load_model() first"
        if self.use_kv_cache:
            return None
        dim = torch.export.Dim("token_dim", max=self.model.params.max_seq_len - 1)
        self.dynamic_shape = {"tokens": {1: dim}}
        return self.dynamic_shape

    def _get_edge_config(self):
        self.edge_config = EdgeCompileConfig(
            _check_ir_validity=False,
            _skip_type_promotion=bool(self.dtype == "fp16"),
        )
        return self.edge_config

    def _get_metadata(self):
        assert self.model, "Need to run load_model() first"
        params = self.model.params
        metadata = {
            "append_eos_to_prompt": self.is_fairseq2,  # For language llama, tell the runtime to always append EOS token(s) to prompt.
            "get_bos_id": 3 if self.is_fairseq2 else 1,
            "get_dtype": 5 if self.dtype == "fp16" else 6,
            "get_eos_id": 3 if self.is_fairseq2 else 2,
            "get_head_dim": params.dim // params.n_heads,
            "get_max_batch_size": params.max_batch_size,
            "get_max_seq_len": params.max_seq_len,
            "get_n_bos": 1,
            "get_n_eos": 2 if self.is_fairseq2 else 1,
            "get_n_kv_heads": params.n_kv_heads,
            "get_n_layers": params.n_layers,
            "get_vocab_size": params.vocab_size,
            "use_kv_cache": self.use_kv_cache,
        }
        if self.metadata:
            try:
                extra = json.loads(self.metadata)
                for k, v in extra.items():
                    metadata[k] = v
            except JSONDecodeError:
                logging.error("Invalid metadata, should be a valid JSON string")
        self.metadata = metadata
        return self.metadata

    def export_to_edge(self, quantizers: Optional[List[Quantizer]]):
        assert self.model, "Need to run load_model() first"
        dynamic_shape = self._get_dynamic_shape()
        edge_config = self._get_edge_config()
        metadata = self._get_metadata()

        with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
            m = capture_pre_autograd_graph(
                self.model, self.example_inputs, dynamic_shapes=dynamic_shape
            )
            if quantizers:
                composed_quantizer = ComposableQuantizer(quantizers)
                m = prepare_pt2e(m, composed_quantizer)
                # Calibrate
                m(*self.example_inputs)
                m = convert_pt2e(m)
                DuplicateDynamicQuantChainPass()(m)
            self.edge_manager = export_to_edge(
                m,
                self.example_inputs,
                dynamic_shapes=dynamic_shape,
                edge_constant_methods=metadata,
                edge_compile_config=edge_config,
            )
        return self

    def to_backend(self, partitioners: Optional[List[Partitioner]]):
        assert self.edge_manager, "Need to run export_to_edge() first"
        if partitioners:
            for partitioner in partitioners:
                self.edge_manager = self.edge_manager.to_backend(partitioner)
        return self

    def to_executorch(self):
        assert self.edge_manager, "Need to run export_to_edge() first"
        self.export_program = self.edge_manager.to_executorch(
            ExecutorchBackendConfig(
                extract_constant_segment=True,
                extract_delegate_segments=True,
                passes=[
                    QuantFusionPass(),
                ],
                sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
            )
        )
        logging.info(
            "Required memory for activation in bytes: %s",
            self.export_program._emitter_output.program.execution_plan[
                0
            ].non_const_buffer_sizes,
        )
        return self

    def save(self, output_name: Optional[str]):
        if output_name:
            self.output_name = output_name
        save_pte_program(self.export_program.buffer, self.output_name, self.output_dir)
