Commit a044fb1

Accept model type parameter in export_llama
1 parent b48f917 commit a044fb1

3 files changed: 71 additions & 35 deletions


examples/models/llama2/README.md

Lines changed: 20 additions & 8 deletions
@@ -140,6 +140,7 @@ LLAMA_CHECKPOINT=path/to/checkpoint.pth
 LLAMA_PARAMS=path/to/params.json
 
 python -m examples.models.llama2.export_llama \
+  --model llama3_2 \
   --checkpoint "${LLAMA_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   -kv -X \
@@ -158,6 +159,7 @@ LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth
 LLAMA_PARAMS=path/to/params.json
 
 python -m examples.models.llama2.export_llama \
+  --model llama3_2 \
   --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \
   --params "${LLAMA_PARAMS:?}" \
   --use_sdpa_with_kv_cache \
@@ -181,7 +183,19 @@ You can export and run the original Llama 3 8B instruct model.
 
 2. Export model and generate `.pte` file
 ```
-python -m examples.models.llama2.export_llama --checkpoint <consolidated.00.pth> -p <params.json> -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
+python -m examples.models.llama2.export_llama \
+  --model llama3 \
+  --checkpoint <consolidated.00.pth> \
+  -p <params.json> \
+  -kv \
+  --use_sdpa_with_kv_cache \
+  -X \
+  -qmode 8da4w \
+  --group_size 128 \
+  -d fp32 \
+  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+  --embedding-quantize 4,32 \
+  --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte"
 ```
 
 Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size.
@@ -201,10 +215,9 @@ If you want to deploy and run a smaller model for educational purposes. From `ex
 ```
 3. Export model and generate `.pte` file.
 ```
-python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X -kv
+python -m examples.models.llama2.export_llama --model llama2 --checkpoint stories110M.pt --params params.json -X -kv
 ```
 4. Create tokenizer.bin.
-
 ```
 python -m extension.llm.tokenizer.tokenizer -t <tokenizer.model> -o tokenizer.bin
 ```
@@ -219,10 +232,9 @@ You can export and run the original Llama 2 7B model.
 
 3. Export model and generate `.pte` file:
 ```
-python -m examples.models.llama2.export_llama --checkpoint <checkpoint.pth> --params <params.json> -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32
+python -m examples.models.llama2.export_llama --model llama2 --checkpoint <checkpoint.pth> --params <params.json> -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32
 ```
 4. Create tokenizer.bin.
-
 ```
 python -m extension.llm.tokenizer.tokenizer -t <tokenizer.model> -o tokenizer.bin
 ```
@@ -414,9 +426,9 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
 Currently we support lowering the stories model to other backends, including CoreML, MPS and QNN. Please refer to the instructions
 for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is:
 
-- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json`
-- MPS: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json`
-- QNN: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json`
+- Lower to CoreML: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json`
+- MPS: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json`
+- QNN: `python -m examples.models.llama2.export_llama --model llama3 -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json`
 
 The iOS LLAMA app supports the CoreML and MPS models, and the Android LLAMA app supports the QNN model. On Android, you can also cross compile the llama runner binary, push it to the device, and run it.
 
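
Note: the README commands above can also be driven from Python. A minimal sketch, assuming the repository root is on the import path (the README itself runs the module with `python -m examples.models.llama2.export_llama`); the checkpoint and params paths are placeholders:

```python
# Hedged sketch: programmatic equivalent of the Llama 3.2 export command above.
# Flag names come from the README; the paths are placeholders, not real files.
from examples.models.llama2.export_llama_lib import build_args_parser, export_llama

parser = build_args_parser()
args = parser.parse_args(
    [
        "--model", "llama3_2",                     # flag added by this commit
        "--checkpoint", "path/to/checkpoint.pth",  # placeholder
        "--params", "path/to/params.json",         # placeholder
        "-kv",                                     # enable the KV cache
        "-X",                                      # lower to XNNPACK
    ]
)
print(export_llama(args))  # export_llama(args) returns the saved .pte filename
```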

examples/models/llama2/export_llama.py

Lines changed: 1 addition & 2 deletions
@@ -20,10 +20,9 @@
 def main() -> None:
     seed = 42
     torch.manual_seed(seed)
-    modelname = "llama2"
     parser = build_args_parser()
     args = parser.parse_args()
-    export_llama(modelname, args)
+    export_llama(args)
 
 
 if __name__ == "__main__":

examples/models/llama2/export_llama_lib.py

Lines changed: 50 additions & 25 deletions
@@ -77,6 +77,10 @@
 verbosity_setting = None
 
 
+EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"]
+TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]
+
+
 class WeightType(Enum):
     LLAMA = "LLAMA"
     FAIRSEQ2 = "FAIRSEQ2"
@@ -112,11 +116,11 @@ def build_model(
     else:
         output_dir_path = "."
 
-    argString = f"--checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}"
+    argString = f"--model {modelname} --checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}"
     parser = build_args_parser()
     args = parser.parse_args(shlex.split(argString))
     # pkg_name = resource_pkg_name
-    return export_llama(modelname, args)
+    return export_llama(args)
 
 
 def build_args_parser() -> argparse.ArgumentParser:
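
For reference, the only behavioral change in `build_model` is where `--model` lands in the generated argument string. A tiny reproduction of that f-string with illustrative values (the `par:` resource-path convention is taken verbatim from the hunk above; the real function fills these values in from its own parameters):

```python
# Illustrative values only; build_model() supplies these from its own arguments.
modelname = "llama3_2"
extra_opts = "-kv -X"
output_dir_path = "."

argString = f"--model {modelname} --checkpoint par:{modelname}_ckpt.pt --params par:{modelname}_params.json {extra_opts} --output-dir {output_dir_path}"
print(argString)
# --model llama3_2 --checkpoint par:llama3_2_ckpt.pt --params par:llama3_2_params.json -kv -X --output-dir .
```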
@@ -126,6 +130,12 @@ def build_args_parser() -> argparse.ArgumentParser:
     # parser.add_argument(
     #     "-q", "--quantized_ckpt", default=None, help="quantized checkpoint file"
     # )
+    parser.add_argument(
+        "--model",
+        default="llama2",
+        choices=EXECUTORCH_DEFINED_MODELS + TORCHTUNE_DEFINED_MODELS,
+        help="The Llama model to export. llama2, llama3, llama3_1, and llama3_2 share the same architecture, so they are technically interchangeable, given you provide the checkpoint file for the desired version.",
+    )
     parser.add_argument(
         "-E",
         "--embedding-quantize",
@@ -456,13 +466,13 @@ def canonical_path(path: Union[str, Path], *, dir: bool = False) -> str:
     return return_val
 
 
-def export_llama(modelname, args) -> str:
+def export_llama(args) -> str:
     if args.profile_path is not None:
         try:
             from executorch.util.python_profiler import CProfilerFlameGraph
 
             with CProfilerFlameGraph(args.profile_path):
-                builder = _export_llama(modelname, args)
+                builder = _export_llama(args)
                 assert (
                     filename := builder.get_saved_pte_filename()
                 ) is not None, "Fail to get file name from builder"
@@ -473,14 +483,14 @@ def export_llama(modelname, args) -> str:
             )
             return ""
     else:
-        builder = _export_llama(modelname, args)
+        builder = _export_llama(args)
         assert (
             filename := builder.get_saved_pte_filename()
         ) is not None, "Fail to get file name from builder"
         return filename
 
 
-def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
+def _prepare_for_llama_export(args) -> LLMEdgeManager:
     """
     Helper function for export_llama. Loads the model from checkpoint and params,
     and sets up a LLMEdgeManager with initial transforms and dtype conversion.
@@ -506,7 +516,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
 
     return (
         _load_llama_model(
-            modelname=modelname,
+            args.model,
             checkpoint=checkpoint_path,
             checkpoint_dir=checkpoint_dir,
             params_path=params_path,
@@ -528,7 +538,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
             args=args,
         )
         .set_output_dir(output_dir_path)
-        .source_transform(_get_source_transforms(modelname, dtype_override, args))
+        .source_transform(_get_source_transforms(args.model, dtype_override, args))
     )
 
 
@@ -566,13 +576,13 @@ def _validate_args(args):
         raise ValueError("Model shard is only supported with qnn backend now.")
 
 
-def _export_llama(modelname, args) -> LLMEdgeManager:  # noqa: C901
+def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
     _validate_args(args)
     pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
 
     # export_to_edge
     builder_exported_to_edge = (
-        _prepare_for_llama_export(modelname, args)
+        _prepare_for_llama_export(args)
         .capture_pre_autograd_graph()
         .pt2e_quantize(quantizers)
         .export_to_edge()
@@ -746,8 +756,8 @@ def _load_llama_model_metadata(
 
 
 def _load_llama_model(
+    modelname: str,
     *,
-    modelname: str = "llama2",
     checkpoint: Optional[str] = None,
     checkpoint_dir: Optional[str] = None,
     params_path: str,
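
The hunk above moves `modelname` in front of the keyword-only marker, making it a required positional parameter; that is why `_prepare_for_llama_export` now passes `args.model` positionally. A minimal sketch of just that calling convention (`_load_llama_model_stub` is a hypothetical stand-in, not the real function):

```python
def _load_llama_model_stub(modelname: str, *, checkpoint=None, params_path: str):
    # modelname is positional-or-keyword and required; everything after the bare *
    # must still be passed by keyword, as before this commit.
    return modelname, checkpoint, params_path


_load_llama_model_stub("llama3_2", checkpoint="ckpt.pth", params_path="params.json")
```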
@@ -774,26 +784,41 @@ def _load_llama_model(
     Returns:
         An instance of LLMEdgeManager which contains the eager mode model.
     """
+
     assert (
         checkpoint or checkpoint_dir
     ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty"
     logging.info(
         f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}"
     )
-    model, example_inputs, _ = EagerModelFactory.create_model(
-        "llama2",
-        "Llama2Model",
-        checkpoint=checkpoint,
-        checkpoint_dir=checkpoint_dir,
-        params=params_path,
-        use_kv_cache=use_kv_cache,
-        use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
-        generate_full_logits=generate_full_logits,
-        fairseq2=weight_type == WeightType.FAIRSEQ2,
-        max_seq_len=max_seq_len,
-        enable_dynamic_shape=enable_dynamic_shape,
-        output_prune_map_path=output_prune_map_path,
-        args=args,
+
+    if modelname in EXECUTORCH_DEFINED_MODELS:
+        # Set to llama2 because all models in EXECUTORCH_DEFINED_MODELS share the same architecture as
+        # defined in examples/models/llama2.
+        modelname = "llama2"
+        model_class_name = "Llama2Model"
+    elif modelname in TORCHTUNE_DEFINED_MODELS:
+        if modelname == "llama3_2_vision":
+            model_class_name = "Llama3_2Decoder"
+    else:
+        raise ValueError(f"{modelname} is not a valid Llama model.")
+
+    model, (example_inputs, example_kwarg_inputs), dynamic_shapes = (
+        EagerModelFactory.create_model(
+            modelname,
+            model_class_name,
+            checkpoint=checkpoint,
+            checkpoint_dir=checkpoint_dir,
+            params=params_path,
+            use_kv_cache=use_kv_cache,
+            use_sdpa_with_kv_cache=use_sdpa_with_kv_cache,
+            generate_full_logits=generate_full_logits,
+            fairseq2=weight_type == WeightType.FAIRSEQ2,
+            max_seq_len=max_seq_len,
+            enable_dynamic_shape=enable_dynamic_shape,
+            output_prune_map_path=output_prune_map_path,
+            args=args,
+        )
     )
     if dtype_override:
         assert isinstance(
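
The model-selection logic added to `_load_llama_model` can also be read in isolation. A self-contained sketch of the same dispatch (the lists and class names are copied from this diff; `resolve_model_entry` is a hypothetical helper, and `EagerModelFactory` is not reproduced here):

```python
EXECUTORCH_DEFINED_MODELS = ["llama2", "llama3", "llama3_1", "llama3_2"]
TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]


def resolve_model_entry(modelname: str) -> tuple:
    """Map a --model value to the (module name, class name) pair given to EagerModelFactory."""
    if modelname in EXECUTORCH_DEFINED_MODELS:
        # All ExecuTorch-defined Llama variants share the architecture defined in
        # examples/models/llama2, so they all load through Llama2Model.
        return "llama2", "Llama2Model"
    if modelname in TORCHTUNE_DEFINED_MODELS and modelname == "llama3_2_vision":
        return modelname, "Llama3_2Decoder"
    raise ValueError(f"{modelname} is not a valid Llama model.")


print(resolve_model_entry("llama3_1"))         # ('llama2', 'Llama2Model')
print(resolve_model_entry("llama3_2_vision"))  # ('llama3_2_vision', 'Llama3_2Decoder')
```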
