Torchat tiktoken (#127)

mikekgfb · malfet · commit d0588e2f3bea · 2024-07-16T23:03:10.000-07:00
* add torchat.py and --tiktoken option

* add default device to torchat

* dtype handling for export_et

* handle dtype args
diff --git a/export.py b/export.py
@@ -60,7 +60,11 @@ def forward(self, idx, input_pos):
         return logits  # sample(logits, **sampling_kwargs)
 
 
-def main(checkpoint_path, device, quantize = "{ }", args = None):
+def main(args):
+    checkpoint_path = args.checkpoint_path
+    device = args.device
+    quantize = args.quantize
+
     assert checkpoint_path.is_file(), checkpoint_path
 
     print(f"Using device={device}")
@@ -201,7 +205,7 @@ def cli():
 
 
     args = parser.parse_args()
-    main(args.checkpoint_path, args.device, args.quantize, args)
+    main(args)
 
 if __name__ == "__main__":
     cli()
diff --git a/export_et.py b/export_et.py
@@ -13,6 +13,7 @@
 
 from generate import _load_model, decode_one_token
 from quantize import quantize_model
+from quantize import quantize_model, name_to_dtype, set_precision, get_precision
 
 from model import Transformer
 # from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
@@ -92,23 +93,25 @@ def export_model(model, device, output_path, args=None) -> str:  # noqa: C901
     # need to use kv sdpa?
+    # Resolve the requested precision first: edge_config below reads it.
+    target_precision = get_precision()
     edge_config = EdgeCompileConfig(
         _check_ir_validity=False,
-        _skip_type_promotion=bool(args.dtype == "fp16"),
+        _skip_type_promotion=bool(target_precision == torch.float16),
     )
 
     dynamic_shapes = None
 
-    if args.dtype is not None:
-        if args.dtype == "fp16": # or args.quantization_mode == "int4":
-            if state_dict_dtype != torch.float16:
-                print("model.to torch.float16")
-                model = model.to(dtype=torch.float16)
-                state_dict_dtype = torch.float16
-        elif args.dtype == "fp32":
-            if state_dict_dtype != torch.float32:
-                print("model.to torch.float32")
-                model = model.to(dtype=torch.float32)
-        else:
-            raise ValueError(f"Unsupported dtype: {args.dtype}")
+    # Convert the model only if state_dict_dtype does not already match the target.
+    if target_precision == torch.float16: # or args.quantization_mode=="int4":
+        if state_dict_dtype != torch.float16:
+            print("model.to torch.float16")
+            model = model.to(dtype=torch.float16)
+            state_dict_dtype = torch.float16
+    elif target_precision == torch.float32:
+        if state_dict_dtype != torch.float32:
+            print("model.to torch.float32")
+            model = model.to(dtype=torch.float32)
+    else:
+        raise ValueError(f"Unsupported dtype for ET export: {target_precision}")
 
     with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]), torch.no_grad():
         m = capture_pre_autograd_graph(
diff --git a/generate.py b/generate.py
@@ -336,7 +336,7 @@ def _load_model(
 B_INST, E_INST = "[INST]", "[/INST]"
 
 
-def main(
+def _main(
     prompt: str = "Hello, my name is",
     interactive: bool = False,
     num_samples: int = 5,
@@ -357,6 +357,7 @@ def main(
     pte_path=None,
     quantize=None,
     model_dtype=None,
+    use_tiktoken=False,
 ) -> None:
     """Generates text samples based on a pre-trained Transformer model and tokenizer."""
     assert (
@@ -573,6 +574,28 @@ def callback(x):
     )
     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
 
+def main(args):  # adapter: forwards attributes of the parsed args object to _main positionally
+    _main(  # NOTE(review): the cli() call this replaces also passed checkpoint_dir and params_path -- confirm this order against _main's signature
+        args.prompt,
+        args.interactive,
+        args.num_samples,
+        args.max_new_tokens,
+        args.top_k,
+        args.temperature,
+        args.checkpoint_path,
+        args.tokenizer_path,
+        args.compile,
+        args.compile_prefill,
+        args.profile,
+        args.draft_checkpoint_path,
+        args.speculate_k,
+        args.device,
+        args.dso_path,
+        args.pte_path,
+        args.quantize,
+        args.dtype,  # presumably meant for _main's model_dtype parameter -- verify
+        args.tiktoken  # presumably meant for _main's use_tiktoken parameter -- verify
+    )
 
 def cli():
     import argparse
@@ -672,35 +695,20 @@ def cli():
         default="float32",
         help="Override the dtype of the model (default is the checkpoint dtype). Options: bf16, fp16, fp32",
     )
+    parser.add_argument(
+        "--tiktoken",
+        action="store_true",
+        help="Whether to use tiktoken tokenizer.",
+    )
 
 
     args = parser.parse_args()
 
     if args.seed:
               torch.manual_seed(args.seed)
 
-    main(
-        args.prompt,
-        args.interactive,
-        args.num_samples,
-        args.max_new_tokens,
-        args.top_k,
-        args.temperature,
-        args.checkpoint_path,
-        args.checkpoint_dir,
-        args.params_path,
-        args.tokenizer_path,
-        args.compile,
-        args.compile_prefill,
-        args.profile,
-        args.draft_checkpoint_path,
-        args.speculate_k,
-        args.device,
-        args.dso_path,
-        args.pte_path,
-        args.quantize,
-        args.dtype,
-    )
+    main(args)
+
 
 if __name__ == "__main__":
         cli()
diff --git a/torchat.py b/torchat.py
@@ -0,0 +1,153 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import time
+import os
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+from torch.export import Dim, export
+
+from export import main as export_main
+from generate import main as generate_main
+
+default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
+
+def cli():
+    """Parse the command line and run text generation or model export.
+
+    The parsed namespace is passed unchanged to ``generate.main`` when
+    ``--generate`` is set, otherwise to ``export.main`` when ``--export``
+    is set.
+
+    Raises:
+        RuntimeError: if neither ``--generate`` nor ``--export`` is given.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Your CLI description.")
+
+    parser.add_argument(
+        "--prompt", type=str, default="Hello, my name is", help="Input prompt."
+    )
+    parser.add_argument(
+        "--interactive",
+        action="store_true",
+        help="Whether to launch in interactive mode",
+    )
+    parser.add_argument(
+        "--tiktoken",
+        action="store_true",
+        help="Whether to use tiktoken tokenizer.",
+    )
+    parser.add_argument(
+        "--export",
+        action="store_true",
+        help="Use torchat to export a model.",
+    )
+    parser.add_argument(
+        "--generate",
+        action="store_true",
+        help="Use torchat to generate a sequence using a model.",
+    )
+    parser.add_argument("--num-samples", type=int, default=5, help="Number of samples.")
+    parser.add_argument(
+        "--max-new-tokens", type=int, default=200, help="Maximum number of new tokens."
+    )
+    parser.add_argument("--top-k", type=int, default=200, help="Top-k for sampling.")
+    parser.add_argument(
+        "--temperature", type=float, default=0.8, help="Temperature for sampling."
+    )
+    parser.add_argument(
+        "--compile", action="store_true", help="Whether to compile the model."
+    )
+    parser.add_argument(
+        "--compile-prefill",
+        action="store_true",
+        help="Whether to compile the prefill (improves prefill perf, but higher compile times)",
+    )
+    parser.add_argument(
+        "--profile", type=Path, default=None, help="Profile path.")
+    parser.add_argument(
+        "--speculate-k", type=int, default=5, help="Speculative execution depth."
+    )
+    parser.add_argument(
+        "--draft-checkpoint-path",
+        type=Path,
+        default=None,
+        help="Draft checkpoint path.",
+    )
+    # The dispatch code below reads args.seed, so the option must exist.
+    parser.add_argument(
+        "--seed", type=int, default=None, help="Random seed for torch."
+    )
+    # generate.main reads args.tokenizer_path, args.dso_path and args.pte_path;
+    # without these options --generate fails with AttributeError.
+    # NOTE(review): the types and None defaults are assumptions -- align them
+    # with the matching options in generate.py's own cli().
+    parser.add_argument(
+        "--tokenizer-path", type=Path, default=None, help="Tokenizer model path."
+    )
+    parser.add_argument(
+        "--dso-path", type=Path, default=None, help="DSO model path."
+    )
+    parser.add_argument(
+        "--pte-path", type=Path, default=None, help="PTE model path."
+    )
+    #####################################################################
+
+    parser.add_argument(
+        "--checkpoint-path",
+        type=Path,
+        default="not_specified",
+        help="Model checkpoint path.",
+    )
+    parser.add_argument(
+        "--output-pte-path",
+        type=str,
+        default=None,
+        help="Filename"
+    )
+    parser.add_argument(
+        "--output-dso-path",
+        type=str,
+        default=None,
+        help="Filename"
+    )
+    parser.add_argument(
+        "-d",
+        "--dtype",
+        default=None,
+        help="Override the dtype of the model (default is the checkpoint dtype). Options: bf16, fp16, fp32",
+    )
+    parser.add_argument("-v", "--verbose", action="store_true")
+    parser.add_argument(
+        "--quantize",
+        type=str,
+        default="{ }",
+        help="Quantization options."
+    )
+    parser.add_argument(
+        "--device", type=str, default=default_device, help="Device to use"
+    )
+
+
+    args = parser.parse_args()
+
+    # Compare against None so that an explicit --seed 0 is honoured.
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+
+    if args.generate:
+        generate_main(args)
+    elif args.export:
+        export_main(args)
+    else:
+        raise RuntimeError("must specify either --generate or --export")
+
+if __name__ == "__main__":
+    cli()