
Commit bd01667

mikekgfb authored and malfet committed
add --parallel-prefill options & option validation (#368)
* add --parallel-prefill options, option validation, and refactor option validation
* handle model is None for model validation
* typo
* move model compile options to generator args
* typo
* typo
* typo
* refactor
* update eval
1 parent 5480866 · commit bd01667

File tree

build/builder.py
cli.py
download.py
eval.py
export_et_util.py
generate.py
quantize.py

7 files changed: +99 -40 lines

build/builder.py

Lines changed: 26 additions & 14 deletions
@@ -88,7 +88,9 @@ def from_args(cls, args): # -> BuilderArgs:
         )
         # The transformers config is keyed on the last section
         # of the name/path.
-        params_table = model_config.transformer_params_key or model_config.name.split("/")[-1]
+        params_table = (
+            model_config.transformer_params_key or model_config.name.split("/")[-1]
+        )

         is_chat_model = False
         if args.is_chat_model:
@@ -143,6 +145,24 @@ class TokenizerArgs:
     is_sentencepiece: bool = True
     is_tiktoken: bool = False

+    def validate_model(
+        self,
+        model: Transformer,
+        model_description: str = "model",
+    ):
+        if model is None:
+            return
+
+        use_tiktoken = model.config.use_tiktoken
+        is_tiktoken = self.is_tiktoken
+
+        if use_tiktoken is None:
+            model.config.use_tiktoken = is_tiktoken
+        elif use_tiktoken != is_tiktoken:
+            raise RuntimeError(
+                f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)} does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)} for {model_description}"
+            )
+
     @classmethod
     def from_args(cls, args): # -> TokenizerArgs:
         is_sentencepiece = True
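The new TokenizerArgs.validate_model replaces the free-standing validate_args removed later in this file's diff: it reconciles the tokenizer type the model config expects with the tokenizer actually loaded, adopting the loaded one when the model is agnostic, and it now tolerates model=None (the draft model is optional). A minimal sketch of that behavior, using hypothetical stand-in classes rather than torchchat's real Transformer and config types:

from dataclasses import dataclass
from typing import Optional

@dataclass
class StubConfig:  # stand-in: None means the model does not pin a tokenizer
    use_tiktoken: Optional[bool] = None

@dataclass
class StubModel:  # stand-in for torchchat's Transformer
    config: StubConfig

def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
    return "TikToken" if tiktoken else "SentencePiece"

def validate_model(model, is_tiktoken: bool, model_description: str = "model"):
    # Mirrors TokenizerArgs.validate_model: skip absent models, adopt the
    # loaded tokenizer if the model is agnostic, otherwise require agreement.
    if model is None:
        return
    if model.config.use_tiktoken is None:
        model.config.use_tiktoken = is_tiktoken
    elif model.config.use_tiktoken != is_tiktoken:
        raise RuntimeError(
            f"model-specified tokenizer "
            f"({tokenizer_setting_to_name(model.config.use_tiktoken)}) does not "
            f"match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}) "
            f"for {model_description}"
        )

validate_model(None, is_tiktoken=True)   # absent draft model: no-op
m = StubModel(StubConfig())
validate_model(m, is_tiktoken=True)      # agnostic model adopts TikToken
assert m.config.use_tiktoken is True
try:
    validate_model(m, is_tiktoken=False, model_description="draft model")
except RuntimeError as e:
    print(e)                             # mismatch is reported, not ignored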
@@ -152,7 +172,11 @@ def from_args(cls, args): # -> TokenizerArgs:
         tokenizer_path = args.tokenizer_path
     elif args.model: # Using a named, well-known model
         model_config = resolve_model_config(args.model)
-        tokenizer_path = Path(args.model_directory) / model_config.name / model_config.tokenizer_file
+        tokenizer_path = (
+            Path(args.model_directory)
+            / model_config.name
+            / model_config.tokenizer_file
+        )

     elif args.checkpoint_path:
         tokenizer_path = args.checkpoint_path.parent / "tokenizer.model"
@@ -365,18 +389,6 @@ def tokenizer_setting_to_name(tiktoken: bool = False) -> str:
     return "TikToken" if tiktoken else "SentencePiece"


-def validate_args(model: Transformer, tokenizer_args: TokenizerArgs):
-    use_tiktoken = model.config.use_tiktoken
-    is_tiktoken = tokenizer_args.is_tiktoken
-
-    if use_tiktoken is None:
-        model.config.use_tiktoken = is_tiktoken
-    elif use_tiktoken != is_tiktoken:
-        raise RuntimeError(
-            f"model-specified tokenizer ({tokenizer_setting_to_name(use_tiktoken)} does not match provided tokenizer ({tokenizer_setting_to_name(is_tiktoken)}"
-        )
-
-
 def resolve_model_name(model: str) -> str:
     # If the provided model name is an alias, retrieve the full path.
     if model in model_aliases:

cli.py

Lines changed: 6 additions & 1 deletion
@@ -131,7 +131,12 @@ def add_arguments(parser):
     parser.add_argument(
         "--compile-prefill",
         action="store_true",
-        help="Whether to compile the prefill. Improves prefill perf, but has higher compile times.",
+        help="Whether to compile the prefill. Improves prefill perf, but has higher compile times. (Requires `--parallel-prefill`)",
+    )
+    parser.add_argument(
+        "--parallel-prefill",
+        action="store_true",
+        help="Whether to perform prefill in parallel, or one token at a time. Improves prefill perf. DSO and PTE models presently do not support parallel prefill.",
     )
     parser.add_argument(
         "--profile",

download.py

Lines changed: 7 additions & 4 deletions
@@ -42,10 +42,11 @@ def _download_hf_snapshot(
     else:
         raise e

-
     # Convert the model to the torchchat format.
     print(f"Converting {model_config.name} to torchchat format...")
-    convert_hf_checkpoint(model_dir=artifact_dir, model_name=model_config.name, remove_bin_files=True)
+    convert_hf_checkpoint(
+        model_dir=artifact_dir, model_name=model_config.name, remove_bin_files=True
+    )


 def _download_direct(
@@ -79,13 +80,15 @@ def download_and_convert(
         == ModelDistributionChannel.HuggingFaceSnapshot
     ):
         _download_hf_snapshot(model_config, temp_dir, hf_token)
-    elif model_config.distribution_channel == ModelDistributionChannel.DirectDownload:
+    elif (
+        model_config.distribution_channel == ModelDistributionChannel.DirectDownload
+    ):
         _download_direct(model_config, temp_dir)
     else:
         raise RuntimeError(
             f"Unknown distribution channel {model_config.distribution_channel}."
         )
-
+
     # Move from the temporary directory to the intended location,
     # overwriting if necessary.
     if os.path.isdir(model_dir):

eval.py

Lines changed: 1 addition & 2 deletions
@@ -16,7 +16,6 @@
     _initialize_tokenizer,
     BuilderArgs,
     TokenizerArgs,
-    validate_args,
 )

 from build.model import Transformer
@@ -245,7 +244,7 @@ def main(args) -> None:
         quantize,
         tokenizer,
     )
-    validate_args(model, tokenizer_args)
+    tokenizer_args.validate_model(model)

     if compile:
         assert not (

export_et_util.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 import torch
 from build.model import apply_rotary_emb, Attention
-from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache
+
+# from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache
 from torch import nn


generate.py

Lines changed: 53 additions & 15 deletions
@@ -22,7 +22,6 @@
     _initialize_tokenizer,
     BuilderArgs,
     TokenizerArgs,
-    validate_args,
 )
 from build.model import Transformer
 from build.utils import device_sync, set_precision
@@ -47,6 +46,31 @@ class GeneratorArgs:
     compile: bool = False
     compile_prefill: bool = False
     speculate_k: int = 5
+    sequential_prefill: bool = True
+
+    def __post_init__(self):
+        if self.compile_prefill and self.sequential_prefill:
+            raise RuntimeError("prefill compilation requires parallel prefill")
+
+    def validate_build(
+        self, builder_args: BuilderArgs, model_description: str = "model"
+    ):
+        reason = ""
+        model_type = ""
+        if not self.sequential_prefill:
+            reason = "parallel prefill"
+        if self.compile_prefill:
+            reason = "model compilation for prefill"
+        if self.compile:
+            reason = "model compilation"
+        if builder_args.dso_path:
+            model_type = "DSO"
+        if builder_args.pte_path:
+            model_type = "PTE"
+        if model_type and reason:
+            raise RuntimeError(
+                f"cannot perform {reason} because a {model_type} {model_description} is used"
+            )

     @classmethod
     def from_args(cls, args): # -> GeneratorArgs:
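In validate_build above, later assignments overwrite earlier ones, so the most demanding requested feature becomes the reported reason (and PTE wins over DSO as the model type); the error fires only when a reason and a model type are both set, since exported DSO/PTE artifacts support neither torch.compile nor parallel prefill. A small illustration with a hypothetical stand-in for BuilderArgs:

from dataclasses import dataclass

@dataclass
class StubBuilderArgs:  # hypothetical stand-in for torchchat's BuilderArgs
    dso_path: str = ""
    pte_path: str = ""

def validate_build(builder_args, *, sequential_prefill=True,
                   compile_prefill=False, compile=False,
                   model_description="model"):
    # Same shape as GeneratorArgs.validate_build above.
    reason, model_type = "", ""
    if not sequential_prefill:
        reason = "parallel prefill"
    if compile_prefill:
        reason = "model compilation for prefill"
    if compile:
        reason = "model compilation"
    if builder_args.dso_path:
        model_type = "DSO"
    if builder_args.pte_path:
        model_type = "PTE"
    if model_type and reason:
        raise RuntimeError(
            f"cannot perform {reason} because a {model_type} {model_description} is used"
        )

validate_build(StubBuilderArgs(), compile=True)  # eager model: fine
try:
    validate_build(StubBuilderArgs(dso_path="model.so"), compile=True)
except RuntimeError as e:
    print(e)  # cannot perform model compilation because a DSO model is used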
@@ -62,6 +86,7 @@ def from_args(cls, args): # -> GeneratorArgs:
             compile=args.compile,
             compile_prefill=args.compile_prefill,
             speculate_k=args.speculate_k,
+            sequential_prefill=not args.parallel_prefill,
         )


@@ -116,7 +141,6 @@ def prefill(
     logging.debug(f"x: {x}, input_pos: {input_pos}")
     width = x.size(1)
     assert input_pos.size(0) == width
-    sequential_prefill = True

     if sequential_prefill:
         for i in range(width):
@@ -244,6 +268,7 @@ def generate(
     chat_mode: bool,
     draft_model: Transformer,
     speculate_k: Optional[int] = 8,
+    sequential_prefill=True,
     callback=lambda x: x,
     **sampling_kwargs,
 ) -> torch.Tensor:
@@ -276,9 +301,21 @@ def generate(
     seq = empty
     input_pos = torch.arange(0, T, device=device, dtype=torch.int)

-    next_token = prefill(model, prompt.view(1, -1), input_pos, **sampling_kwargs)
+    next_token = prefill(
+        model,
+        prompt.view(1, -1),
+        input_pos,
+        sequential_prefill=sequential_prefill,
+        **sampling_kwargs,
+    )
     if is_speculative:
-        prefill(draft_model, prompt.view(1, -1), input_pos, **sampling_kwargs)
+        prefill(
+            draft_model,
+            prompt.view(1, -1),
+            input_pos,
+            sequential_prefill=sequential_prefill,
+            **sampling_kwargs,
+        )
     seq[T] = next_token

     input_pos = torch.tensor([T], device=device, dtype=torch.int)
@@ -355,11 +392,9 @@ def _main(
     speculative_builder_args: BuilderArgs,
     tokenizer_args: TokenizerArgs,
     generator_args: GeneratorArgs,
-    compile: bool = True,
-    compile_prefill: bool = False,
-    profile: Optional[Path] = None,
-    quantize=None,
-    draft_quantize=None,
+    profile: Optional[Path],
+    quantize,
+    draft_quantize,
 ) -> None:
     """
     Generates text samples based on a pre-trained Transformer model and tokenizer.
@@ -398,7 +433,6 @@ def _main(

     builder_args.setup_caches = False
     model = _initialize_model(builder_args, quantize, tokenizer)
-    validate_args(model, tokenizer_args)

     # will add a version of _initialize_model in future
     # (need additional args)
@@ -411,6 +445,11 @@ def _main(
     else:
         draft_model = None

+    tokenizer_args.validate_model(model)
+    tokenizer_args.validate_model(draft_model, "draft model")
+    generator_args.validate_build(builder_args)
+    generator_args.validate_build(speculative_builder_args, "draft model")
+
     encoded = encode_tokens(
         tokenizer, generator_args.prompt, bos=True, device=builder_args.device
     )
@@ -423,7 +462,7 @@ def _main(
             for p in itertools.chain(model.parameters(), model.buffers())
         ]
     )
-    if compile:
+    if generator_args.compile:
         if (
             is_speculative and builder_args.use_tp
         ): # and ("cuda" in builder_args.device):
@@ -443,14 +482,14 @@ def _main(
         )

     # Uncomment to squeeze more perf out of prefill
-    if compile_prefill:
+    if generator_args.compile_prefill:
         prefill = torch.compile(prefill, fullgraph=True, dynamic=True)

     aggregate_metrics = {
         "tokens_per_sec": [],
         "accept_counts": [],
     }
-    start = -1 if compile else 0
+    start = -1 if generator_args.compile else 0

     for i in range(start, generator_args.num_samples):
         device_sync(device=builder_args.device)
@@ -506,6 +545,7 @@ def callback(x):
             callback=callback,
             temperature=generator_args.temperature,
             top_k=generator_args.top_k,
+            sequential_prefill=generator_args.sequential_prefill,
         )
         aggregate_metrics["accept_counts"].append(metrics["accept_counts"])
         if i == -1:
@@ -560,8 +600,6 @@ def main(args):
         speculative_builder_args,
         tokenizer_args,
         generator_args,
-        args.compile,
-        args.compile_prefill,
         args.profile,
         args.quantize,
         args.draft_quantize,
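For intuition about what the sequential_prefill flag toggles in prefill above: sequential prefill pushes the prompt through the model one token per forward call, while parallel prefill processes the whole prompt in a single call. A toy sketch with a hypothetical stand-in model (the real prefill in generate.py also samples and returns the next token):

import torch

VOCAB = 32000  # hypothetical vocabulary size

def stub_model(x: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor:
    # Stand-in for Transformer.forward: (batch, seq) token ids -> logits.
    return torch.randn(x.size(0), x.size(1), VOCAB)

def toy_prefill(x: torch.Tensor, input_pos: torch.Tensor,
                sequential_prefill: bool = True) -> torch.Tensor:
    width = x.size(1)
    assert input_pos.size(0) == width
    if sequential_prefill:
        # One token per forward call, as in the sequential branch above.
        for i in range(width):
            logits = stub_model(x[:, i : i + 1], input_pos[i].view(-1))
    else:
        # Whole prompt in one forward call: fewer, larger launches.
        logits = stub_model(x, input_pos)
    return logits[:, -1]  # logits at the next-token position

prompt = torch.randint(0, VOCAB, (1, 8))
pos = torch.arange(0, 8, dtype=torch.int)
toy_prefill(prompt, pos, sequential_prefill=True)
toy_prefill(prompt, pos, sequential_prefill=False)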

quantize.py

Lines changed: 4 additions & 3 deletions
@@ -7,9 +7,10 @@
 from __future__ import annotations

 import json
-from functools import reduce
-from math import gcd
-from typing import Dict, Optional, Tuple
+
+# from functools import reduce
+# from math import gcd
+from typing import Dict, Optional

 import torch
 import torch.nn as nn
