Commit 70fd1fe

Qwen runs with HF tokenizer
1 parent 88b3394 commit 70fd1fe

6 files changed: +59 -39 lines changed

examples/models/llama/install_requirements.sh

Lines changed: 5 additions & 7 deletions
@@ -5,14 +5,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+# Install sentencepiece for llama tokenizer.
+# Install tiktoken for tokenizer.
+# Install tokenizers for hf .json tokenizer.
 # Install snakeviz for cProfile flamegraph
-# Install sentencepiece for llama tokenizer
-pip install snakeviz sentencepiece
-
-# Install lm-eval for Model Evaluation with lm-evalution-harness
-# Install tiktoken for tokenizer
-pip install lm_eval==0.4.5
-pip install tiktoken blobfile
+# Install lm-eval for Model Evaluation with lm-evalution-harness.
+pip install tiktoken sentencepiece tokenizers snakeviz lm_eval==0.4.5 blobfile

 # Call the install helper for further setup
 python examples/models/llama/install_requirement_helper.py
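
As a quick local sanity check (a sketch, not part of the commit), the tokenizer backends pulled in by the consolidated pip install should all be importable afterwards:

    import sentencepiece
    import tiktoken  # noqa: F401
    import tokenizers

    print("sentencepiece", sentencepiece.__version__)
    print("tokenizers", tokenizers.__version__)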

examples/models/llama/runner/generation.py

Lines changed: 17 additions & 16 deletions
@@ -48,7 +48,9 @@ def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int:
 class LlamaRunner(ABC):
     def __init__(
         self,
+        *,
         tokenizer_path: str,
+        tokenizer_config_path: Optional[str] = None,
         max_seq_len: int,
         max_batch_size: int,
         use_kv_cache: bool,
@@ -59,20 +61,23 @@ def __init__(
         Constructor.

         Args:
-            tokenizer_path: path to tokenizer.model file.
-            max_seq_len: max length of the output sequence, after which the output will be clipped.
-            max_batch_size: max batch size.
-            use_kv_cache: whether to use a KV cache.
-            vocab_size: number of items in the vocab.
-            device: device to run the runner on.
+            tokenizer_path: path to tokenizer.model file.
+            max_seq_len: max length of the output sequence, after which the output will be clipped.
+            max_batch_size: max batch size.
+            use_kv_cache: whether to use a KV cache.
+            vocab_size: number of items in the vocab.
+            device: device to run the runner on.
         """
         self.max_seq_len = max_seq_len
         self.max_batch_size = max_batch_size
         self.use_kv_cache = use_kv_cache
-        self.tokenizer = get_tokenizer(tokenizer_path)
+        self.tokenizer = get_tokenizer(tokenizer_path, tokenizer_config_path)
         self.device = device
-        # For qwen anything above 151646 is "useless": https://github.com/QwenLM/Qwen2.5/issues/466#issuecomment-2146759706
-        # assert vocab_size == self.tokenizer.n_words
+        # For some models like qwen, mismatch is acceptable: https://github.com/QwenLM/Qwen2.5/issues/466#issuecomment-2146759706
+        if vocab_size != self.tokenizer.n_words:
+            print(
+                "Warning - given vocab_size in params is unequal to tokenizer vocab size."
+            )

     @abstractmethod
     def forward(
@@ -102,8 +107,7 @@ def generate( # noqa: C901
         )

         current_token = next_token(logits, temperature, top_p)
-        # print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
-        print(f"{self.tokenizer.decode([current_token])}", end="", flush=True)
+        print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
         tokens = prompt_tokens + [current_token]

         while len(tokens) < max_seq_len:
@@ -133,8 +137,7 @@ def generate( # noqa: C901
             ):
                 break

-            # print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
-            print(f"{self.tokenizer.decode([current_token])}", end="", flush=True)
+            print(f"{self.tokenizer.decode_token(current_token)}", end="", flush=True)
         print("\n")

         return tokens if echo else tokens[len(prompt_tokens) :]
@@ -200,9 +203,7 @@ def chat_completion(
         # prompt_tokens = self.tokenizer.encode(
         #     self._format_prompt(prompt), bos=True, eos=False
         # )
-        prompt_tokens = self.tokenizer.encode(
-            self._format_prompt(prompt)
-        ).ids
+        prompt_tokens = self.tokenizer.encode(self._format_prompt(prompt)).ids
         generated_tokens = self.generate(
             prompt_tokens=pre_stop_token + prompt_tokens,
             max_seq_len=max_seq_len,
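
The constructor is now keyword-only (note the bare *), which is why the TorchTuneLlamaRunner call site below also switches to named arguments. A standalone sketch of the calling convention after this change (RunnerSketch and the paths are hypothetical stand-ins, not the executorch class):

    from typing import Optional


    class RunnerSketch:
        # Mirrors the keyword-only signature of LlamaRunner.__init__ above.
        def __init__(
            self,
            *,
            tokenizer_path: str,
            tokenizer_config_path: Optional[str] = None,
            max_seq_len: int,
            max_batch_size: int,
            use_kv_cache: bool,
            vocab_size: int,
            device: str = "cpu",
        ):
            self.tokenizer_path = tokenizer_path
            self.tokenizer_config_path = tokenizer_config_path


    # Positional calls such as RunnerSketch("tokenizer.json", None, 128, ...) now
    # raise TypeError; every argument must be spelled out by name.
    runner = RunnerSketch(
        tokenizer_path="tokenizer.json",
        tokenizer_config_path="tokenizer_config.json",
        max_seq_len=128,
        max_batch_size=1,
        use_kv_cache=True,
        vocab_size=32000,
    )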

examples/models/llama/runner/native.py

Lines changed: 16 additions & 0 deletions
@@ -37,6 +37,7 @@ def __init__(self, args):
             params = json.loads(f.read())
         super().__init__(
             tokenizer_path=args.tokenizer,
+            tokenizer_config_path=args.tokenizer_config,
             max_seq_len=args.max_len,
             max_batch_size=1,
             use_kv_cache=args.kv_cache,
@@ -56,6 +57,14 @@ def forward(
         )[0]


+def validate_args(args) -> None:
+    if args.tokenizer and args.tokenizer.endswith(".json"):
+        if not args.tokenizer_config:
+            raise TypeError(
+                "Json tokenizers require an accompanying tokenizer config (--tokenizer_config) to be specified."
+            )
+
+
 def build_args_parser() -> argparse.ArgumentParser:
     # TODO: merge these with build_args_parser from export_llama_lib.
     parser = argparse.ArgumentParser()
@@ -85,6 +94,12 @@ def build_args_parser() -> argparse.ArgumentParser:
         default=None,
     )

+    parser.add_argument(
+        "--tokenizer_config",
+        type=str,
+        default=None,
+    )
+
     parser.add_argument(
         "--prompt",
         type=str,
@@ -116,6 +131,7 @@ def build_args_parser() -> argparse.ArgumentParser:
 def main() -> None:
     parser = build_args_parser()
     args = parser.parse_args()
+    validate_args(args)
     runner = NativeLlamaRunner(args)
     generated_tokens = runner.text_completion(
         prompt=args.prompt,
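
A small sketch of the new validation step (validate_args is copied from the diff above; the Namespace objects stand in for parsed CLI arguments): a .json tokenizer passed without --tokenizer_config is rejected before the runner is built.

    from argparse import Namespace


    def validate_args(args) -> None:
        if args.tokenizer and args.tokenizer.endswith(".json"):
            if not args.tokenizer_config:
                raise TypeError(
                    "Json tokenizers require an accompanying tokenizer config (--tokenizer_config) to be specified."
                )


    # A sentencepiece/tiktoken model file needs no config.
    validate_args(Namespace(tokenizer="tokenizer.model", tokenizer_config=None))

    # A Hugging Face tokenizer.json without its config fails fast.
    try:
        validate_args(Namespace(tokenizer="tokenizer.json", tokenizer_config=None))
    except TypeError as err:
        print(err)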

examples/models/llama3_2_vision/runner/generation.py

Lines changed: 7 additions & 6 deletions
@@ -13,6 +13,7 @@
 class TorchTuneLlamaRunner(LlamaRunner):
     def __init__(
         self,
+        *,
         tokenizer_path: str,
         max_seq_len: int,
         max_batch_size: int,
@@ -21,12 +22,12 @@ def __init__(
         device: str = "cpu",
     ):
         super().__init__(
-            tokenizer_path,
-            max_seq_len,
-            max_batch_size,
-            use_kv_cache,
-            vocab_size,
-            device,
+            tokenizer_path=tokenizer_path,
+            max_seq_len=max_seq_len,
+            max_batch_size=max_batch_size,
+            use_kv_cache=use_kv_cache,
+            vocab_size=vocab_size,
+            device=device,
         )

         self.causal_mask = torch.tril(

extension/llm/tokenizer/hf_tokenizer.py

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 import re
 from typing import Dict, List, Optional

+
 class HFTokenizer:
     def __init__(self):
         self.special_token_encoder: Dict[str, int] = {}

extension/llm/tokenizer/utils.py

Lines changed: 13 additions & 10 deletions
@@ -4,29 +4,32 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import json
+from typing import Optional
+
 from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken
+from executorch.extension.llm.tokenizer.hf_tokenizer import HFTokenizer
 from executorch.extension.llm.tokenizer.tokenizer import (
     Tokenizer as SentencePieceTokenizer,
 )
-from executorch.extension.llm.tokenizer.hf_tokenizer import HFTokenizer


-def get_tokenizer(tokenizer_path):
+def get_tokenizer(tokenizer_path: str, tokenizer_config_path: Optional[str] = None):
     if tokenizer_path.endswith(".json"):
-        # print("Using Hugging Face tokenizer")
-        # tokenizer = HFTokenizer()
-        # tokenizer.load(tokenizer_path)
-
         from tokenizers import Tokenizer

         # Load the tokenizer from the tokenizer.json file
         tokenizer = Tokenizer.from_file(tokenizer_path)
-
-        # from tokenizers import SentencePieceBPETokenizer

-        # tokenizer = SentencePieceBPETokenizer(tokenizer_path)
+        # export_llama expects n_words attribute.
         tokenizer.n_words = tokenizer.get_vocab_size()
-        breakpoint()
+        # Keep in line with internal tokenizer apis.
+        tokenizer.decode_token = lambda token: tokenizer.decode([token])
+
+        if tokenizer_config_path:
+            with open(tokenizer_config_path) as f:
+                tokenizer_config = json.load(f)
+            tokenizer.eos_id = tokenizer.token_to_id(tokenizer_config["eos_token"])
     else:
         try:
             tokenizer = SentencePieceTokenizer(model_path=str(tokenizer_path))
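
For reference, a hedged sketch of what the new .json branch does with the Hugging Face tokenizers package. The file paths are placeholders for a downloaded checkpoint (for example a Qwen tokenizer.json plus its tokenizer_config.json), and it assumes eos_token in the config is a plain string, as the code above does:

    import json

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("tokenizer.json")

    # Attributes the llama runner expects on its tokenizer object.
    n_words = tokenizer.get_vocab_size()
    decode_token = lambda token: tokenizer.decode([token])  # noqa: E731

    with open("tokenizer_config.json") as f:
        config = json.load(f)
    eos_id = tokenizer.token_to_id(config["eos_token"])

    # encode() returns an Encoding; .ids holds the integer token ids.
    ids = tokenizer.encode("hello world").ids
    print(n_words, eos_id, ids, decode_token(ids[0]))

In the runner itself this path is reached simply as get_tokenizer("tokenizer.json", "tokenizer_config.json"), with the config argument enforced by validate_args in native.py.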
