# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from typing import List, Optional

from tokenizers import Tokenizer


class HuggingFaceTokenizer:
    """
    Tokenizes and encodes/decodes text using a Hugging Face ``tokenizers`` tokenizer.
    """

    def __init__(self, model_path: str, config_path: Optional[str] = None):
        """
        Initializes the tokenizer from a Hugging Face tokenizer.json file.

        Args:
            model_path (str): Path to the Hugging Face tokenizer.json file.
            config_path (Optional[str]): Optional path to a tokenizer_config.json
                file used to look up the BOS/EOS token strings.
        """
        assert os.path.isfile(model_path), model_path

        self.model = tokenizer = Tokenizer.from_file(model_path)

        self.n_words: int = tokenizer.get_vocab_size()
        if config_path:
            # Resolve BOS/EOS ids from the token strings in tokenizer_config.json.
            with open(config_path) as f:
                tokenizer_config = json.load(f)
            self.bos_id = (
                self.model.token_to_id(tokenizer_config["bos_token"])
                if tokenizer_config["bos_token"]
                else None
            )
            self.eos_id = self.model.token_to_id(tokenizer_config["eos_token"])
        else:  # Fallback guess.
            self.bos_id = self.model.token_to_id("<|begin_of_text|>")
            self.eos_id = self.model.token_to_id("<|endoftext|>")

        self.stop_tokens = [
            self.eos_id,
        ]

    def encode(self, s: str, *, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        # Encode without the tokenizer's own special-token post-processing,
        # then honor the explicit bos/eos flags when the ids are known.
        tokens = self.model.encode(s, add_special_tokens=False).ids
        if bos and self.bos_id is not None:
            tokens = [self.bos_id] + tokens
        if eos and self.eos_id is not None:
            tokens = tokens + [self.eos_id]
        return tokens

    def decode(self, t: List[int]) -> str:
        return self.model.decode(t)

    def decode_token(self, t: int) -> str:
        return self.model.decode([t])

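
# The block below is a minimal usage sketch added for illustration, not part of
# the original module. It assumes a locally available Hugging Face
# tokenizer.json path passed on the command line; the sample text is arbitrary.
if __name__ == "__main__":
    import sys

    tok = HuggingFaceTokenizer(model_path=sys.argv[1])
    ids = tok.encode("Hello, world!", bos=True, eos=True)
    print(ids)
    print(tok.decode(ids))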