Migrate to the latest version of llama.cpp APIs and support for DeepSeek models #1912

Closed
wants to merge 12 commits into from
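For reference, a minimal sketch (not part of the diff) of the renamed low-level calls this PR migrates to, assuming the bindings shown below are exposed by `llama_cpp` after the change; the model path is hypothetical:

```python
import llama_cpp

# Load and free a model with the new names (old names in comments).
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(       # was: llama_load_model_from_file
    b"/path/to/model.gguf", params                   # hypothetical path
)
vocab = llama_cpp.llama_model_get_vocab(model)       # vocab handle, now passed to token APIs
print(llama_cpp.llama_vocab_n_tokens(vocab))         # was: llama_n_vocab(model)
llama_cpp.llama_model_free(model)                    # was: llama_free_model(model)
```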
99 changes: 50 additions & 49 deletions llama_cpp/_internals.py
@@ -48,7 +48,7 @@ def __init__(
raise ValueError(f"Model path does not exist: {path_model}")

with suppress_stdout_stderr(disable=verbose):
model = llama_cpp.llama_load_model_from_file(
model = llama_cpp.llama_model_load_from_file(
self.path_model.encode("utf-8"), self.params
)

@@ -60,7 +60,7 @@ def __init__(
def free_model():
if self.model is None:
return
llama_cpp.llama_free_model(self.model)
llama_cpp.llama_model_free(self.model)
self.model = None

self._exit_stack.callback(free_model)
@@ -71,20 +71,20 @@ def close(self):
def __del__(self):
self.close()

def vocab_type(self) -> int:
return llama_cpp.llama_vocab_type(self.model)
def vocab_type(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_type(vocab)

def n_vocab(self) -> int:
return llama_cpp.llama_n_vocab(self.model)
def n_vocab(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_n_tokens(vocab)

def n_ctx_train(self) -> int:
return llama_cpp.llama_n_ctx_train(self.model)
return llama_cpp.llama_model_n_ctx_train(self.model)

def n_embd(self) -> int:
return llama_cpp.llama_n_embd(self.model)
return llama_cpp.llama_model_n_embd(self.model)

def rope_freq_scale_train(self) -> float:
return llama_cpp.llama_rope_freq_scale_train(self.model)
return llama_cpp.llama_model_rope_freq_scale_train(self.model)

def desc(self) -> str:
buf = ctypes.create_string_buffer(1024)
@@ -97,95 +97,95 @@ def size(self) -> int:
def n_params(self) -> int:
return llama_cpp.llama_model_n_params(self.model)

def get_tensor(self, name: str) -> ctypes.c_void_p:
return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8"))

# Vocab

def token_get_text(self, token: int) -> str:
return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8")
def token_get_text(self, vocab: llama_cpp.llama_vocab_p, token: int) -> str:
return llama_cpp.llama_vocab_get_text(vocab, token).decode("utf-8")

def token_get_score(self, token: int) -> float:
return llama_cpp.llama_token_get_score(self.model, token)
def token_get_score(self, vocab: llama_cpp.llama_vocab_p, token: int) -> float:
return llama_cpp.llama_vocab_get_score(vocab, token)

def token_get_attr(self, token: int) -> int:
return llama_cpp.llama_token_get_attr(self.model, token)
def token_get_attr(self, vocab: llama_cpp.llama_vocab_p, token: int) -> int:
return llama_cpp.llama_vocab_get_attr(vocab, token)

# Special tokens

def token_bos(self) -> int:
return llama_cpp.llama_token_bos(self.model)
def token_bos(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_bos(vocab)

def token_eos(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_eos(vocab)

def token_eos(self) -> int:
return llama_cpp.llama_token_eos(self.model)
def token_eot(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_eot(vocab)

def token_cls(self) -> int:
return llama_cpp.llama_token_cls(self.model)
def token_cls(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_cls(vocab)

def token_sep(self) -> int:
return llama_cpp.llama_token_sep(self.model)
def token_sep(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_sep(vocab)

def token_nl(self) -> int:
return llama_cpp.llama_token_nl(self.model)
def token_nl(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_nl(vocab)

def token_prefix(self) -> int:
return llama_cpp.llama_token_prefix(self.model)
def token_pad(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_pad(vocab)

def token_middle(self) -> int:
return llama_cpp.llama_token_middle(self.model)
def token_prefix(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_pre(vocab)

def token_suffix(self) -> int:
return llama_cpp.llama_token_suffix(self.model)
def token_middle(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_mid(vocab)

def token_eot(self) -> int:
return llama_cpp.llama_token_eot(self.model)
def token_suffix(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_suf(vocab)

def add_bos_token(self) -> bool:
return llama_cpp.llama_add_bos_token(self.model)
def add_bos_token(self, vocab: llama_cpp.llama_vocab_p) -> bool:
return llama_cpp.llama_vocab_get_add_bos(vocab)

def add_eos_token(self) -> bool:
return llama_cpp.llama_add_eos_token(self.model)
def add_eos_token(self, vocab: llama_cpp.llama_vocab_p) -> bool:
return llama_cpp.llama_vocab_get_add_eos(vocab)

# Tokenization

def tokenize(self, text: bytes, add_bos: bool, special: bool):
def tokenize(self, vocab: llama_cpp.llama_vocab_p, text: bytes, add_bos: bool, special: bool):
n_ctx = self.n_ctx_train()
tokens = (llama_cpp.llama_token * n_ctx)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_ctx, add_bos, special
vocab, text, len(text), tokens, n_ctx, add_bos, special
)
if n_tokens < 0:
n_tokens = abs(n_tokens)
tokens = (llama_cpp.llama_token * n_tokens)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_tokens, add_bos, special
vocab, text, len(text), tokens, n_tokens, add_bos, special
)
if n_tokens < 0:
raise RuntimeError(
f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
)
return list(tokens[:n_tokens])

def token_to_piece(self, token: int, special: bool = False) -> bytes:
def token_to_piece(self, vocab: llama_cpp.llama_vocab_p, token: int, special: bool = False) -> bytes:
buf = ctypes.create_string_buffer(32)
llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special)
llama_cpp.llama_token_to_piece(vocab, token, buf, 32, 0, special)
return bytes(buf)

def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
def detokenize(self, vocab: llama_cpp.llama_vocab_p, tokens: List[int], special: bool = False) -> bytes:
output = b""
size = 32
buffer = (ctypes.c_char * size)()
for token in tokens:
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token), buffer, size, 0, special
vocab, llama_cpp.llama_token(token), buffer, size, 0, special
)
assert n <= size
output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:]
if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" "
if len(tokens) > 0 and tokens[0] == self.token_bos(vocab) and output[0:1] == b" "
else output
)
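
A sketch of how these internal helpers are called once the vocab handle is threaded through explicitly; names follow the hunk above, and the import path, constructor arguments, and model path are assumptions:

```python
import llama_cpp
from llama_cpp._internals import LlamaModel  # internal class used by llama.py

model = LlamaModel(
    path_model="/path/to/model.gguf",                  # hypothetical path
    params=llama_cpp.llama_model_default_params(),
)
vocab = llama_cpp.llama_model_get_vocab(model.model)   # obtain the vocab handle once

tokens = model.tokenize(vocab, b"Hello, world!", add_bos=True, special=False)
piece = model.token_to_piece(vocab, tokens[0])         # single-token piece, explicit vocab
text = model.detokenize(vocab, tokens, special=False)  # bytes round-trip of the prompt
```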

@@ -605,10 +605,11 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str:
def sample(
self,
ctx_main: LlamaContext,
vocab: llama_cpp.llama_vocab_p,
idx: int = 0,
logits_array: Optional[npt.NDArray[np.single]] = None,
):
n_vocab = ctx_main.model.n_vocab()
n_vocab = ctx_main.model.n_vocab(vocab)
id: int = 0

if logits_array is None:
30 changes: 16 additions & 14 deletions llama_cpp/llama.py
@@ -374,6 +374,8 @@ def __init__(
)
)

self._vocab = llama_cpp.llama_model_get_vocab(self._model.model)

# Override tokenizer
self.tokenizer_ = tokenizer or LlamaTokenizer(self)

@@ -476,10 +478,10 @@ def free_lora_adapter():
bos_token_id = self.token_bos()

eos_token = (
self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
self._model.token_get_text(self._vocab, eos_token_id) if eos_token_id != -1 else ""
)
bos_token = (
self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
self._model.token_get_text(self._vocab, bos_token_id) if bos_token_id != -1 else ""
)

# Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
@@ -584,7 +586,7 @@ def tokenize(
Returns:
A list of tokens.
"""
return self.tokenizer_.tokenize(text, add_bos, special)
return self.tokenizer_.tokenize(self._vocab, text, add_bos, special)

def detokenize(
self,
@@ -603,7 +605,7 @@ def detokenize(
The detokenized string.
"""
return self.tokenizer_.detokenize(
tokens, prev_tokens=prev_tokens, special=special
self._vocab, tokens, prev_tokens=prev_tokens, special=special
)

def set_cache(self, cache: Optional[BaseLlamaCache]):
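
For callers of the high-level `Llama` class nothing changes here: `tokenize()` and `detokenize()` keep their public signatures and simply forward the internally held `self._vocab` to the tokenizer, as the two hunks above show. A usage sketch with a hypothetical model path:

```python
from llama_cpp import Llama

llm = Llama(model_path="/path/to/model.gguf")  # hypothetical path
ids = llm.tokenize(b"The quick brown fox")     # vocab handle supplied internally
print(llm.detokenize(ids))                     # b'The quick brown fox' (modulo BOS space handling)
```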
@@ -1150,11 +1152,11 @@ def _create_completion(
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())
bos_token_id: int = self.token_bos()
cls_token_id: int = self._model.token_cls()
sep_token_id: int = self._model.token_sep()
prefix_token_id: int = self._model.token_prefix()
middle_token_id: int = self._model.token_middle()
suffix_token_id: int = self._model.token_suffix()
cls_token_id: int = self._model.token_cls(self._vocab)
sep_token_id: int = self._model.token_sep(self._vocab)
prefix_token_id: int = self._model.token_prefix(self._vocab)
middle_token_id: int = self._model.token_middle(self._vocab)
suffix_token_id: int = self._model.token_suffix(self._vocab)
add_space_prefix: bool = (
self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
)
@@ -1332,7 +1334,7 @@ def logit_bias_processor(
logits_processor=logits_processor,
grammar=grammar,
):
if llama_cpp.llama_token_is_eog(self._model.model, token):
if llama_cpp.llama_vocab_is_eog(self._vocab, token):
text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
finish_reason = "stop"
break
@@ -2171,23 +2173,23 @@ def n_embd(self) -> int:

def n_vocab(self) -> int:
"""Return the vocabulary size."""
return self._model.n_vocab()
return self._model.n_vocab(self._vocab)

def tokenizer(self) -> LlamaTokenizer:
"""Return the llama tokenizer for this model."""
return LlamaTokenizer(self)

def token_eos(self) -> int:
"""Return the end-of-sequence token."""
return self._model.token_eos()
return self._model.token_eos(self._vocab)

def token_bos(self) -> int:
"""Return the beginning-of-sequence token."""
return self._model.token_bos()
return self._model.token_bos(self._vocab)

def token_nl(self) -> int:
"""Return the newline token."""
return self._model.token_nl()
return self._model.token_nl(self._vocab)

def pooling_type(self) -> str:
"""Return the pooling type."""
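
Taken together, the public special-token and vocabulary accessors keep their zero-argument form while resolving the vocab handle internally, so existing caller code keeps working. A sketch with a hypothetical DeepSeek GGUF file:

```python
from llama_cpp import Llama

llm = Llama(model_path="/path/to/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf")  # hypothetical file
print(llm.n_vocab())    # vocabulary size, now via llama_vocab_n_tokens
print(llm.token_bos())  # BOS token id, now via llama_vocab_bos
print(llm.token_eos())  # EOS token id, now via llama_vocab_eos
print(llm.token_nl())   # newline token id, now via llama_vocab_nl
```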