Migrate to the latest version of llama.cpp APIs and support for DeepSeek models #1912

Closed
wants to merge 12 commits into from
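For reference, a minimal sketch (not part of the diff) of the renamed low-level calls this PR migrates to, assuming the bindings shown below are exposed by `llama_cpp` after the change; the model path is hypothetical:

```python
import llama_cpp

# Load and free a model with the new names (old names in comments).
params = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(       # was: llama_load_model_from_file
    b"/path/to/model.gguf", params                   # hypothetical path
)
vocab = llama_cpp.llama_model_get_vocab(model)       # vocab handle, now passed to token APIs
print(llama_cpp.llama_vocab_n_tokens(vocab))         # was: llama_n_vocab(model)
llama_cpp.llama_model_free(model)                    # was: llama_free_model(model)
```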
99 changes: 50 additions & 49 deletions llama_cpp/_internals.py
@@ -48,7 +48,7 @@ def __init__(
raise ValueError(f"Model path does not exist: {path_model}")

with suppress_stdout_stderr(disable=verbose):
model = llama_cpp.llama_load_model_from_file(
model = llama_cpp.llama_model_load_from_file(
self.path_model.encode("utf-8"), self.params
)

@@ -60,7 +60,7 @@ def __init__(
def free_model():
if self.model is None:
return
llama_cpp.llama_free_model(self.model)
llama_cpp.llama_model_free(self.model)
self.model = None

self._exit_stack.callback(free_model)
@@ -71,20 +71,20 @@ def close(self):
def __del__(self):
self.close()

def vocab_type(self) -> int:
return llama_cpp.llama_vocab_type(self.model)
def vocab_type(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_type(vocab)

def n_vocab(self) -> int:
return llama_cpp.llama_n_vocab(self.model)
def n_vocab(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_n_tokens(vocab)

def n_ctx_train(self) -> int:
return llama_cpp.llama_n_ctx_train(self.model)
return llama_cpp.llama_model_n_ctx_train(self.model)

def n_embd(self) -> int:
return llama_cpp.llama_n_embd(self.model)
return llama_cpp.llama_model_n_embd(self.model)

def rope_freq_scale_train(self) -> float:
return llama_cpp.llama_rope_freq_scale_train(self.model)
return llama_cpp.llama_model_rope_freq_scale_train(self.model)

def desc(self) -> str:
buf = ctypes.create_string_buffer(1024)
@@ -97,95 +97,95 @@ def size(self) -> int:
def n_params(self) -> int:
return llama_cpp.llama_model_n_params(self.model)

def get_tensor(self, name: str) -> ctypes.c_void_p:
return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8"))

# Vocab

def token_get_text(self, token: int) -> str:
return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8")
def token_get_text(self, vocab: llama_cpp.llama_vocab_p, token: int) -> str:
return llama_cpp.llama_vocab_get_text(vocab, token).decode("utf-8")

def token_get_score(self, token: int) -> float:
return llama_cpp.llama_token_get_score(self.model, token)
def token_get_score(self, vocab: llama_cpp.llama_vocab_p, token: int) -> float:
return llama_cpp.llama_vocab_get_score(vocab, token)

def token_get_attr(self, token: int) -> int:
return llama_cpp.llama_token_get_attr(self.model, token)
def token_get_attr(self, vocab: llama_cpp.llama_vocab_p, token: int) -> int:
return llama_cpp.llama_vocab_get_attr(vocab, token)

# Special tokens

def token_bos(self) -> int:
return llama_cpp.llama_token_bos(self.model)
def token_bos(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_bos(vocab)

def token_eos(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_eos(vocab)

def token_eos(self) -> int:
return llama_cpp.llama_token_eos(self.model)
def token_eot(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_eot(vocab)

def token_cls(self) -> int:
return llama_cpp.llama_token_cls(self.model)
def token_cls(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_cls(vocab)

def token_sep(self) -> int:
return llama_cpp.llama_token_sep(self.model)
def token_sep(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_sep(vocab)

def token_nl(self) -> int:
return llama_cpp.llama_token_nl(self.model)
def token_nl(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_nl(vocab)

def token_prefix(self) -> int:
return llama_cpp.llama_token_prefix(self.model)
def token_pad(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_pad(vocab)

def token_middle(self) -> int:
return llama_cpp.llama_token_middle(self.model)
def token_prefix(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_pre(vocab)

def token_suffix(self) -> int:
return llama_cpp.llama_token_suffix(self.model)
def token_middle(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_mid(vocab)

def token_eot(self) -> int:
return llama_cpp.llama_token_eot(self.model)
def token_suffix(self, vocab: llama_cpp.llama_vocab_p) -> int:
return llama_cpp.llama_vocab_fim_suf(vocab)

def add_bos_token(self) -> bool:
return llama_cpp.llama_add_bos_token(self.model)
def add_bos_token(self, vocab: llama_cpp.llama_vocab_p) -> bool:
return llama_cpp.llama_vocab_get_add_bos(vocab)

def add_eos_token(self) -> bool:
return llama_cpp.llama_add_eos_token(self.model)
def add_eos_token(self, vocab: llama_cpp.llama_vocab_p) -> bool:
return llama_cpp.llama_vocab_get_add_eos(vocab)

# Tokenization

def tokenize(self, text: bytes, add_bos: bool, special: bool):
def tokenize(self, vocab: llama_cpp.llama_vocab_p, text: bytes, add_bos: bool, special: bool):
n_ctx = self.n_ctx_train()
tokens = (llama_cpp.llama_token * n_ctx)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_ctx, add_bos, special
vocab, text, len(text), tokens, n_ctx, add_bos, special
)
if n_tokens < 0:
n_tokens = abs(n_tokens)
tokens = (llama_cpp.llama_token * n_tokens)()
n_tokens = llama_cpp.llama_tokenize(
self.model, text, len(text), tokens, n_tokens, add_bos, special
vocab, text, len(text), tokens, n_tokens, add_bos, special
)
if n_tokens < 0:
raise RuntimeError(
f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
)
return list(tokens[:n_tokens])

def token_to_piece(self, token: int, special: bool = False) -> bytes:
def token_to_piece(self, vocab: llama_cpp.llama_vocab_p, token: int, special: bool = False) -> bytes:
buf = ctypes.create_string_buffer(32)
llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special)
llama_cpp.llama_token_to_piece(vocab, token, buf, 32, 0, special)
return bytes(buf)

def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
def detokenize(self, vocab: llama_cpp.llama_vocab_p, tokens: List[int], special: bool = False) -> bytes:
output = b""
size = 32
buffer = (ctypes.c_char * size)()
for token in tokens:
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token), buffer, size, 0, special
vocab, llama_cpp.llama_token(token), buffer, size, 0, special
)
assert n <= size
output += bytes(buffer[:n])
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:]
if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" "
if len(tokens) > 0 and tokens[0] == self.token_bos(vocab) and output[0:1] == b" "
else output
)
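
A sketch of how these internal helpers are called once the vocab handle is threaded through explicitly; names follow the hunk above, and the import path, constructor arguments, and model path are assumptions:

```python
import llama_cpp
from llama_cpp._internals import LlamaModel  # internal class used by llama.py

model = LlamaModel(
    path_model="/path/to/model.gguf",                  # hypothetical path
    params=llama_cpp.llama_model_default_params(),
)
vocab = llama_cpp.llama_model_get_vocab(model.model)   # obtain the vocab handle once

tokens = model.tokenize(vocab, b"Hello, world!", add_bos=True, special=False)
piece = model.token_to_piece(vocab, tokens[0])         # single-token piece, explicit vocab
text = model.detokenize(vocab, tokens, special=False)  # bytes round-trip of the prompt
```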

@@ -605,10 +605,11 @@ def prev_str(self, ctx_main: LlamaContext, n: int) -> str:
def sample(
self,
ctx_main: LlamaContext,
vocab: llama_cpp.llama_vocab_p,
idx: int = 0,
logits_array: Optional[npt.NDArray[np.single]] = None,
):
n_vocab = ctx_main.model.n_vocab()
n_vocab = ctx_main.model.n_vocab(vocab)
id: int = 0

if logits_array is None:
30 changes: 16 additions & 14 deletions llama_cpp/llama.py
@@ -374,6 +374,8 @@ def __init__(
)
)

self._vocab = llama_cpp.llama_model_get_vocab(self._model.model)

# Override tokenizer
self.tokenizer_ = tokenizer or LlamaTokenizer(self)

@@ -476,10 +478,10 @@ def free_lora_adapter():
bos_token_id = self.token_bos()

eos_token = (
self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
self._model.token_get_text(self._vocab, eos_token_id) if eos_token_id != -1 else ""
)
bos_token = (
self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
self._model.token_get_text(self._vocab, bos_token_id) if bos_token_id != -1 else ""
)

# Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
@@ -584,7 +586,7 @@ def tokenize(
Returns:
A list of tokens.
"""
return self.tokenizer_.tokenize(text, add_bos, special)
return self.tokenizer_.tokenize(self._vocab, text, add_bos, special)

def detokenize(
self,
@@ -603,7 +605,7 @@ def detokenize(
The detokenized string.
"""
return self.tokenizer_.detokenize(
tokens, prev_tokens=prev_tokens, special=special
self._vocab, tokens, prev_tokens=prev_tokens, special=special
)

def set_cache(self, cache: Optional[BaseLlamaCache]):
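
For callers of the high-level `Llama` class nothing changes here: `tokenize()` and `detokenize()` keep their public signatures and simply forward the internally held `self._vocab` to the tokenizer, as the two hunks above show. A usage sketch with a hypothetical model path:

```python
from llama_cpp import Llama

llm = Llama(model_path="/path/to/model.gguf")  # hypothetical path
ids = llm.tokenize(b"The quick brown fox")     # vocab handle supplied internally
print(llm.detokenize(ids))                     # b'The quick brown fox' (modulo BOS space handling)
```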
@@ -1150,11 +1152,11 @@ def _create_completion(
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())
bos_token_id: int = self.token_bos()
cls_token_id: int = self._model.token_cls()
sep_token_id: int = self._model.token_sep()
prefix_token_id: int = self._model.token_prefix()
middle_token_id: int = self._model.token_middle()
suffix_token_id: int = self._model.token_suffix()
cls_token_id: int = self._model.token_cls(self._vocab)
sep_token_id: int = self._model.token_sep(self._vocab)
prefix_token_id: int = self._model.token_prefix(self._vocab)
middle_token_id: int = self._model.token_middle(self._vocab)
suffix_token_id: int = self._model.token_suffix(self._vocab)
add_space_prefix: bool = (
self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true"
)
@@ -1332,7 +1334,7 @@ def logit_bias_processor(
logits_processor=logits_processor,
grammar=grammar,
):
if llama_cpp.llama_token_is_eog(self._model.model, token):
if llama_cpp.llama_vocab_is_eog(self._vocab, token):
text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
finish_reason = "stop"
break
@@ -2171,23 +2173,23 @@ def n_embd(self) -> int:

def n_vocab(self) -> int:
"""Return the vocabulary size."""
return self._model.n_vocab()
return self._model.n_vocab(self._vocab)

def tokenizer(self) -> LlamaTokenizer:
"""Return the llama tokenizer for this model."""
return LlamaTokenizer(self)

def token_eos(self) -> int:
"""Return the end-of-sequence token."""
return self._model.token_eos()
return self._model.token_eos(self._vocab)

def token_bos(self) -> int:
"""Return the beginning-of-sequence token."""
return self._model.token_bos()
return self._model.token_bos(self._vocab)

def token_nl(self) -> int:
"""Return the newline token."""
return self._model.token_nl()
return self._model.token_nl(self._vocab)

def pooling_type(self) -> str:
"""Return the pooling type."""
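
Taken together, the public special-token and vocabulary accessors keep their zero-argument form while resolving the vocab handle internally, so existing caller code keeps working. A sketch with a hypothetical DeepSeek GGUF file:

```python
from llama_cpp import Llama

llm = Llama(model_path="/path/to/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf")  # hypothetical file
print(llm.n_vocab())    # vocabulary size, now via llama_vocab_n_tokens
print(llm.token_bos())  # BOS token id, now via llama_vocab_bos
print(llm.token_eos())  # EOS token id, now via llama_vocab_eos
print(llm.token_nl())   # newline token id, now via llama_vocab_nl
```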