Skip to content

Commit ca01f98

Browse files
committed
Add LlamaTokenizer class
1 parent 1d247e0 commit ca01f98

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

llama_cpp/llama.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,11 @@ def n_vocab(self) -> int:
13801380
assert self.ctx is not None
13811381
return llama_cpp.llama_n_vocab(self.ctx)
13821382

1383+
def tokenizer(self) -> "LlamaTokenizer":
    """Expose this model's vocabulary as a standalone tokenizer object.

    Returns:
        A ``LlamaTokenizer`` wrapping this ``Llama`` instance.
    """
    # Follows the file-wide convention of guarding on a loaded context.
    assert self.ctx is not None
    wrapper = LlamaTokenizer(self)
    return wrapper
1387+
13831388
@staticmethod
13841389
def token_eos() -> int:
13851390
"""Return the end-of-sequence token."""
@@ -1410,3 +1415,18 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
14101415
else:
14111416
break
14121417
return longest_prefix
1418+
1419+
1420+
class LlamaTokenizer:
    """Thin text<->token adapter delegating to a ``Llama`` model.

    Encoding and decoding both use UTF-8 with undecodable/unencodable
    sequences silently dropped (``errors="ignore"``), so round-trips are
    best-effort rather than lossless.
    """

    def __init__(self, llama: Llama):
        # The wrapped model must provide ``tokenize`` and ``detokenize``.
        self.llama = llama

    def encode(self, text: str) -> List[int]:
        """Convert *text* to a list of token ids."""
        raw = text.encode("utf-8", errors="ignore")
        return self.llama.tokenize(raw)

    def decode(self, tokens: List[int]) -> str:
        """Convert token ids back into a string."""
        raw = self.llama.detokenize(tokens)
        return raw.decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Alternate constructor: load only the vocabulary from the ggml
        model file at *path* (no weights), then wrap it."""
        model = Llama(model_path=path, vocab_only=True)
        return cls(model)

0 commit comments

Comments
 (0)