Skip to content

Commit 18a0c10

Browse files
author
Mug
committed
Remove excessive errors="ignore" and add utf8 test
1 parent b7d14ef commit 18a0c10

File tree

2 files changed

+39
-5
lines changed

2 files changed

+39
-5
lines changed

llama_cpp/llama.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ def create_embedding(self, input: str) -> Embedding:
358358
if self.verbose:
359359
llama_cpp.llama_reset_timings(self.ctx)
360360

361-
tokens = self.tokenize(input.encode("utf-8", errors="ignore"))
361+
tokens = self.tokenize(input.encode("utf-8"))
362362
self.reset()
363363
self.eval(tokens)
364364
n_tokens = len(tokens)
@@ -416,7 +416,7 @@ def _create_completion(
416416
completion_tokens: List[llama_cpp.llama_token] = []
417417
# Add blank space to start of prompt to match OG llama tokenizer
418418
prompt_tokens: List[llama_cpp.llama_token] = self.tokenize(
419-
b" " + prompt.encode("utf-8", errors="ignore")
419+
b" " + prompt.encode("utf-8")
420420
)
421421
text: bytes = b""
422422
returned_characters: int = 0
@@ -431,7 +431,7 @@ def _create_completion(
431431
)
432432

433433
if stop != []:
434-
stop_sequences = [s.encode("utf-8", errors="ignore") for s in stop]
434+
stop_sequences = [s.encode("utf-8") for s in stop]
435435
else:
436436
stop_sequences = []
437437

tests/test_llama.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def mock_eval(*args, **kwargs):
2424
monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
2525

2626
output_text = " jumps over the lazy dog."
27-
output_tokens = llama.tokenize(output_text.encode("utf-8", errors="ignore"))
27+
output_tokens = llama.tokenize(output_text.encode("utf-8"))
2828
token_eos = llama.token_eos()
2929
n = 0
3030

@@ -93,4 +93,38 @@ def test_llama_pickle():
9393

9494
text = b"Hello World"
9595

96-
assert llama.detokenize(llama.tokenize(text)) == text
96+
assert llama.detokenize(llama.tokenize(text)) == text
97+
98+
def test_utf8(monkeypatch):
99+
llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
100+
101+
## Set up mock function
102+
def mock_eval(*args, **kwargs):
103+
return 0
104+
105+
monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval)
106+
107+
output_text = "😀"
108+
output_tokens = llama.tokenize(output_text.encode("utf-8"))
109+
token_eos = llama.token_eos()
110+
n = 0
111+
112+
def mock_sample(*args, **kwargs):
113+
nonlocal n
114+
if n < len(output_tokens):
115+
n += 1
116+
return output_tokens[n - 1]
117+
else:
118+
return token_eos
119+
120+
monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_top_p_top_k", mock_sample)
121+
122+
## Test basic completion with utf8 multibyte
123+
n = 0 # reset
124+
completion = llama.create_completion("", max_tokens=4)
125+
assert completion["choices"][0]["text"] == output_text
126+
127+
## Test basic completion with incomplete utf8 multibyte
128+
n = 0 # reset
129+
completion = llama.create_completion("", max_tokens=1)
130+
assert completion["choices"][0]["text"] == ""

0 commit comments

Comments (0)