Skip to content

Commit 3a461db

Browse files
committed
tests : add test that fails with DeepSeek tokenizers
1 parent cf00fe1 commit 3a461db

22 files changed

+67
-5
lines changed

convert-hf-to-gguf-update.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,8 @@ def download_file_with_auth(url, token, save_path):
189189
# generate tests for each tokenizer model
190190

191191
tests = [
192-
"ied 4 ½ months"
192+
"ied 4 ½ months",
193+
"Führer",
193194
"",
194195
" ",
195196
" ",

llama.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11952,7 +11952,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
1195211952
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
1195311953
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
1195411954
GGML_ASSERT(llama_is_byte_token(vocab, id));
11955-
const auto& token_data = vocab.id_to_token.at(id);
11955+
const auto & token_data = vocab.id_to_token.at(id);
1195611956
switch (llama_vocab_get_type(vocab)) {
1195711957
case LLAMA_VOCAB_TYPE_SPM: {
1195811958
auto buf = token_data.text.substr(3, 2);
@@ -17471,9 +17471,10 @@ int32_t llama_tokenize(
1747117471

1747217472
static std::string llama_decode_text(const std::string & text) {
1747317473
std::string decoded_text;
17474-
auto unicode_sequences = unicode_cpts_from_utf8(text);
17475-
for (auto & unicode_sequence : unicode_sequences) {
17476-
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
17474+
17475+
const auto cpts = unicode_cpts_from_utf8(text);
17476+
for (const auto cpt : cpts) {
17477+
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
1747717478
}
1747817479

1747917480
return decoded_text;

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
29464 2094 1018 1092 2706
2+
11865 17875
3+
24

35

46

models/ggml-vocab-deepseek-coder.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-deepseek-coder.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
1050 207 19 207 19192 4217
2+
37 32009 71 6247
3+
24
207
35
243
46
315

models/ggml-vocab-deepseek-llm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-deepseek-llm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
1052 207 19 207 19109 4223
2+
37 100014 71 6245
3+
24
207
35
243
46
300

models/ggml-vocab-falcon.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-falcon.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
878 204 31 3068 133 2137
2+
28611 132 30042
3+
24
204
35
258
46
466

models/ggml-vocab-gpt-2.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-gpt-2.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
798 604 25208 1933
2+
37 9116 71 11751
3+
24
220
35
220 220
46
220 220 220

models/ggml-vocab-llama-bpe.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-llama-bpe.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
1142 220 19 220 27154 4038
2+
37 51853 261
3+
24
220
35
256
46
262

models/ggml-vocab-llama-spm.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-llama-spm.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
474 287 29871 29946 29871 30226 7378
2+
383 4000 261
3+
24
259
35
1678
46
268

models/ggml-vocab-mpt.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-mpt.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
728 577 24142 2607
2+
39 26288 6554
3+
24
209
35
50276
46
50275

models/ggml-vocab-phi-3.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-phi-3.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
474 287 29871 29946 29871 30226 7378
2+
383 4000 261
3+
24
259
35
1678
46
268

models/ggml-vocab-starcoder.gguf.inp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
26
__ggml_vocab_test__
37

48
__ggml_vocab_test__

models/ggml-vocab-starcoder.gguf.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
4850 244 57 244 162 159 17722
2+
75 2022 3943 284
3+
24
244
35
280
46
283

0 commit comments

Comments
 (0)