Skip to content

Commit cf00fe1

Browse files
committed
starcoder : fix pre-tokenizer
1 parent 7053b26 commit cf00fe1

22 files changed

+26
-20
lines changed

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ def download_file_with_auth(url, token, save_path):
189189
# generate tests for each tokenizer model
190190

191191
tests = [
192+
"ied 4 ½ months",
192193
"",
193194
" ",
194195
" ",

llama.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12235,6 +12235,11 @@ struct llm_tokenizer_bpe {
1223512235
});
1223612236
break;
1223712237
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
12238+
word_collection = unicode_regex_split(text, {
12239+
"\\p{N}",
12240+
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
12241+
});
12242+
break;
1223812243
case LLAMA_VOCAB_PRE_TYPE_GPT2:
1223912244
word_collection = unicode_regex_split(text, {
1224012245
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

models/ggml-vocab-bert-bge.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-bert-bge.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
29464 2094 1018 1092 2706
22

33

44

models/ggml-vocab-deepseek-coder.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-deepseek-coder.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
1050 207 19 207 19192 4217
22
207
33
243
44
315

models/ggml-vocab-deepseek-llm.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-deepseek-llm.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
1052 207 19 207 19109 4223
22
207
33
243
44
300

models/ggml-vocab-falcon.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-falcon.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
878 204 31 3068 133 2137
22
204
33
258
44
466

models/ggml-vocab-gpt-2.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-gpt-2.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
798 604 25208 1933
22
220
33
220 220
44
220 220 220

models/ggml-vocab-llama-bpe.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-llama-bpe.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
1142 220 19 220 27154 4038
22
220
33
256
44
262

models/ggml-vocab-llama-spm.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-llama-spm.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
474 287 29871 29946 29871 30226 7378
22
259
33
1678
44
268

models/ggml-vocab-mpt.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-mpt.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
728 577 24142 2607
22
209
33
50276
44
50275

models/ggml-vocab-phi-3.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-phi-3.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
474 287 29871 29946 29871 30226 7378
22
259
33
1678
44
268

models/ggml-vocab-starcoder.gguf.inp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
ied 4 ½ months
22
__ggml_vocab_test__
33

44
__ggml_vocab_test__

models/ggml-vocab-starcoder.gguf.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
4850 244 57 244 162 159 17722
22
244
33
280
44
283

0 commit comments

Comments
 (0)