
Commit 120cf37

models : add phi-3, mpt, gpt-2, starcoder
1 parent c21ab18 commit 120cf37

20 files changed: 645 additions, 10 deletions

Makefile

Lines changed: 3 additions & 0 deletions
@@ -64,10 +64,13 @@ test: $(TEST_TARGETS)
 		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

convert-hf-to-gguf-update.py

Lines changed: 10 additions & 6 deletions
@@ -46,12 +46,16 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: add models here, base models preferred
 models = [
-    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
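Note: the chkhsh values checked in convert-hf-to-gguf.py (next file) are produced by this update script, which downloads each listed tokenizer, encodes a fixed test string, and hashes the resulting token IDs. Below is a minimal sketch of that fingerprinting step, assuming the transformers package is installed; chktxt here is only a shortened stand-in for the much longer test string the real script uses.

# Minimal sketch (not the actual script) of computing a pre-tokenizer fingerprint.
# Assumes `pip install transformers` and that the tokenizer can be downloaded.
from hashlib import sha256
from transformers import AutoTokenizer

chktxt = "Hello world \n\n 3 33 333 нещо на Български 🦙.cpp"  # stand-in test string

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
chktok = tokenizer.encode(chktxt)                  # token IDs for the test string
chkhsh = sha256(str(chktok).encode()).hexdigest()  # fingerprint of the pre-tokenizer behavior

print(f"gpt-2: chkhsh = {chkhsh}")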

convert-hf-to-gguf.py

Lines changed: 9 additions & 0 deletions
@@ -296,6 +296,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
             # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
             res = "bert-bge"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/mosaicml/mpt-7b
+            res = "mpt"
+        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
+            # ref: https://huggingface.co/bigcode/starcoder2-3b
+            res = "starcoder"
+        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+            # ref: https://huggingface.co/openai-community/gpt2
+            res = "gpt-2"
 
         if res is None:
             print("\n")

llama.cpp

Lines changed: 26 additions & 0 deletions
@@ -4352,6 +4352,15 @@ static void llm_load_vocab(
             } else if (
                     tokenizer_pre == "falcon") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -12124,6 +12133,23 @@ struct llm_tokenizer_bpe {
                     "[0-9][0-9][0-9]",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+                // TODO: MPT pre-tokenization regexes are unknown
+                //       the following are close, but not exact. run the following:
+                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                word_collection = unicode_regex_split(text, {
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                });
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                word_collection = unicode_regex_split(text, {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                });
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 word_collection = unicode_regex_split(text, {
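The STARCODER and GPT2 cases above reuse the classic GPT-2 BPE pre-tokenizer pattern. As a rough illustration (not llama.cpp code), the split this pattern produces before any BPE merges can be previewed in Python with the third-party regex package, which supports \p{L}-style classes that the stdlib re module lacks.

# Illustration only: preview the GPT-2 / StarCoder pre-tokenizer split in Python.
# Requires the third-party `regex` package (pip install regex).
import regex

GPT2_PRE = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)"

text = "Hello, world! this is 🦙.cpp 3333"
print(regex.findall(GPT2_PRE, text))
# ['Hello', ',', ' world', '!', ' this', ' is', ' 🦙.', 'cpp', ' 3333']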

llama.h

Lines changed: 3 additions & 0 deletions
@@ -76,6 +76,9 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
         LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
     };
 
     // note: these values should be synchronized with ggml_rope

models/ggml-vocab-gpt-2.gguf

1.68 MB
Binary file not shown.

models/ggml-vocab-gpt-2.gguf.inp

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__


__ggml_vocab_test__



__ggml_vocab_test__




__ggml_vocab_test__


__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__

=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__











🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__
models/ggml-vocab-gpt-2.gguf.out

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@

220
220 220
220 220 220
197
198
628
628 198
197 198
15496 995
18435 995
15496 2159
18435 2159
18435 2159 0
15496 11 995 0
18435 11 995 0
428 318 12520 99 247 13 20322
86 47202 767 28047 45961 288 82 7568 13415
22177 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849
157 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 157 252 223 157 252 249 157 252 227 157 253 223 157 252 231
8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 357 8807 44805 326 468 663 898 11241 8
15496
18435
220 18435
220 220 18435
220 220 220 18435
220 220 220 18435 198 220 220 220 18435
357
198 796
6 6980
15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252
18
2091
20370
24840
2091 20370
24840 2091
24840 20370
24840 24840
24840 2091 20370
198 220 628 220 628 198 220 197 220 197 197 220 197 198 220 220 198 220 220 220 198 220 220 220 220 198 220 220 220 220 220 198 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 12520 99 247 8582 99 247 513 4747 23460 513 20370 23460 2091 23460 20370 23460 24840 23460 2091 20370 513 13 18 513 492 18 513 986 18 28053 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 47249 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 40103 1421 18604 12466 121 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 705 39115 6 33153 15506 63 15931 15931 16317 13896 3228 9805 3548 314 1053 587 705 44040 339 338 612 11 705 2200 345 1654 30 705 44 407 1654 314 1183 787 340 11 705 35 345 588 617 8887 30 775 6 26979 257 6 75 43
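Each .gguf.inp file holds prompts separated by __ggml_vocab_test__ markers, and the matching .gguf.out file holds one line of expected token IDs per prompt; tests/test-tokenizer-0 compares the two. Below is a rough Python sketch of that pairing, not the actual C++ test: tokenize is a placeholder for whatever tokenizer is being checked, and whitespace handling around the markers is simplified.

# Rough sketch (assumptions noted above) of validating an .inp/.out pair.
from typing import Callable, List

def check_vocab_files(inp_path: str, out_path: str, tokenize: Callable[[str], List[int]]) -> bool:
    # prompts are separated by the marker line; the empty chunk after the last marker is dropped
    with open(inp_path, encoding="utf-8") as f:
        prompts = f.read().split("__ggml_vocab_test__\n")[:-1]
    # one line of space-separated expected token IDs per prompt
    with open(out_path, encoding="utf-8") as f:
        expected = [[int(t) for t in line.split()] for line in f.read().splitlines()]

    ok = True
    for prompt, want in zip(prompts, expected):
        got = tokenize(prompt)
        if got != want:
            print(f"mismatch for {prompt!r}: got {got}, expected {want}")
            ok = False
    return ok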

models/ggml-vocab-mpt.gguf

-13 Bytes
Binary file not shown.

models/ggml-vocab-mpt.gguf.inp

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__

__ggml_vocab_test__


__ggml_vocab_test__



__ggml_vocab_test__




__ggml_vocab_test__


__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__

=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__











🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

models/ggml-vocab-mpt.gguf.out

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@

209
50276
50275
186
187
535
2756
186 187
12092 1533
24387 1533
12092 3645
24387 3645
24387 3645 2
12092 13 1533 2
24387 13 1533 2
436 310 22692 101 236 15 14161
88 27244 818 16853 16392 20505 4989 11917
32520 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389
18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 18081 212 18081 238 18081 216 39936 212 18081 220
14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 313 7483 802 80 8020 326 556 697 1211 10669 10
12092
24387
50276 12092
50275 12092
50274 12092
50274 12092 187 50274 12092
313
187 426
8 8685
12092 13 340 8 455 2 1359 403 368 49042 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241
20
1610
20084
26409
1610 20084
26409 1610
26409 20084
26409 26409
26409 1610 20084
586 1744 33525 186 209 623 28910 187 50276 187 50275 187 50274 187 50273 187 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 22692 101 236 14931 101 236 495 5922 30057 495 20084 495 26409 30057 20084 495 26409 1610 495 26409 20084 495 15 20 495 537 20 495 1051 20 209 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 14931 235 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241 16081 6877 12880 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389 42011 35033 34842 11202 9739 9739 33021 18963 4672 25561 8220 309 1849 644 686 42618 344 434 627 13 686 1848 368 2119 32 686 46 417 2119 309 1833 1056 352 13 686 37 368 751 690 10331 32 844 8 31516 247 8 77 45

models/ggml-vocab-phi-3.gguf

709 KB
Binary file not shown.
