Skip to content

Commit 4da25c0

Browse files
authored
Merge pull request #1 from MagnusS0/noramistral-tokenizer
feat: add compatibility with noramistral
2 parents 6a2f298 + 921e2c3 commit 4da25c0

27 files changed

+1359
-10
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ test: $(TEST_TARGETS)
124124
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
125125
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
126126
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
127+
./$$test_target $(CURDIR)/models/ggml-vocab-normistral-7b-warm.gguf; \
127128
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
128129
continue; \
129130
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class TOKENIZER_TYPE(IntEnum):
8585
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
8686
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
8787
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
88+
{"name": "normistral-7b-warm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/norallm/normistral-7b-warm", },
8889
]
8990

9091

convert-hf-to-gguf.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -453,18 +453,12 @@ def get_vocab_base_pre(self, tokenizer) -> str:
453453
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
454454
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
455455
res = "refact"
456-
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
457-
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
458-
res = "command-r"
459456
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
460457
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
461458
res = "qwen2"
462459
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
463460
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
464461
res = "olmo"
465-
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
466-
# ref: https://huggingface.co/databricks/dbrx-base
467-
res = "dbrx"
468462
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
469463
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
470464
res = "jina-v2-en"
@@ -483,6 +477,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
483477
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
484478
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
485479
res = "jina-v2-code"
480+
if chkhsh == "a3ab9069a4c073804dfd16a852e6a0776cba5a46402ec3c7325851b57e0c4869":
481+
# ref: https://huggingface.co/norallm/normistral-7b-warm
482+
res = "normistral-7b-warm"
486483

487484
if res is None:
488485
logger.warning("\n")

kompute

Submodule kompute deleted from 4565194

llama.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4899,6 +4899,9 @@ static void llm_load_vocab(
48994899
} else if (
49004900
tokenizer_pre == "poro-chat") {
49014901
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
4902+
} else if (
4903+
tokenizer_pre == "normistral-7b-warm") {
4904+
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_NORMISTRAL;
49024905
} else {
49034906
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
49044907
}
@@ -13297,6 +13300,15 @@ struct llm_tokenizer_bpe {
1329713300
" ?[^(\\s|.,!?…。,、।۔،)]+",
1329813301
};
1329913302
break;
13303+
case LLAMA_VOCAB_PRE_TYPE_NORMISTRAL:
13304+
regex_exprs = {
13305+
"[^\\S ]{1}",
13306+
" {1}\\S+",
13307+
" {0,1}\\d{1}",
13308+
" {0,1}[^\\sA-Za-z0-9À-ÿĀ-ſḀ-ỿ]{1}",
13309+
" {2,8}",
13310+
};
13311+
break;
1330013312
default:
1330113313
// default regex for BPE tokenization pre-processing
1330213314
regex_exprs = {

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ extern "C" {
8787
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
8888
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
8989
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
90+
LLAMA_VOCAB_PRE_TYPE_NORMISTRAL = 16,
9091
};
9192

9293
// note: these values should be synchronized with ggml_rope
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
3
77+
__ggml_vocab_test__
78+
33
79+
__ggml_vocab_test__
80+
333
81+
__ggml_vocab_test__
82+
3333
83+
__ggml_vocab_test__
84+
33333
85+
__ggml_vocab_test__
86+
333333
87+
__ggml_vocab_test__
88+
3333333
89+
__ggml_vocab_test__
90+
33333333
91+
__ggml_vocab_test__
92+
333333333
93+
__ggml_vocab_test__
94+
95+
96+
97+
98+
99+
100+
101+
102+
103+
104+
105+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
106+
__ggml_vocab_test__
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
5187 879 59261 21535
2+
42 6395 3776 266
3+
4+
225
5+
261
6+
264
7+
202
8+
203
9+
420
10+
3712
11+
11208
12+
10564 7550
13+
28137 7550
14+
10564 10288
15+
28137 10288
16+
28137 10288 5
17+
10564 16 7550 5
18+
28137 16 7550 5
19+
472 453 9919 104 252 18 3029
20+
91 13392 1577 54321 19498 364 46363 8437
21+
36655 13633 1769 14501 54827 21893 3849 10107 13878 41078
22+
13065 227 50218 13065 246 25763 238 13065 242 25763 229 13065 249 13065 120 13065 258 25763 228 13065 258 13065 100 50218 13065 232 13065 228 13065 254 13065 232 25763 228 13065 236
23+
8000 253 227 301 4411 13 9919 251 119 2965 240 8000 239 109 26726 301 10186 3520 23869 302 45604 13 12284 255 232 301 2895 53752 810 1533 2920 4613 3565 13
24+
10564
25+
28137
26+
225 28137
27+
261 28137
28+
264 28137
29+
264 28137 287 28137
30+
301
31+
203 278
32+
11 225 3742
33+
10564 16 711 11 474 5 8294 1021 1212 9919 251 228 959 10133 23692 5928 9173 33543 1330 1254 13567 22873 44634 257
34+
23
35+
1103
36+
9581
37+
3303
38+
20428
39+
13652
40+
3303 9581
41+
8274
42+
8274 23
43+
319 655 7239 11489 274 6881 12642 16716 203 8000 253 227 301 4411 13 9919 251 119 2965 240 8000 239 109 26726 301 10186 3520 23869 302 45604 13 12284 255 232 9919 104 252 8000 104 252 795 8104 38292 795 9581 795 3303 795 20428 795 13652 795 3303 9581 795 18 23 795 419 23 795 1713 23 225 13065 227 50218 13065 246 25763 238 13065 242 25763 229 13065 249 13065 120 13065 258 25763 228 13065 258 13065 100 50218 13065 232 8000 251 228 959 10133 23692 5928 9173 33543 1330 1254 13567 22873 44634 257 36031 12434 16706 13633 1769 14501 54827 21893 3849 10107 13878 41078 7095 9107 30834 2678 1246 1246 40651 13911 5366 23681 7887 527 10105 3081 363 88 1505 1063 1476 2866 16 363 495 1212 4509 35 363 49 691 4509 527 9104 2554 605 16 363 40 1212 4156 2681 4594 69 35 2893 11 30247 323 11 80 48

models/ggml-vocab-jina-v2-de.gguf.inp

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
ied 4 ½ months
2+
__ggml_vocab_test__
3+
Führer
4+
__ggml_vocab_test__
5+
6+
__ggml_vocab_test__
7+
8+
__ggml_vocab_test__
9+
10+
__ggml_vocab_test__
11+
12+
__ggml_vocab_test__
13+
14+
__ggml_vocab_test__
15+
16+
17+
__ggml_vocab_test__
18+
19+
20+
21+
__ggml_vocab_test__
22+
23+
24+
25+
26+
__ggml_vocab_test__
27+
28+
29+
__ggml_vocab_test__
30+
Hello world
31+
__ggml_vocab_test__
32+
Hello world
33+
__ggml_vocab_test__
34+
Hello World
35+
__ggml_vocab_test__
36+
Hello World
37+
__ggml_vocab_test__
38+
Hello World!
39+
__ggml_vocab_test__
40+
Hello, world!
41+
__ggml_vocab_test__
42+
Hello, world!
43+
__ggml_vocab_test__
44+
this is 🦙.cpp
45+
__ggml_vocab_test__
46+
w048 7tuijk dsdfhu
47+
__ggml_vocab_test__
48+
нещо на Български
49+
__ggml_vocab_test__
50+
កាន់តែពិសេសអាចខលចេញ
51+
__ggml_vocab_test__
52+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
53+
__ggml_vocab_test__
54+
Hello
55+
__ggml_vocab_test__
56+
Hello
57+
__ggml_vocab_test__
58+
Hello
59+
__ggml_vocab_test__
60+
Hello
61+
__ggml_vocab_test__
62+
Hello
63+
__ggml_vocab_test__
64+
Hello
65+
Hello
66+
__ggml_vocab_test__
67+
(
68+
__ggml_vocab_test__
69+
70+
=
71+
__ggml_vocab_test__
72+
' era
73+
__ggml_vocab_test__
74+
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
75+
__ggml_vocab_test__
76+
3
77+
__ggml_vocab_test__
78+
33
79+
__ggml_vocab_test__
80+
333
81+
__ggml_vocab_test__
82+
3333
83+
__ggml_vocab_test__
84+
33333
85+
__ggml_vocab_test__
86+
333333
87+
__ggml_vocab_test__
88+
3333333
89+
__ggml_vocab_test__
90+
33333333
91+
__ggml_vocab_test__
92+
333333333
93+
__ggml_vocab_test__
94+
95+
96+
97+
98+
99+
100+
101+
102+
103+
104+
105+
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
106+
__ggml_vocab_test__

models/ggml-vocab-jina-v2-de.gguf.out

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
1009 699 35137 3294
2+
39832 261
3+
4+
225
5+
6733
6+
53448
7+
202
8+
203
9+
203 203
10+
203 203 203
11+
202 203
12+
17964 1568
13+
29546 1568
14+
17964 3519
15+
29546 3519
16+
29546 3519 5
17+
17964 16 1568 5
18+
29546 16 1568 5
19+
555 337 5060 104 252 18 71 428
20+
91 3079 28 964 30274 48013 267 87 6649 14811
21+
9024 6983 146 236 6294 52261 4933 244 146 237 13905 32390 46632 51078
22+
162 257 227 162 257 119 162 257 246 162 258 238 162 257 242 162 258 229 162 257 249 162 257 120 162 257 258 162 258 228 162 257 258 162 257 100 162 257 119 162 257 232 162 257 228 162 257 254 162 257 232 162 258 228 162 257 236
23+
3753 253 227 406 17453 13 10278 119 54678 3753 239 109 16598 406 52806 1504 5752 78 276 2365 851 697 13 38607 406 20529 5752 12069 413 671 983 1469 30658 13
24+
17964
25+
29546
26+
225 29546
27+
6733 29546
28+
53448 29546
29+
53448 29546 203 53448 29546
30+
406
31+
203 3887
32+
11 15453
33+
17964 16 361 11 476 5 1953 459 426 10278 228 4985 167 235 244 167 230 116 57520 106 33974 166 120 103 46520 255 2281 2237 42047 47551 107 176 126 257
34+
23
35+
3837
36+
45768
37+
3837 3837
38+
3837 45768
39+
3837 3837 3837
40+
3837 3837 45768
41+
3837 3837 3837 3837
42+
3837 3837 3837 45768
43+
203 225 203 203 225 203 203 203 225 202 225 202 202 225 202 203 6733 203 53448 203 13607 203 13607 225 203 3753 253 227 406 17453 13 10278 119 54678 3753 239 109 16598 406 52806 1504 5752 78 276 2365 851 697 13 38607 5060 104 252 3753 104 252 589 8235 54381 589 45768 54381 3837 54381 45768 54381 3837 3837 54381 3837 45768 589 18 23 589 466 23 589 714 23 34376 257 227 162 257 119 162 257 246 162 258 238 162 257 242 162 258 229 162 257 249 162 257 120 162 257 258 162 258 228 162 257 258 162 257 100 162 257 119 162 257 232 32164 228 4985 167 235 244 167 230 116 57520 106 33974 166 120 103 46520 255 2281 2237 42047 47551 107 176 126 257 485 6624 17 30007 14589 33 36028 6983 146 236 6294 52261 4933 244 146 237 13905 32390 46632 51078 1268 12228 12228 11 51396 51396 51396 68 30699 30699 21828 11344 1844 20800 4300 324 1990 927 1268 88 939 540 507 899 16 1268 3136 426 2158 35 1268 49 586 2158 324 2202 1066 436 16 1268 40 426 917 822 11788 35 628 11 30868 264 11 80 48

0 commit comments

Comments
 (0)