
Commit f15ea2c

Merge branch 'master' into merge-to-upstream-v2

2 parents: 6e15f9b + d94c6e0

12 files changed: +71 additions, -62 deletions

convert_hf_to_gguf.py

Lines changed: 19 additions & 7 deletions
```diff
@@ -239,6 +239,10 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
 
+        if (head_dim := self.hparams.get("head_dim")) is not None:
+            self.gguf_writer.add_key_length(head_dim)
+            self.gguf_writer.add_value_length(head_dim)
+
         self.gguf_writer.add_file_type(self.ftype)
         logger.info(f"gguf: file type = {self.ftype}")
 
@@ -596,6 +600,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
             # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
             res = "tekken"
+        if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+            # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+            res = "smollm"
 
         if res is None:
             logger.warning("\n")
@@ -736,7 +743,7 @@ def _create_vocab_sentencepiece(self):
            added_tokens_json = json.load(f)
            for key in added_tokens_json:
                token_id = added_tokens_json[key]
-               if (token_id >= vocab_size):
+               if token_id >= vocab_size:
                    logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue
 
@@ -1484,7 +1491,12 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+        if "head_dim" in hparams:
+            rope_dim = hparams["head_dim"]
+        else:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1999,7 +2011,7 @@ def set_vocab(self):
 
            for key in added_tokens_json:
                token_id = added_tokens_json[key]
-               if (token_id >= vocab_size):
+               if token_id >= vocab_size:
                    logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue
 
@@ -2075,7 +2087,7 @@ def set_gguf_parameters(self):
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
+        if rope_scaling is None:
             return
 
         scale = max_pos_embds / orig_max_pos_embds
@@ -2722,7 +2734,7 @@ def get_tensors(self):
 
            yield name, data
 
-    def set_vocab(self, *args, **kwargs):
+    def set_vocab(self):
        tokenizer_class = 'BertTokenizer'
        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
            tokenizer_class = json.load(f)['tokenizer_class']
@@ -2870,7 +2882,7 @@ def set_vocab(self):
            added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
            for token_id, token_json in added_tokens_decoder.items():
                token_id = int(token_id)
-               if (token_id >= vocab_size):
+               if token_id >= vocab_size:
                    logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue
 
@@ -3119,7 +3131,7 @@ def set_vocab(self):
            added_tokens_json = json.load(f)
            for key in added_tokens_json:
                token_id = added_tokens_json[key]
-               if (token_id >= vocab_size):
+               if token_id >= vocab_size:
                    logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                    continue
```
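
The two `head_dim` changes above share the same fallback idea: prefer an explicit `head_dim` from the model's config, otherwise derive it from `hidden_size` and `num_attention_heads`. A minimal sketch, assuming a Hugging Face-style hparams dict with hypothetical SmolLM-like values:

```python
# Minimal sketch of the head_dim / rope_dim fallback; `hparams` stands in for a
# Hugging Face config.json and the values below are hypothetical.
hparams = {"hidden_size": 576, "num_attention_heads": 9}

if "head_dim" in hparams:
    rope_dim = hparams["head_dim"]
else:
    rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

print(rope_dim)  # 576 // 9 == 64 when head_dim is absent
```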

convert_hf_to_gguf_update.py

Lines changed: 7 additions & 6 deletions
```diff
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
@@ -93,6 +93,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
     {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]
 
 
@@ -101,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")
 
 
@@ -161,7 +162,7 @@ def download_model(model):
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded
 
-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
     logger.info(f"model: {name}")
@@ -193,7 +194,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
     # use in llama.cpp to implement the same pre-tokenizer
 
-    chktxt = {repr(chktxt)}
+    chktxt = {repr(CHK_TXT)}
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -289,7 +290,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     "333333333",
     "Cửa Việt", # llama-bpe fails on this
     " discards",
-    chktxt,
+    CHK_TXT,
 ]
 
 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
```
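
For context, the `chkhsh` that the renamed `CHK_TXT` feeds into is a SHA-256 over the stringified token IDs, which `convert_hf_to_gguf.py` later matches in `get_vocab_base_pre`. A minimal sketch of that hashing step, with hypothetical token IDs standing in for `tokenizer.encode(CHK_TXT)`:

```python
from hashlib import sha256

# Hypothetical token IDs standing in for tokenizer.encode(CHK_TXT); the real
# script hashes whatever the model's tokenizer produces for the check string.
chktok = [9707, 11, 1879, 0]

chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)  # this digest is what get_vocab_base_pre compares against
```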

convert_llama_ggml_to_gguf.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -132,6 +132,10 @@ def load(self, data, offset):
 
 
 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
@@ -290,7 +294,7 @@ def add_vocab(self, gguf_writer):
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
```

examples/llama.android/llama/src/main/cpp/llama-android.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
     }
 
     auto new_token_chars = llama_token_to_piece(context, new_token_id);
```

examples/server/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -444,7 +444,7 @@ node index.js
 
 `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
 
-`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
 By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
 `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
```
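
As an illustration of the `n_predict`/`n_keep` parameters documented above, a minimal request sketch against a locally running server (host, port, prompt, and values are assumptions, not part of this change):

```python
import json
import urllib.request

# Hypothetical request to a local llama.cpp server; the /completion endpoint and
# the n_predict / n_keep fields follow the README section above.
payload = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,  # stop after at most 64 generated tokens
    "n_keep": -1,     # keep the whole prompt (excluding BOS) on context overflow
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["content"])
```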

flake.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default.

ggml/src/ggml-quants.c

Lines changed: 1 addition & 1 deletion
```diff
@@ -4748,7 +4748,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
+        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
 
 #elif defined(__POWER9_VECTOR__)
```
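
The fix above matters because in block-quantized dot products each block's integer sum must be scaled by that block's own scales; indexing the scales with the wrong variable pairs `sumi` with stale scales. A toy Python sketch of the block-wise accumulation (not the real q5_0/q8_0 layouts):

```python
# Toy stand-in for the block-wise dot product: each block is (scale, quantized ints).
def blockwise_dot(x_blocks, y_blocks):
    sumf = 0.0
    for (dx, xq), (dy, yq) in zip(x_blocks, y_blocks):
        sumi = sum(a * b for a, b in zip(xq, yq))
        sumf += dx * dy * sumi  # scales must come from the current block (the fix above)
    return sumf

print(blockwise_dot([(0.5, [1, -2, 3])], [(0.25, [4, 0, -1])]))  # 0.5 * 0.25 * 1 = 0.125
```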

include/llama.h

Lines changed: 3 additions & 2 deletions
```diff
@@ -92,8 +92,9 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CHATGLM4  = 17,
         LLAMA_VOCAB_PRE_TYPE_VIKING    = 18,
         LLAMA_VOCAB_PRE_TYPE_JAIS      = 19,
-        LLAMA_VOCAB_PRE_TYPE_CODESHELL = 20,
-        LLAMA_VOCAB_PRE_TYPE_TEKKEN    = 21,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN    = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM    = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
     };
 
     // note: these values should be synchronized with ggml_rope
```

models/ggml-vocab-gpt2.gguf

-1.68 MB
Binary file not shown.

models/ggml-vocab-stablelm.gguf

-1.69 MB
Binary file not shown.

src/llama.cpp

Lines changed: 17 additions & 22 deletions
```diff
@@ -3707,7 +3707,7 @@ struct llama_model_loader {
         }
 
         if (param_overrides_p != nullptr) {
-            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+            for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
@@ -3875,7 +3875,7 @@ struct llama_model_loader {
         ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
 
         {
-            const int kid = gguf_find_key(meta, "general.file_type");
+            const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
             if (kid >= 0) {
                 ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
             }
@@ -5369,6 +5369,7 @@ static void llm_load_vocab(
         if (merges_keyidx == -1) {
             throw std::runtime_error("cannot find tokenizer merges in model file\n");
         }
+
         const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5407,16 +5408,6 @@ static void llm_load_vocab(
         vocab.special_cls_id = -1;
         vocab.special_mask_id = -1;
 
-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.
-
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5533,6 +5524,10 @@ static void llm_load_vocab(
                 vocab.tokenizer_clean_spaces  = false;
                 vocab.tokenizer_ignore_merges = true;
                 vocab.tokenizer_add_bos       = true;
+            } else if (
+                    tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5556,10 +5551,8 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
 
-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }
 
     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
@@ -6140,10 +6133,10 @@ static bool llm_load_tensors(
 
                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
                         // optional bias tensors
                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -15558,6 +15551,7 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -18292,8 +18286,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     // copy the KV pairs from the input file
     gguf_set_kv (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+
     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
```
