
Commit 06748ff

llama: honor add_space_prefix from the model configuration
Propagate the add_prefix_space setting from the HF tokenizer configuration (tokenizer_config.json) to the GGUF file and honor it with the gpt2 tokenizer.

Signed-off-by: Giuseppe Scrivano <[email protected]>
1 parent 120f7bf commit 06748ff

File tree: 2 files changed (+12, -0 lines)

convert-hf-to-gguf.py

Lines changed: 7 additions & 0 deletions

@@ -1315,6 +1315,13 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
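
For readers who want to try the converter-side logic in isolation, here is a minimal sketch of what the added lines do, using the gguf Python package that ships with this repository. The model directory, output path, and architecture string are placeholders, not part of the commit, and the real converter writes many more keys plus the tensors before closing the file.

import json
from pathlib import Path

import gguf

model_dir = Path("models/my-hf-model")         # placeholder HF model directory
writer = gguf.GGUFWriter("out.gguf", "llama")  # placeholder output path / arch

tokenizer_config_file = model_dir / "tokenizer_config.json"
if tokenizer_config_file.is_file():
    with open(tokenizer_config_file, "r", encoding="utf-8") as f:
        tokenizer_config_json = json.load(f)
    # Mirror the diff above: only emit the GGUF key when the HF config sets it,
    # so the llama.cpp loader can fall back to its default otherwise.
    if "add_prefix_space" in tokenizer_config_json:
        writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])

# The real converter goes on to add the remaining metadata and the tensors
# before writing and closing the file.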

llama.cpp

Lines changed: 5 additions & 0 deletions

@@ -4494,6 +4494,11 @@ static void llm_load_vocab(
         } else {
             if (tokenizer_model == "gpt2") {
                 vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+                const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+                if (add_space_prefix_keyidx != -1) {
+                    vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+                }
             } else {
                 LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
                 LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
