Commit 081fe43

llama : fix codeshell support (#8599)
* llama : fix codeshell support
* llama : move codeshell below smollm to respect the enum order
1 parent d94c6e0 commit 081fe43

File tree: 4 files changed, +11 −2 lines

convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
@@ -594,6 +594,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
             # ref: https://huggingface.co/core42/jais-13b
             res = "jais"
+        if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+            # ref: https://huggingface.co/WisdomShell/CodeShell-7B
+            res = "codeshell"
         if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
             # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
             res = "tekken"
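For context, each chkhsh above is a fingerprint of a tokenizer's pre-tokenization behavior: the converter encodes a fixed check string and hashes the resulting token IDs. A minimal sketch of that mechanism (the shortened check text and helper name here are illustrative, not copied from the script):

from hashlib import sha256
from transformers import AutoTokenizer

# Illustrative check text; the real script uses a much longer fixed string
# covering whitespace, digit, CJK and emoji edge cases.
CHK_TXT = "Hello World 3.14 \n\n 333333 …"

def compute_chkhsh(repo: str) -> str:
    # Hash the token-ID sequence, so tokenizers that pre-tokenize
    # differently produce different fingerprints.
    tokenizer = AutoTokenizer.from_pretrained(repo)
    chktok = tokenizer.encode(CHK_TXT)
    return sha256(str(chktok).encode()).hexdigest()

# With the real check text from convert_hf_to_gguf.py, the tokenizer at
# WisdomShell/CodeShell-7B hashes to the "7b3e7548..." value matched above;
# with this shortened text the digest will of course differ.
print(compute_chkhsh("WisdomShell/CodeShell-7B"))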

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "gemma-2",   "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
     {"name": "jais",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5",        "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
     {"name": "tekken",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
     {"name": "smollm",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]
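This list is what drives the hash checks above: convert_hf_to_gguf_update.py downloads each listed tokenizer, computes its chkhsh, and regenerates the corresponding branches in convert_hf_to_gguf.py's get_vocab_base_pre. The invocation below is the usual pattern (shown as an assumption; see the script's own usage comment), passing a Hugging Face read token so the tokenizer files can be fetched:

python3 convert_hf_to_gguf_update.py <hf_token>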

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_JAIS      = 19,
         LLAMA_VOCAB_PRE_TYPE_TEKKEN    = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM    = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
     };

     // note: these values should be synchronized with ggml_rope

src/llama.cpp

Lines changed: 6 additions & 2 deletions
@@ -5007,7 +5007,7 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
-                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;

@@ -5525,6 +5525,9 @@ static void llm_load_vocab(
                 tokenizer_pre == "smollm") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }

@@ -15548,6 +15551,7 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",

@@ -19447,7 +19451,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:

@@ -19477,6 +19480,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
            return LLAMA_ROPE_TYPE_NEOX;

        // all model arches should be listed explicitly here
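The tokenizer hunk above routes CodeShell through the same GPT-2-style split patterns already shared by Refact, Command-R and SmolLM. A rough, self-contained approximation of that sequential regex splitting in Python (using the third-party regex package for \p{...} classes; the helper name and sample text are illustrative):

import regex  # third-party "regex" package; supports \p{L}, \p{N} classes

# The split patterns CodeShell now shares: individual digits first,
# then a GPT-2-style word/number/punctuation/space split.
PATTERNS = [
    r"\p{N}",
    r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)",
]

def pre_tokenize(text: str) -> list[str]:
    # Each pattern further splits every fragment; matched and unmatched
    # spans are both kept, so later patterns see the leftover text too.
    fragments = [text]
    for pat in PATTERNS:
        split = []
        for frag in fragments:
            last = 0
            for m in regex.finditer(pat, frag):
                if m.start() > last:
                    split.append(frag[last:m.start()])
                split.append(m.group(0))
                last = m.end()
            if last < len(frag):
                split.append(frag[last:])
        fragments = [f for f in split if f]
    return fragments

print(pre_tokenize("x1 = foo(42) + bar"))
# -> ['x', '1', ' =', ' foo', '(', '4', '2', ')', ' +', ' bar']

These fragments are what the BPE merges then operate on; the sketch only loosely mirrors the C++ splitting, but it shows why digits end up as single-character pieces for this pre-tokenizer type.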
