
Commit ff8d34b

llama: define architecture for small granite models
It works only for the small models, 3b and 8b. There are enough differences from the base llama arch that it is worth defining a new architecture. To create the .gguf files, it is necessary to specify GraniteSmallForCausalLM in the architectures field of the hf model.

Signed-off-by: Giuseppe Scrivano <[email protected]>
1 parent c275bc6 commit ff8d34b
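The conversion step hinges on that last point: convert-hf-to-gguf.py picks its converter class from the architectures entry in the checkpoint's config.json, and the class added in this commit is registered only for GraniteSmallForCausalLM. Below is a minimal sketch of relabeling a local Granite checkpoint before conversion; the directory name and the plain JSON edit are illustrative and not part of this commit.

```python
import json
from pathlib import Path

# Hypothetical path to a locally downloaded Granite checkpoint (illustrative only).
model_dir = Path("granite-3b-code-base")
config_path = model_dir / "config.json"

config = json.loads(config_path.read_text())

# The converter class added in this commit is registered for this exact string,
# so config.json has to advertise it before running convert-hf-to-gguf.py.
config["architectures"] = ["GraniteSmallForCausalLM"]

config_path.write_text(json.dumps(config, indent=2) + "\n")
```

After that, running convert-hf-to-gguf.py on the checkpoint directory should dispatch to the GraniteModel class shown below.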

File tree: 3 files changed, +98 -0 lines

convert-hf-to-gguf.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -2427,6 +2427,34 @@ def set_vocab(self, *args, **kwargs):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
+@Model.register("GraniteSmallForCausalLM")
+class GraniteModel(Model):
+    model_arch = gguf.MODEL_ARCH.GRANITE_SMALL
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_vocab(self):
+        tokens, toktypes, _ = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre("starcoder")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_name("GraniteSmall")
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        return [(self.map_tensor_name(name), data_torch)]
+
 
 ###### CONVERSION LOGIC ######
```
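For reference, here is a simplified sketch of the registry pattern that @Model.register relies on: the decorator maps the HF architecture string to the converter subclass, and the converter later looks the class up from config.json. This is an illustrative sketch of the dispatch idea, not the actual convert-hf-to-gguf.py implementation.

```python
from __future__ import annotations


class Model:
    # Maps HF "architectures" strings to converter classes (sketch only).
    _registry: dict[str, type[Model]] = {}

    @classmethod
    def register(cls, *names: str):
        def wrapper(subclass: type[Model]) -> type[Model]:
            for name in names:
                cls._registry[name] = subclass
            return subclass
        return wrapper

    @classmethod
    def from_architecture(cls, name: str) -> type[Model]:
        # The real converter reads architectures[0] from config.json and does a lookup like this.
        return cls._registry[name]


@Model.register("GraniteSmallForCausalLM")
class GraniteModel(Model):
    model_arch = "granite-small"  # stands in for gguf.MODEL_ARCH.GRANITE_SMALL


print(Model.from_architecture("GraniteSmallForCausalLM").model_arch)  # granite-small
```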

gguf-py/gguf/constants.py

Lines changed: 26 additions & 0 deletions
```diff
@@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    GRANITE_SMALL = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -218,6 +219,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.GRANITE_SMALL: "granite-small",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -732,6 +734,26 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE_SMALL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
@@ -765,6 +787,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.GRANITE_SMALL: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }
 
 #
```
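A quick way to sanity-check the new registrations is to query the gguf-py tables directly. The dictionary names used below (MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES) follow gguf-py's layout but are not shown verbatim in the hunks above, so treat this as a sketch against a tree that includes this commit.

```python
# Sketch: inspect the new arch registration, assuming gguf-py from this tree is
# importable and that these dictionary names match constants.py (assumption).
from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES

arch = MODEL_ARCH.GRANITE_SMALL
print(MODEL_ARCH_NAMES[arch])                              # expected: "granite-small"
print([TENSOR_NAMES[t] for t in MODEL_TENSORS[arch]][:4])  # token_embd, output_norm, ...
```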

llama.cpp

Lines changed: 44 additions & 0 deletions
```diff
@@ -225,6 +225,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_GRANITE_SMALL,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_DBRX, "dbrx" },
     { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_GRANITE_SMALL, "granite-small" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1036,6 +1038,32 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_SMALL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -4288,6 +4316,16 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GRANITE_SMALL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
+                    case 36: model.type = e_model::MODEL_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -4397,6 +4435,9 @@ static void llm_load_vocab(
     } else {
         if (tokenizer_model == "gpt2") {
             vocab.type = LLAMA_VOCAB_TYPE_BPE;
+            if (model.arch == LLM_ARCH_GRANITE_SMALL) {
+                vocab.add_space_prefix = false;
+            }
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -4967,6 +5008,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE_SMALL:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -10668,6 +10710,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE_SMALL:
             {
                 result = llm.build_llama();
             } break;
@@ -15811,6 +15854,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GRANITE_SMALL:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
```
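On the C++ side the new arch reuses the llama compute graph (build_llama()) and the NEOX RoPE type, so a converted file can be smoke-tested like any other GGUF. Below is a sketch using the llama-cpp-python bindings; the model path and prompt are illustrative, and it assumes a binding build whose bundled llama.cpp already contains this commit.

```python
# Smoke test of a converted granite-small GGUF (path and prompt are illustrative;
# requires a llama-cpp-python build whose bundled llama.cpp includes this commit).
from llama_cpp import Llama

llm = Llama(model_path="granite-3b-code-base.gguf", n_ctx=2048)
out = llm("def fibonacci(n):", max_tokens=48)
print(out["choices"][0]["text"])
```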
