
Commit 80acb7b

Rename Olmo1124 to Olmo2 (ggml-org#10500)
1 parent 10bce04 commit 80acb7b

4 files changed: +29 −29 lines


convert_hf_to_gguf.py

Lines changed: 3 additions & 3 deletions
@@ -3040,9 +3040,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]


-@Model.register("Olmo1124ForCausalLM")
-class Olmo1124Model(Model):
-    model_arch = gguf.MODEL_ARCH.OLMO_1124
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2


 @Model.register("OlmoeForCausalLM")

gguf-py/gguf/constants.py

Lines changed: 3 additions & 3 deletions
@@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
-    OLMO_1124 = auto()
+    OLMO2 = auto()
     OLMOE = auto()
     OPENELM = auto()
     ARCTIC = auto()
@@ -405,7 +405,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
-    MODEL_ARCH.OLMO_1124: "olmo_1124",
+    MODEL_ARCH.OLMO2: "olmo2",
     MODEL_ARCH.OLMOE: "olmoe",
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
@@ -1071,7 +1071,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
-    MODEL_ARCH.OLMO_1124: [
+    MODEL_ARCH.OLMO2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.OUTPUT,
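The enum member and its string travel together: gguf-py writes the MODEL_ARCH_NAMES value into the converted file's architecture metadata, and llama.cpp resolves the same string through LLM_ARCH_NAMES (see the src/llama.cpp hunks below), so both sides must agree on "olmo2". A reduced sketch of the enum-plus-name-map pattern, keeping only the entries relevant here (the real module lists every supported architecture):

# Reduced sketch of the pattern in gguf-py/gguf/constants.py; entries trimmed for brevity.
from enum import IntEnum, auto

class MODEL_ARCH(IntEnum):
    OLMO  = auto()
    OLMO2 = auto()   # renamed from OLMO_1124 in this commit
    OLMOE = auto()

MODEL_ARCH_NAMES = {
    MODEL_ARCH.OLMO:  "olmo",
    MODEL_ARCH.OLMO2: "olmo2",   # string recorded in GGUF metadata; must match llama.cpp
    MODEL_ARCH.OLMOE: "olmoe",
}

print(MODEL_ARCH_NAMES[MODEL_ARCH.OLMO2])  # -> olmo2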

gguf-py/gguf/tensor_mapping.py

Lines changed: 14 additions & 14 deletions
@@ -13,7 +13,7 @@ class TensorNameMap:
             "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
             "transformer.word_embeddings", # falcon
             "word_embeddings", # bloom
-            "model.embed_tokens", # llama-hf nemotron olmoe olmo_1124
+            "model.embed_tokens", # llama-hf nemotron olmoe olmo2
             "tok_embeddings", # llama-pth
             "embeddings.word_embeddings", # bert nomic-bert
             "language_model.embedding.word_embeddings", # persimmon
@@ -54,7 +54,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124
+            "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
             "output", # llama-pth bloom internlm2
             "word_embeddings_for_head", # persimmon
             "lm_head.linear", # phi2
@@ -66,7 +66,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
             "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
-            "model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124
+            "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
             "norm", # llama-pth
             "transformer.norm_f", # mpt dbrx
             "ln_f", # refact bloom qwen gpt2
@@ -145,7 +145,7 @@ class TensorNameMap:

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wq", # llama-pth
             "encoder.layer.{bid}.attention.self.query", # bert
             "transformer.h.{bid}.attn.q_proj", # gpt-j
@@ -157,7 +157,7 @@ class TensorNameMap:

         # Attention key
         MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wk", # llama-pth
             "encoder.layer.{bid}.attention.self.key", # bert
             "transformer.h.{bid}.attn.k_proj", # gpt-j
@@ -170,7 +170,7 @@ class TensorNameMap:

         # Attention value
         MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wv", # llama-pth
             "encoder.layer.{bid}.attention.self.value", # bert
             "transformer.h.{bid}.attn.v_proj", # gpt-j
@@ -188,7 +188,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.attn.out_proj", # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "h.{bid}.self_attention.dense", # bloom
-            "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124
+            "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
             "layers.{bid}.attention.wo", # llama-pth
             "encoder.layer.{bid}.attention.output.dense", # bert
             "transformer.h.{bid}.attn.out_proj", # gpt-j
@@ -215,7 +215,7 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124
+            "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
         ),

         # Rotary embeddings
@@ -250,7 +250,7 @@ class TensorNameMap:

         # Post feed-forward norm
         MODEL_TENSOR.FFN_POST_NORM: (
-            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
@@ -273,7 +273,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.up_proj", # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
             "h.{bid}.mlp.dense_h_to_4h", # bloom
-            "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124
+            "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3", # llama-pth
             "encoder.layer.{bid}.intermediate.dense", # bert
             "transformer.h.{bid}.mlp.fc_in", # gpt-j
@@ -314,7 +314,7 @@ class TensorNameMap:

         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124
+            "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
             "layers.{bid}.feed_forward.w1", # llama-pth
             "transformer.h.{bid}.mlp.w2", # qwen
             "transformer.h.{bid}.mlp.c_fc2", # jais
@@ -346,7 +346,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
             "h.{bid}.mlp.dense_4h_to_h", # bloom
-            "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124
+            "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2", # llama-pth
             "encoder.layer.{bid}.output.dense", # bert
             "transformer.h.{bid}.mlp.fc_out", # gpt-j
@@ -383,7 +383,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm", # persimmon
-            "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124
+            "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.q_ln", # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
             "transformer.layers.{bid}.attn.q_norm", # openelm
@@ -392,7 +392,7 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm", # persimmon
-            "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124
+            "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
             "transformer.blocks.{bid}.attn.k_ln", # sea-lion
             "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
             "transformer.layers.{bid}.attn.k_norm", # openelm

src/llama.cpp

Lines changed: 9 additions & 9 deletions
@@ -179,7 +179,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
-    LLM_ARCH_OLMO_1124,
+    LLM_ARCH_OLMO2,
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
@@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_DBRX, "dbrx" },
     { LLM_ARCH_OLMO, "olmo" },
-    { LLM_ARCH_OLMO_1124, "olmo_1124" },
+    { LLM_ARCH_OLMO2, "olmo2" },
     { LLM_ARCH_OLMOE, "olmoe" },
     { LLM_ARCH_OPENELM, "openelm" },
     { LLM_ARCH_ARCTIC, "arctic" },
@@ -1210,7 +1210,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         },
     },
     {
-        LLM_ARCH_OLMO_1124,
+        LLM_ARCH_OLMO2,
         {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@@ -5900,7 +5900,7 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -8593,7 +8593,7 @@ static bool llm_load_tensors(
                     layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                 }
             } break;
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
            {
                 model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@@ -14483,7 +14483,7 @@ struct llm_build_context {
         return gf;
     }

-    struct ggml_cgraph * build_olmo_1124() {
+    struct ggml_cgraph * build_olmo2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16799,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
             {
-                result = llm.build_olmo_1124();
+                result = llm.build_olmo2();
             } break;
         case LLM_ARCH_OLMOE:
             {
@@ -20084,7 +20084,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
-        case LLM_ARCH_OLMO_1124:
+        case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
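On the llama.cpp side the rename runs through the architecture enum, the name table, hparams and tensor loading, and the graph-builder dispatch, so a GGUF file whose architecture metadata reads "olmo2" is routed to build_olmo2(). A file converted before this change still carries the old "olmo_1124" string and would presumably no longer be recognized, so re-converting such checkpoints is likely required. A rough Python analogy of that string-keyed dispatch follows (illustrative only, not llama.cpp code, which dispatches on the enum after resolving the string once at load time):

# Illustrative analogy of llama_build_graph's arch-keyed dispatch; names simplified.
def build_olmo2() -> str:
    return "olmo2 compute graph"

def build_olmoe() -> str:
    return "olmoe compute graph"

GRAPH_BUILDERS = {
    "olmo2": build_olmo2,   # keyed by "olmo_1124" before this commit
    "olmoe": build_olmoe,
}

def build_graph(architecture: str) -> str:
    try:
        return GRAPH_BUILDERS[architecture]()
    except KeyError:
        raise ValueError(f"unknown model architecture: {architecture!r}")

print(build_graph("olmo2"))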
