
Commit 2d50f81

Initial OpenELM support (270M only so far)
1 parent c1b295e commit 2d50f81

4 files changed: +322, -3 lines changed


convert-hf-to-gguf.py

Lines changed: 44 additions & 0 deletions
@@ -2395,6 +2395,50 @@ def set_vocab(self, *args, **kwargs):
         self.gguf_writer.add_add_eos_token(True)
 
 
+@Model.register("OpenELMForCausalLM")
+class OpenELMModel(Model):
+    model_arch = gguf.MODEL_ARCH.OPENELM
+
+    # Copied from LlamaModel
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_llama_hf()
+
+    def set_gguf_parameters(self):
+        # TODO: Look closer at these
+
+        self.gguf_writer.add_name("OpenELM")
+        self.block_count = self.find_hparam(["num_transformer_layers"])
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
+        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
+        n_embd = self.find_hparam(["model_dim"])
+        self.gguf_writer.add_embedding_length(n_embd)
+        head_dim = self.find_hparam(["head_dim"])
+        n_head = n_embd // head_dim
+        rot_pct = 1.0
+        self.gguf_writer.add_context_length(self.find_hparam(["max_context_length"]))
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_head_count_kv(n_head*10)
+        self.gguf_writer.add_head_count(n_head*10)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_feed_forward_length(0)  # dynamically calculated
+
+    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
+        # TODO: Read configuration!
+        if "n_layers" in keys:
+            return 16  # num_transformer_layers
+        if "hidden_size" in keys:
+            return 1280  # model_dim
+        if "num_attention_heads" in keys:
+            return 64  # head_dim
+
+        return super().find_hparam(keys, optional)
+
+
 ###### CONVERSION LOGIC ######
 
 
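For reference, with the hard-coded 270M values above (16 transformer layers, model_dim = 1280, head_dim = 64), the quantities that set_gguf_parameters derives work out as follows. This is only an arithmetic sketch of the code above, not part of the commit:

# Arithmetic sketch of the GGUF parameters derived from the hard-coded
# OpenELM-270M values in the diff above (not part of the commit).
model_dim = 1280   # "model_dim" in the OpenELM config
head_dim = 64      # "head_dim" in the OpenELM config
rot_pct = 1.0

n_head = model_dim // head_dim                 # 20
head_count = n_head * 10                       # 200, mirroring the n_head*10 placeholder above
rope_dim = int(rot_pct * model_dim) // n_head  # 64, i.e. equal to head_dim

print(n_head, head_count, rope_dim)            # -> 20 200 64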

gguf-py/gguf/constants.py

Lines changed: 14 additions & 0 deletions
@@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
     COMMAND_R = auto()
     DBRX = auto()
     OLMO = auto()
+    OPENELM = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -217,6 +218,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.COMMAND_R: "command-r",
     MODEL_ARCH.DBRX: "dbrx",
     MODEL_ARCH.OLMO: "olmo",
+    MODEL_ARCH.OPENELM: "openelm",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -743,6 +745,18 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.OPENELM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_DOWN,
+    ],
     # TODO
 }
 
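To see what the new OPENELM entry implies, the sketch below (not part of the commit) walks the registered tensor list and prints the corresponding GGUF name templates, assuming the MODEL_TENSORS and TENSOR_NAMES tables already defined in gguf-py/gguf/constants.py:

# Hypothetical snippet (not in the commit): list the GGUF tensor name
# templates registered for the new OPENELM architecture.
from gguf.constants import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES

for tensor in MODEL_TENSORS[MODEL_ARCH.OPENELM]:
    # Per-block tensors keep a "{bid}" placeholder for the block index.
    print(TENSOR_NAMES[tensor])
# Expected output includes names such as "token_embd", "output_norm",
# "blk.{bid}.attn_norm", "blk.{bid}.attn_qkv", "blk.{bid}.ffn_up", ...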

gguf-py/gguf/tensor_mapping.py

Lines changed: 14 additions & 3 deletions
@@ -24,6 +24,7 @@ class TensorNameMap:
             "backbone.embedding",            # mamba
             "backbone.embeddings",           # mamba-hf
             "transformer.in_out_embed",      # Grok
+            "transformer.token_embeddings",  # openelm
         ),
 
         # Token type embeddings
@@ -36,6 +37,7 @@ class TensorNameMap:
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",       # bert
             "emb_ln",                     # nomic-bert
+            "transformer.norm",           # openelm
         ),
 
         # Position embeddings
@@ -68,6 +70,7 @@ class TensorNameMap:
             "model.norm_f",          # mamba-qbert
             "backbone.norm_f",       # mamba
             "transformer.rms_norm",  # Grok
+            "transformer.norm",      # openelm
         ),
 
         # Rope frequencies
@@ -97,6 +100,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.norm",                      # mamba
             "transformer.decoder_layer.{bid}.rms_norm",        # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_1",  # dbrx
+            "transformer.layers.{bid}.attn_norm",              # openelm
         ),
 
         # Attention norm 2
@@ -117,7 +121,8 @@ class TensorNameMap:
             "h.{bid}.attn.c_attn",                     # gpt2
             "transformer.h.{bid}.mixer.Wqkv",          # phi2
             "encoder.layers.{bid}.attn.Wqkv",          # nomic-bert
-            "model.layers.{bid}.self_attn.qkv_proj"    # phi3
+            "model.layers.{bid}.self_attn.qkv_proj",   # phi3
+            "transformer.layers.{bid}.attn.qkv_proj",  # openelm
         ),
 
         # Attention query
@@ -175,6 +180,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
             "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
             "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
+            "transformer.layers.{bid}.attn.out_proj",                       # openelm
         ),
 
         # Attention output norm
@@ -206,6 +212,7 @@ class TensorNameMap:
             "h.{bid}.ln_2",                                # gpt2
             "model.layers.{bid}.ffn_norm",                 # internlm2
             "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
+            "transformer.layers.{bid}.ffn_norm",           # openelm
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -244,6 +251,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",           # nomic-bert
             "model.layers.{bid}.mlp.c_fc",             # starcoder2
             "encoder.layer.{bid}.mlp.gated_layers_v",  # jina-bert-v2
+            "transformer.layers.{bid}.ffn.proj_1",     # openelm
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -306,6 +314,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc2",         # nomic-bert
             "model.layers.{bid}.mlp.c_proj",        # starcoder2
             "encoder.layer.{bid}.mlp.wo",           # jina-bert-v2
+            "transformer.layers.{bid}.ffn.proj_2",  # openelm
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -324,15 +333,17 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm",         # persimmon
             "model.layers.{bid}.self_attn.q_norm",              # cohere
             "transformer.blocks.{bid}.attn.q_ln",               # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_q"   # jina-bert-v2
+            "encoder.layer.{bid}.attention.self.layer_norm_q",  # jina-bert-v2
+            "transformer.layers.{bid}.attn.q_norm",             # openelm
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",         # persimmon
             "model.layers.{bid}.self_attn.k_norm",              # cohere
             "transformer.blocks.{bid}.attn.k_ln",               # sea-lion
-            "encoder.layer.{bid}.attention.self.layer_norm_k"   # jina-bert-v2
+            "encoder.layer.{bid}.attention.self.layer_norm_k",  # jina-bert-v2
+            "transformer.layers.{bid}.attn.k_norm",             # openelm
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
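As a sanity check of the new mappings, an OpenELM checkpoint tensor name can be resolved through TensorNameMap. The snippet below is a sketch, not part of the commit, and assumes the existing get_tensor_name_map() helper in gguf-py/gguf/tensor_mapping.py:

# Hypothetical check (not in the commit): map an OpenELM HF tensor name
# to its GGUF counterpart via the updated TensorNameMap table.
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

name_map = get_tensor_name_map(MODEL_ARCH.OPENELM, n_blocks=16)  # 16 layers in the 270M model
mapped = name_map.get_name("transformer.layers.0.attn.qkv_proj.weight",
                           try_suffixes=(".weight", ".bias"))
print(mapped)  # expected to resolve to something like "blk.0.attn_qkv.weight"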
