
add SEA-LION support #6448


Merged: 6 commits, Apr 3, 2024
1 change: 1 addition & 0 deletions README.md
@@ -117,6 +117,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)

**Multimodal models:**

15 changes: 14 additions & 1 deletion convert-hf-to-gguf.py
@@ -511,6 +511,16 @@ def write_tensors(self):
class MPTModel(Model):
    model_arch = gguf.MODEL_ARCH.MPT

    def set_vocab(self):
        try:
            self._set_vocab_gpt2()
        except:
HanClinto (Collaborator) commented on Apr 4, 2024:

@bryanSwk Why would `_set_vocab_gpt2()` ever fail, and why did you add this `except`? I'm trying to understand what the `except` clause is doing here, whether we should keep it, and whether it should be qualified a bit more. Because this `except` is open-ended, it is breaking the CI linter. We can fix that by changing it to `except Exception:`, but I'd prefer to understand why this branch is here.

bryanSwk (Contributor Author) replied:

Hi @HanClinto, my intention with this try/except is to differentiate the SEA-LION variant of MPT, which uses an SPM (SentencePiece) tokenizer.

There isn't a differentiating field in the config.json for SEA-LION 7B, as it also uses the MPTForCausalLM class. Hence, this is just a fallback for the SEA-LION model.
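For context, a minimal consolidated sketch of the fallback being discussed, assuming the bare `except:` is tightened to `except Exception:` as suggested above; the method names, writer calls, and special-token ids come from this PR's diff, and the surrounding `Model` base class from convert-hf-to-gguf.py is assumed:

```python
class MPTModel(Model):
    model_arch = gguf.MODEL_ARCH.MPT

    def set_vocab(self):
        try:
            # Standard MPT checkpoints ship a GPT-2-style BPE tokenizer.
            self._set_vocab_gpt2()
        except Exception:
            # SEA-LION reuses MPTForCausalLM but ships a SentencePiece tokenizer,
            # so the BPE path fails and we fall back to SPM with SEA-LION's
            # special-token ids (pad=3, eos=1, unk=0) and no auto-added BOS.
            self._set_vocab_sentencepiece()
            self.gguf_writer.add_add_bos_token(False)
            self.gguf_writer.add_pad_token_id(3)
            self.gguf_writer.add_eos_token_id(1)
            self.gguf_writer.add_unk_token_id(0)
```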

HanClinto (Collaborator) replied:

Very helpful, thank you!

Do you happen to know what kind of exception it will throw in that case? If not, I can just change it to `except Exception:` and it should still pass lint.

bryanSwk (Contributor Author) replied:

Yes, `except Exception:` will work; you can go ahead and make the change.

Thank you!

HanClinto (Collaborator) replied:

Sounds good! Feel free to make any suggestions or wording changes on this PR: #6470

I'm not very familiar with SEA-LION, so I welcome any adjustments you may have that would make things clearer. :)

            self._set_vocab_sentencepiece()
            self.gguf_writer.add_add_bos_token(False)
            self.gguf_writer.add_pad_token_id(3)
            self.gguf_writer.add_eos_token_id(1)
            self.gguf_writer.add_unk_token_id(0)

    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
@@ -524,7 +534,10 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
-       self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+       if self.hparams["attn_config"]["alibi"]:
+           self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+       else:
+           self.gguf_writer.add_max_alibi_bias(0.0)

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
3 changes: 3 additions & 0 deletions gguf-py/gguf/constants.py
@@ -367,6 +367,9 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_ACT,
        MODEL_TENSOR.ATTN_Q_NORM,
        MODEL_TENSOR.ATTN_K_NORM,
        MODEL_TENSOR.POS_EMBD,
    ],
    MODEL_ARCH.GPTJ: [
        MODEL_TENSOR.TOKEN_EMBD,
2 changes: 2 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -289,11 +289,13 @@ class TensorNameMap:
        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
            "model.layers.{bid}.self_attn.q_layernorm", # persimmon
            "transformer.blocks.{bid}.attn.q_ln", # sea-lion
        ),

        MODEL_TENSOR.ATTN_K_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
            "model.layers.{bid}.self_attn.k_layernorm", # persimmon
            "transformer.blocks.{bid}.attn.k_ln", # sea-lion
        ),

        MODEL_TENSOR.ROPE_FREQS: (
48 changes: 45 additions & 3 deletions llama.cpp
@@ -585,6 +585,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
            { LLM_TENSOR_POS_EMBD, "position_embd" },
            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
        },
    },
    {
@@ -4743,6 +4746,7 @@ static bool llm_load_tensors(
            case LLM_ARCH_MPT:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);

                    // output
                    {
@@ -4781,6 +4785,12 @@ static bool llm_load_tensors(
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);

                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);

                        // AWQ ScaleActivation layer
                        layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
                    }
@@ -7595,6 +7605,7 @@ struct llm_build_context {
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

        struct ggml_tensor * cur;
        struct ggml_tensor * pos;
        struct ggml_tensor * inpL;

        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -7605,6 +7616,16 @@ struct llm_build_context {
        // positions of the tokens in the KV cache
        struct ggml_tensor * KQ_pos = build_inp_KQ_pos();

        if (model.pos_embd) {
            // inp_pos - contains the positions
            struct ggml_tensor * inp_pos = build_inp_pos();
            pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
            cb(pos, "pos_embd", -1);

            inpL = ggml_add(ctx0, inpL, pos);
            cb(inpL, "inpL", -1);
        }

        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * attn_norm;

@@ -7639,11 +7660,32 @@ struct llm_build_context {
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);

-               Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+               // Q/K Layernorm
+               if (model.layers[il].attn_q_norm) {
+                   Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                           model.layers[il].attn_q_norm,
+                           model.layers[il].attn_q_norm_b,
+                           LLM_NORM, cb, il);
+                   cb(Qcur, "Qcur", il);

-               cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                   Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                           model.layers[il].attn_k_norm,
+                           model.layers[il].attn_k_norm_b,
+                           LLM_NORM, cb, il);
+                   cb(Kcur, "Kcur", il);
+
+                   Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                   Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                   cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                       Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                       Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+               } else {
+                   Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                   cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                       model.layers[il].wo, model.layers[il].bo,
+                       Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+               }
            }

            if (il == n_layer - 1) {