Commit 747d17a

Author: Joan Martinez (committed)

feat: create tensors for Jina architecture

1 parent 86a5d96 · commit 747d17a

File tree: 4 files changed (+61, -38 lines changed)

  convert-hf-to-gguf.py
  gguf-py/gguf/constants.py
  gguf-py/gguf/tensor_mapping.py
  llama.cpp

convert-hf-to-gguf.py

Lines changed: 4 additions & 30 deletions
@@ -77,6 +77,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         for part_name in self.part_names:
             print(f"gguf: loading model part '{part_name}'")
             ctx: ContextManager[Any]
+
             if self.is_safetensors:
                 from safetensors import safe_open
                 ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
@@ -91,6 +92,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_name(self.dir_model.name)
+        print(f'self.block_count {self.block_count}')
         self.gguf_writer.add_block_count(self.block_count)
 
         if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -136,6 +138,7 @@ def set_gguf_parameters(self):
     def write_tensors(self):
         block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
         tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        print(f'Block_count {block_count} with tensor_map {tensor_map}')
         for name, data_torch in self.get_tensors():
             # we don't need these
             if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
@@ -2096,6 +2099,7 @@ def write_tensors(self):
 
             # map tensor names
             new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+
             if new_name is None:
                 print(f"Can not map tensor {name!r}")
                 sys.exit()
@@ -2166,34 +2170,6 @@ def get_tensors(self):
 class JinaBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        print(f'hparams {self.hparams}')
-
-        assert self.hparams["position_embedding_type"] == "alibi"
-
-    # def __init__(self, *args, **kwargs):
-    #     super().__init__(*args, **kwargs)
-    #
-    #     assert self.hparams["position_embedding_type"] == "alibi"
-    #
-    #     # GeGLU activation
-    #     assert self.hparams["feed_forward_type"] == "geglu"
-    #
-    # def get_tensors(self):
-    #     assert self.vocab_size is not None
-    #     for name, data in super().get_tensors():
-    #         print(f'get_tensors: {name} {data.shape}')
-    #         # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-    #         if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-    #             rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-    #             assert data.shape == (rounded_vocab_size, self.hparams["hidden_size"])
-    #             data = data[:self.vocab_size, :]
-    #             yield name, data
-
-
-
 @Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA
@@ -2461,9 +2437,7 @@ def main() -> None:
     print(f"Loading model: {dir_model.name}")
 
     hparams = Model.load_hparams(dir_model)
-
     with torch.inference_mode():
-        print(hparams["architectures"])
         model_class = Model.from_model_architecture(hparams["architectures"][0])
         model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
 
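Note: the hunks above mostly add temporary debug prints around the name-mapping step in write_tensors() and drop JinaBertModel's __init__ experiments. For context, a minimal sketch of that mapping step (not code from this commit; the block count and tensor names below are illustrative only), using gguf.get_tensor_name_map() and get_name() the way the converter does:

    import gguf

    # Illustrative values only: the converter reads the block count from hparams
    # ("n_layers" / "num_hidden_layers" / "n_layer").
    block_count = 4
    tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JINA_BERT, block_count)

    for name in ("encoder.layer.0.mlp.gated_layers.weight", "some.unknown.tensor.weight"):
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print(f"Can not map tensor {name!r}")  # write_tensors() would sys.exit() here
        else:
            print(f"{name} -> {new_name}")         # e.g. blk.0.ffn_gate.weight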

gguf-py/gguf/constants.py

Lines changed: 2 additions & 2 deletions
@@ -363,14 +363,14 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.TOKEN_TYPES,
-        MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
         MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
     MODEL_ARCH.MPT: [
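Note: the hunk drops OUTPUT_NORM from this architecture's tensor list and adds FFN_GATE after FFN_DOWN. A quick cross-check sketch, assuming the edited entry is the JINA_BERT one (the arch key sits outside the hunk context) and that the MODEL_TENSORS / TENSOR_NAMES tables defined in this file are used as shown:

    from gguf.constants import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES

    # List the GGUF tensor names one JINA_BERT block is expected to provide
    # after this change (bid=0 is just for illustration).
    for tensor in MODEL_TENSORS[MODEL_ARCH.JINA_BERT]:
        print(TENSOR_NAMES[tensor].format(bid=0))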

gguf-py/gguf/tensor_mapping.py

Lines changed: 3 additions & 3 deletions
@@ -217,9 +217,6 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",                 # llama-hf refact
             "layers.{bid}.feed_forward.w3",                   # llama-pth
             "encoder.layer.{bid}.intermediate.dense",         # bert
-            "encoder.layer.{bid}.mlp.gated_layers",           # jina-bert
-            "encoder.layer.{bid}.mlp.layernorm",              # jina-bert
-            "encoder.layer.{bid}.mlp.wo",                     # jina-bert
             "transformer.h.{bid}.mlp.fc_in",                  # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h",           # persimmon
@@ -251,6 +248,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",        # plamo
             "model.layers.{bid}.feed_forward.w1",             # internlm2
             "encoder.layers.{bid}.mlp.fc12",                  # nomic-bert
+            "encoder.layer.{bid}.mlp.gated_layers",           # jina-bert
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -278,6 +276,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w2",             # internlm2
             "encoder.layers.{bid}.mlp.fc2",                   # nomic-bert
             "model.layers.{bid}.mlp.c_proj",                  # starcoder2
+            "encoder.layer.{bid}.mlp.wo",                     # jina-bert
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -307,6 +306,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm",           # bert
             "encoder.layers.{bid}.norm2",                     # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",     # Grok
+            "encoder.layer.{bid}.mlp.layernorm",              # jina-bert
         ),
 
         MODEL_TENSOR.SSM_IN: (
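Note: the net effect is that the fused jina-bert MLP tensors no longer all sit in the FFN_UP bucket: mlp.gated_layers now maps to FFN_GATE, mlp.wo to FFN_DOWN, and mlp.layernorm to LAYER_OUT_NORM. A small sketch to verify the new buckets (TensorNameMap usage assumed from gguf-py, not part of this commit):

    import gguf

    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.JINA_BERT, 1)

    for hf_name in ("encoder.layer.0.mlp.gated_layers",
                    "encoder.layer.0.mlp.wo",
                    "encoder.layer.0.mlp.layernorm"):
        bucket = tmap.get_type(hf_name + ".weight", try_suffixes=(".weight", ".bias"))
        print(f"{hf_name} -> {bucket}")  # expect FFN_GATE, FFN_DOWN, LAYER_OUT_NORM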

llama.cpp

Lines changed: 52 additions & 3 deletions
@@ -680,6 +680,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
@@ -1921,6 +1922,16 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    //glu mlp (jina-bert)
+    struct ggml_tensor * mlp_gated_layer_w;
+
+    struct ggml_tensor * mlp_wo_w;
+    struct ggml_tensor * mlp_wo_b;
+
+    struct ggml_tensor * mlp_norm_w;
+    struct ggml_tensor * mlp_norm_b;
+
 };
 
 struct llama_kv_cell {
@@ -4813,7 +4824,6 @@ static bool llm_load_tensors(
                     }
                 } break;
             case LLM_ARCH_BERT:
-            case LLM_ARCH_JINA_BERT:
             case LLM_ARCH_NOMIC_BERT:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -4831,7 +4841,7 @@ static bool llm_load_tensors(
 
                         auto & layer = model.layers[i];
 
-                        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
+                        if (model.arch == LLM_ARCH_BERT) {
                             layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                             layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
 
@@ -4852,7 +4862,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
 
-                        if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
+                        if (model.arch == LLM_ARCH_BERT) {
                             layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
                             layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 
@@ -4865,6 +4875,44 @@ static bool llm_load_tensors(
                             layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                         }
                 } break;
+            case LLM_ARCH_JINA_BERT:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
+                    model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+                    model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+                    model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias? Not sure needed
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i]; // JinaBertLayer
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
+                        layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
+
+                        layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
+                        layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+
+                        // TODO: HANDLE ALL THE MLP
+                        layer.mlp_gated_layer_w = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, 2 * n_ff});
+
+                        layer.mlp_wo_w = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.mlp_wo_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+                        layer.mlp_norm_w = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                        layer.mlp_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+                    }
+                } break;
             case LLM_ARCH_BLOOM:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -9713,6 +9761,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_refact();
             } break;
         case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT:
        case LLM_ARCH_NOMIC_BERT:
            {
                result = llm.build_bert();
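Note: this commit only creates and loads the JINA_BERT tensors; graph construction still routes LLM_ARCH_JINA_BERT through build_bert(). For reference, a NumPy sketch of the GeGLU feed-forward these per-layer tensors appear to describe. The fused gate/up layout of mlp_gated_layer_w ({n_embd, 2 * n_ff}), the split order, the GELU variant, and the residual/LayerNorm placement are assumptions, not something implemented here:

    import numpy as np

    def gelu(x):
        # tanh approximation of GELU
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

    def jina_geglu_ffn(x, w_gated, wo_w, wo_b, norm_w, norm_b, eps=1e-12):
        # x: (n_tokens, n_embd); w_gated: (n_embd, 2*n_ff); wo_w: (n_ff, n_embd).
        # Shapes are in row-major math convention, not ggml's reversed dims.
        h = x @ w_gated                       # fused projection (mlp_gated_layer_w)
        gate, up = np.split(h, 2, axis=-1)    # assumed split order of the fused weight
        y = (gelu(gate) * up) @ wo_w + wo_b   # mlp_wo_w / mlp_wo_b
        y = y + x                             # residual (assumed placement)
        mean = y.mean(axis=-1, keepdims=True)
        var = y.var(axis=-1, keepdims=True)
        return norm_w * (y - mean) / np.sqrt(var + eps) + norm_b  # mlp_norm_w / mlp_norm_b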
