
Commit 48ead50

llama : model
ggml-ci
1 parent 5de3687 commit 48ead50

3 files changed: +129 −106 lines changed

src/llama-model.cpp

Lines changed: 82 additions & 2 deletions
@@ -2,7 +2,74 @@
 
 #include "llama-impl.h"
 
-std::string llama_model_ftype_name(llama_ftype ftype) {
+const char * llm_type_name(llm_type type) {
+    switch (type) {
+        case MODEL_14M: return "14M";
+        case MODEL_17M: return "17M";
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_60M: return "60M";
+        case MODEL_70M: return "70M";
+        case MODEL_80M: return "80M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_160M: return "160M";
+        case MODEL_220M: return "220M";
+        case MODEL_250M: return "250M";
+        case MODEL_270M: return "270M";
+        case MODEL_335M: return "335M";
+        case MODEL_410M: return "410M";
+        case MODEL_450M: return "450M";
+        case MODEL_770M: return "770M";
+        case MODEL_780M: return "780M";
+        case MODEL_0_5B: return "0.5B";
+        case MODEL_1B: return "1B";
+        case MODEL_1_3B: return "1.3B";
+        case MODEL_1_4B: return "1.4B";
+        case MODEL_1_5B: return "1.5B";
+        case MODEL_1_6B: return "1.6B";
+        case MODEL_2B: return "2B";
+        case MODEL_2_8B: return "2.8B";
+        case MODEL_3B: return "3B";
+        case MODEL_4B: return "4B";
+        case MODEL_6B: return "6B";
+        case MODEL_6_9B: return "6.9B";
+        case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
+        case MODEL_9B: return "9B";
+        case MODEL_11B: return "11B";
+        case MODEL_12B: return "12B";
+        case MODEL_13B: return "13B";
+        case MODEL_14B: return "14B";
+        case MODEL_15B: return "15B";
+        case MODEL_16B: return "16B";
+        case MODEL_20B: return "20B";
+        case MODEL_30B: return "30B";
+        case MODEL_32B: return "32B";
+        case MODEL_34B: return "34B";
+        case MODEL_35B: return "35B";
+        case MODEL_40B: return "40B";
+        case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
+        case MODEL_236B: return "236B";
+        case MODEL_314B: return "314B";
+        case MODEL_SMALL: return "0.1B";
+        case MODEL_MEDIUM: return "0.4B";
+        case MODEL_LARGE: return "0.8B";
+        case MODEL_XL: return "1.5B";
+        case MODEL_A1_7B: return "A1.7B";
+        case MODEL_A2_7B: return "A2.7B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
+        case MODEL_16x12B: return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B: return "57B.A14B";
+        case MODEL_27B: return "27B";
+        default: return "?B";
+    }
+}
+
+static std::string llama_model_ftype_name(llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -45,6 +112,18 @@ std::string llama_model_ftype_name(llama_ftype ftype) {
     }
 }
 
+std::string llama_model_arch_name (const llama_model & model) {
+    return llm_arch_name(model.arch);
+}
+
+std::string llama_model_type_name (const llama_model & model) {
+    return llm_type_name(model.type);
+}
+
+std::string llama_model_ftype_name(const llama_model & model) {
+    return llama_model_ftype_name(model.ftype);
+}
+
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -83,7 +162,8 @@ static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & b
 }
 
 ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
-    return select_buft(*model.dev_layer.at(il).buft_list,
+    return select_buft(
+        *model.dev_layer.at(il).buft_list,
         [&](ggml_context * ctx) {
             ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
             ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
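
For orientation, a minimal sketch of calling the helpers that now live in llama-model.cpp; the surrounding function print_model_meta is hypothetical, only the three llama_model_*_name overloads come from this commit:

    // Hypothetical caller (not part of the commit). llm_type_name() maps the
    // size enum to a display string; the llama_ftype overload of
    // llama_model_ftype_name() is now file-local (static), so external code
    // uses the llama_model-based overloads declared in llama-model.h.
    #include <cstdio>
    #include "llama-model.h"

    static void print_model_meta(const llama_model & model) {
        std::printf("arch:  %s\n", llama_model_arch_name (model).c_str());
        std::printf("type:  %s\n", llama_model_type_name (model).c_str());
        std::printf("ftype: %s\n", llama_model_ftype_name(model).c_str());
    }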

src/llama-model.h

Lines changed: 37 additions & 96 deletions
@@ -15,8 +15,9 @@
 #define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
 
-// available llama models
-enum e_model {
+// available models
+// TODO: this enum does not follow the enum naming convention
+enum llm_type {
     MODEL_UNKNOWN,
     MODEL_14M,
     MODEL_17M,
@@ -81,73 +82,6 @@ enum e_model {
     MODEL_27B,
 };
 
-static const char * llama_model_type_name(e_model type) {
-    switch (type) {
-        case MODEL_14M: return "14M";
-        case MODEL_17M: return "17M";
-        case MODEL_22M: return "22M";
-        case MODEL_33M: return "33M";
-        case MODEL_60M: return "60M";
-        case MODEL_70M: return "70M";
-        case MODEL_80M: return "80M";
-        case MODEL_109M: return "109M";
-        case MODEL_137M: return "137M";
-        case MODEL_160M: return "160M";
-        case MODEL_220M: return "220M";
-        case MODEL_250M: return "250M";
-        case MODEL_270M: return "270M";
-        case MODEL_335M: return "335M";
-        case MODEL_410M: return "410M";
-        case MODEL_450M: return "450M";
-        case MODEL_770M: return "770M";
-        case MODEL_780M: return "780M";
-        case MODEL_0_5B: return "0.5B";
-        case MODEL_1B: return "1B";
-        case MODEL_1_3B: return "1.3B";
-        case MODEL_1_4B: return "1.4B";
-        case MODEL_1_5B: return "1.5B";
-        case MODEL_1_6B: return "1.6B";
-        case MODEL_2B: return "2B";
-        case MODEL_2_8B: return "2.8B";
-        case MODEL_3B: return "3B";
-        case MODEL_4B: return "4B";
-        case MODEL_6B: return "6B";
-        case MODEL_6_9B: return "6.9B";
-        case MODEL_7B: return "7B";
-        case MODEL_8B: return "8B";
-        case MODEL_9B: return "9B";
-        case MODEL_11B: return "11B";
-        case MODEL_12B: return "12B";
-        case MODEL_13B: return "13B";
-        case MODEL_14B: return "14B";
-        case MODEL_15B: return "15B";
-        case MODEL_16B: return "16B";
-        case MODEL_20B: return "20B";
-        case MODEL_30B: return "30B";
-        case MODEL_32B: return "32B";
-        case MODEL_34B: return "34B";
-        case MODEL_35B: return "35B";
-        case MODEL_40B: return "40B";
-        case MODEL_65B: return "65B";
-        case MODEL_70B: return "70B";
-        case MODEL_236B: return "236B";
-        case MODEL_314B: return "314B";
-        case MODEL_SMALL: return "0.1B";
-        case MODEL_MEDIUM: return "0.4B";
-        case MODEL_LARGE: return "0.8B";
-        case MODEL_XL: return "1.5B";
-        case MODEL_A1_7B: return "A1.7B";
-        case MODEL_A2_7B: return "A2.7B";
-        case MODEL_8x7B: return "8x7B";
-        case MODEL_8x22B: return "8x22B";
-        case MODEL_16x12B: return "16x12B";
-        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
-        case MODEL_57B_A14B: return "57B.A14B";
-        case MODEL_27B: return "27B";
-        default: return "?B";
-    }
-}
-
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -187,27 +121,27 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
 
     uint32_t n_layer_dense_lead = 0;
-    uint32_t n_lora_q = 0;
-    uint32_t n_lora_kv = 0;
-    uint32_t n_ff_exp = 0;
-    uint32_t n_ff_shexp = 0;
-    uint32_t n_expert_shared = 0;
-    float expert_weights_scale = 0.0;
+    uint32_t n_lora_q           = 0;
+    uint32_t n_lora_kv          = 0;
+    uint32_t n_ff_exp           = 0;
+    uint32_t n_ff_shexp         = 0;
+    uint32_t n_expert_shared    = 0;
+    uint32_t n_norm_groups      = 0;
+
+    float    expert_weights_scale = 0.0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
     float f_norm_group_eps;
 
-    uint32_t n_norm_groups;
-
-    float f_attn_logit_softcapping = 50.0f;
+    float f_attn_logit_softcapping  = 50.0f;
     float f_final_logit_softcapping = 30.0f;
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
-    uint32_t time_mix_extra_dim = 0;
-    uint32_t time_decay_extra_dim = 0;
-    uint32_t wkv_head_size = 0;
+    uint32_t time_mix_extra_dim   = 0;
+    uint32_t time_decay_extra_dim = 0;
+    uint32_t wkv_head_size        = 0;
 
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
@@ -221,6 +155,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+
     bool ssm_dt_b_c_rms = false;
 
     float f_clamp_kqv = 0.0f;
@@ -518,34 +453,35 @@ struct llama_layer {
 };
 
 struct llama_model {
-    e_model type = MODEL_UNKNOWN;
-    llm_arch arch = LLM_ARCH_UNKNOWN;
+    llm_type type = MODEL_UNKNOWN;
+    llm_arch arch = LLM_ARCH_UNKNOWN;
+
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
 
     std::string name = "n/a";
 
     llama_hparams hparams = {};
     llama_vocab vocab;
 
-    struct ggml_tensor * tok_embd = nullptr;
-    struct ggml_tensor * type_embd = nullptr;
-    struct ggml_tensor * pos_embd = nullptr;
-    struct ggml_tensor * tok_norm = nullptr;
+    struct ggml_tensor * tok_embd   = nullptr;
+    struct ggml_tensor * type_embd  = nullptr;
+    struct ggml_tensor * pos_embd   = nullptr;
+    struct ggml_tensor * tok_norm   = nullptr;
     struct ggml_tensor * tok_norm_b = nullptr;
 
-    struct ggml_tensor * output_norm = nullptr;
-    struct ggml_tensor * output_norm_b = nullptr;
-    struct ggml_tensor * output = nullptr;
-    struct ggml_tensor * output_b = nullptr;
+    struct ggml_tensor * output_norm     = nullptr;
+    struct ggml_tensor * output_norm_b   = nullptr;
+    struct ggml_tensor * output          = nullptr;
+    struct ggml_tensor * output_b        = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;
 
     // classifier
-    struct ggml_tensor * cls = nullptr;
-    struct ggml_tensor * cls_b = nullptr;
+    struct ggml_tensor * cls       = nullptr;
+    struct ggml_tensor * cls_b     = nullptr;
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
-    struct ggml_tensor * conv1d = nullptr;
+    struct ggml_tensor * conv1d   = nullptr;
     struct ggml_tensor * conv1d_b = nullptr;
 
     std::vector<llama_layer> layers;
@@ -611,6 +547,11 @@ struct llama_model {
     }
 };
 
-ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
+const char * llm_type_name(llm_type type);
 
-std::string llama_model_ftype_name(llama_ftype ftype);
+std::string llama_model_arch_name (const llama_model & model);
+std::string llama_model_type_name (const llama_model & model);
+std::string llama_model_ftype_name(const llama_model & model);
+
+// TODO: this probably belongs to llama-adapter
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
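
Since the MODEL_* enumerator names are unchanged, the e_model → llm_type rename only affects code that spells out the type name. A sketch of the migration, assuming the temporary alias that llama.cpp adds in llm_load_hparams() (illustrative, not from the diff):

    // Illustrative only: enumerators keep their MODEL_* names, so call sites
    // just swap the enum's type name.
    using e_model = llm_type; // TMP alias, mirrors the one in llm_load_hparams()

    llm_type type = MODEL_7B;              // was: e_model type = MODEL_7B;
    const char * s = llm_type_name(type);  // yields "7B"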

src/llama.cpp

Lines changed: 10 additions & 8 deletions
@@ -1494,6 +1494,8 @@ static void llm_load_hparams(
         hparams.n_embd_head_v = 0;
     }
 
+    using e_model = llm_type; // TMP
+
     // arch-specific KVs
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
@@ -2986,8 +2988,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
     }
 
-    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model).c_str());
+    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model).c_str());
     if (ml.n_elements >= 1e12) {
         LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
     } else if (ml.n_elements >= 1e9) {
@@ -10016,9 +10018,9 @@ struct llm_build_context {
 
                 // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
                 switch (model.type) {
-                    case e_model::MODEL_2B:
-                    case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                    case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+                    case llm_type::MODEL_2B:
+                    case llm_type::MODEL_9B:  Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));   break;
+                    case llm_type::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
                     default: GGML_ABORT("fatal error");
                 };
                 cb(Qcur, "Qcur_scaled", il);
@@ -16264,9 +16266,9 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 
 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            llm_arch_name(model->arch),                    // TODO: llama_model_arch_name(model)
-            llama_model_type_name(model->type),            // TODO: llama_model_type_name(model)
-            llama_model_ftype_name(model->ftype).c_str()); // TODO: llama_model_ftype_name(model)
+            llama_model_arch_name (*model).c_str(),
+            llama_model_type_name (*model).c_str(),
+            llama_model_ftype_name(*model).c_str());
 }
 
 uint64_t llama_model_size(const struct llama_model * model) {
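
llama_model_desc() keeps its public signature; only its internals now route through the llama_model-based helpers. A hedged usage sketch (the buffer size and the sample output are illustrative, not from the commit):

    // Assumes an already-loaded `const llama_model * model`.
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));
    printf("%s\n", desc); // e.g. "llama 7B Q4_K - Medium" (illustrative)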
