 #define LLAMA_MAX_LAYERS  512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

-// available llama models
-enum e_model {
+// available models
+// TODO: this enum does not follow the enum naming convention
+enum llm_type {
     MODEL_UNKNOWN,
     MODEL_14M,
     MODEL_17M,
@@ -81,73 +82,6 @@ enum e_model {
     MODEL_27B,
 };

-static const char * llama_model_type_name(e_model type) {
-    switch (type) {
-        case MODEL_14M:           return "14M";
-        case MODEL_17M:           return "17M";
-        case MODEL_22M:           return "22M";
-        case MODEL_33M:           return "33M";
-        case MODEL_60M:           return "60M";
-        case MODEL_70M:           return "70M";
-        case MODEL_80M:           return "80M";
-        case MODEL_109M:          return "109M";
-        case MODEL_137M:          return "137M";
-        case MODEL_160M:          return "160M";
-        case MODEL_220M:          return "220M";
-        case MODEL_250M:          return "250M";
-        case MODEL_270M:          return "270M";
-        case MODEL_335M:          return "335M";
-        case MODEL_410M:          return "410M";
-        case MODEL_450M:          return "450M";
-        case MODEL_770M:          return "770M";
-        case MODEL_780M:          return "780M";
-        case MODEL_0_5B:          return "0.5B";
-        case MODEL_1B:            return "1B";
-        case MODEL_1_3B:          return "1.3B";
-        case MODEL_1_4B:          return "1.4B";
-        case MODEL_1_5B:          return "1.5B";
-        case MODEL_1_6B:          return "1.6B";
-        case MODEL_2B:            return "2B";
-        case MODEL_2_8B:          return "2.8B";
-        case MODEL_3B:            return "3B";
-        case MODEL_4B:            return "4B";
-        case MODEL_6B:            return "6B";
-        case MODEL_6_9B:          return "6.9B";
-        case MODEL_7B:            return "7B";
-        case MODEL_8B:            return "8B";
-        case MODEL_9B:            return "9B";
-        case MODEL_11B:           return "11B";
-        case MODEL_12B:           return "12B";
-        case MODEL_13B:           return "13B";
-        case MODEL_14B:           return "14B";
-        case MODEL_15B:           return "15B";
-        case MODEL_16B:           return "16B";
-        case MODEL_20B:           return "20B";
-        case MODEL_30B:           return "30B";
-        case MODEL_32B:           return "32B";
-        case MODEL_34B:           return "34B";
-        case MODEL_35B:           return "35B";
-        case MODEL_40B:           return "40B";
-        case MODEL_65B:           return "65B";
-        case MODEL_70B:           return "70B";
-        case MODEL_236B:          return "236B";
-        case MODEL_314B:          return "314B";
-        case MODEL_SMALL:         return "0.1B";
-        case MODEL_MEDIUM:        return "0.4B";
-        case MODEL_LARGE:         return "0.8B";
-        case MODEL_XL:            return "1.5B";
-        case MODEL_A1_7B:         return "A1.7B";
-        case MODEL_A2_7B:         return "A2.7B";
-        case MODEL_8x7B:          return "8x7B";
-        case MODEL_8x22B:         return "8x22B";
-        case MODEL_16x12B:        return "16x12B";
-        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
-        case MODEL_57B_A14B:      return "57B.A14B";
-        case MODEL_27B:           return "27B";
-        default:                  return "?B";
-    }
-}
-
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -187,27 +121,27 @@ struct llama_hparams {
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

     uint32_t n_layer_dense_lead = 0;
-    uint32_t n_lora_q = 0;
-    uint32_t n_lora_kv = 0;
-    uint32_t n_ff_exp = 0;
-    uint32_t n_ff_shexp = 0;
-    uint32_t n_expert_shared = 0;
-    float expert_weights_scale = 0.0;
+    uint32_t n_lora_q           = 0;
+    uint32_t n_lora_kv          = 0;
+    uint32_t n_ff_exp           = 0;
+    uint32_t n_ff_shexp         = 0;
+    uint32_t n_expert_shared    = 0;
+    uint32_t n_norm_groups      = 0;
+
+    float    expert_weights_scale = 0.0;

     float f_norm_eps;
     float f_norm_rms_eps;
     float f_norm_group_eps;

-    uint32_t n_norm_groups;
-
-    float f_attn_logit_softcapping = 50.0f;
+    float f_attn_logit_softcapping  = 50.0f;
     float f_final_logit_softcapping = 30.0f;

     // for RWKV
     uint32_t rescale_every_n_layers = 0;
-    uint32_t time_mix_extra_dim = 0;
-    uint32_t time_decay_extra_dim = 0;
-    uint32_t wkv_head_size = 0;
+    uint32_t time_mix_extra_dim   = 0;
+    uint32_t time_decay_extra_dim = 0;
+    uint32_t wkv_head_size        = 0;

     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
@@ -221,6 +155,7 @@ struct llama_hparams {
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;
+
     bool ssm_dt_b_c_rms = false;

     float f_clamp_kqv = 0.0f;
@@ -518,34 +453,35 @@ struct llama_layer {
 };

 struct llama_model {
-    e_model type = MODEL_UNKNOWN;
-    llm_arch arch = LLM_ARCH_UNKNOWN;
+    llm_type type = MODEL_UNKNOWN;
+    llm_arch arch = LLM_ARCH_UNKNOWN;
+
     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

     std::string name = "n/a";

     llama_hparams hparams = {};
     llama_vocab   vocab;

-    struct ggml_tensor * tok_embd = nullptr;
-    struct ggml_tensor * type_embd = nullptr;
-    struct ggml_tensor * pos_embd = nullptr;
-    struct ggml_tensor * tok_norm = nullptr;
+    struct ggml_tensor * tok_embd   = nullptr;
+    struct ggml_tensor * type_embd  = nullptr;
+    struct ggml_tensor * pos_embd   = nullptr;
+    struct ggml_tensor * tok_norm   = nullptr;
     struct ggml_tensor * tok_norm_b = nullptr;

-    struct ggml_tensor * output_norm = nullptr;
-    struct ggml_tensor * output_norm_b = nullptr;
-    struct ggml_tensor * output = nullptr;
-    struct ggml_tensor * output_b = nullptr;
+    struct ggml_tensor * output_norm     = nullptr;
+    struct ggml_tensor * output_norm_b   = nullptr;
+    struct ggml_tensor * output          = nullptr;
+    struct ggml_tensor * output_b        = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;

     // classifier
-    struct ggml_tensor * cls = nullptr;
-    struct ggml_tensor * cls_b = nullptr;
+    struct ggml_tensor * cls       = nullptr;
+    struct ggml_tensor * cls_b     = nullptr;
     struct ggml_tensor * cls_out   = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;

-    struct ggml_tensor * conv1d = nullptr;
+    struct ggml_tensor * conv1d   = nullptr;
     struct ggml_tensor * conv1d_b = nullptr;

     std::vector<llama_layer> layers;
@@ -611,6 +547,11 @@ struct llama_model {
     }
 };

-ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
+const char * llm_type_name(llm_type type);

-std::string llama_model_ftype_name(llama_ftype ftype);
+std::string llama_model_arch_name (const llama_model & model);
+std::string llama_model_type_name (const llama_model & model);
+std::string llama_model_ftype_name(const llama_model & model);
+
+// TODO: this probably belongs to llama-adapter
+ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
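
For context, a minimal usage sketch (not part of the diff) of how the renamed helpers declared above might be called. It assumes the declarations live in a "llama-model.h" header as shown in this diff and that a loaded llama_model reference is obtained elsewhere; the exact strings printed depend on the model.

// Hypothetical sketch: print the identification strings exposed by the new API.
#include <cstdio>

#include "llama-model.h"   // assumed name of the header modified in this diff

static void print_model_info(const llama_model & model) {
    // llm_type_name() maps the llm_type enum to a short size label ("7B", "8x22B", ...).
    printf("type  : %s\n", llm_type_name(model.type));

    // The remaining helpers now take the whole model instead of a bare enum/ftype value.
    printf("arch  : %s\n", llama_model_arch_name (model).c_str());
    printf("size  : %s\n", llama_model_type_name (model).c_str());
    printf("ftype : %s\n", llama_model_ftype_name(model).c_str());
}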