@@ -189,3 +189,225 @@ struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, co
 
     return it->second;
 }
+
+size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
+}
+
+//
+// interface implementation
+//
+
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
+        /*.devices                     =*/ nullptr,
+        /*.n_gpu_layers                =*/ 0,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu                    =*/ 0,
+        /*.tensor_split                =*/ nullptr,
+        /*.rpc_servers                 =*/ nullptr,
+        /*.progress_callback           =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
+        /*.vocab_only                  =*/ false,
+        /*.use_mmap                    =*/ true,
+        /*.use_mlock                   =*/ false,
+        /*.check_tensors               =*/ false,
+    };
+
+#ifdef GGML_USE_METAL
+    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
+    result.n_gpu_layers = 999;
+#endif
+
+    return result;
+}
+
+void llama_free_model(struct llama_model * model) {
+    delete model;
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
+    return model->vocab.type;
+}
+
+int32_t llama_n_vocab(const struct llama_model * model) {
+    return model->hparams.n_vocab;
+}
+
+int32_t llama_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
+int32_t llama_n_embd(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
+int32_t llama_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
+int32_t llama_n_head(const struct llama_model * model) {
+    return model->hparams.n_head();
+}
+
+enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+    switch (model->arch) {
+        // these models do not use RoPE
+        case LLM_ARCH_GPT2:
+        case LLM_ARCH_GPTJ:
+        case LLM_ARCH_MPT:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
+        case LLM_ARCH_JAIS:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            return LLAMA_ROPE_TYPE_NONE;
+
+        // use what we call a normal RoPE, operating on pairs of consecutive head values
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
+        case LLM_ARCH_BAICHUAN:
+        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_ORION:
+        case LLM_ARCH_INTERNLM2:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
+        case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
+        case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
+            return LLAMA_ROPE_TYPE_NORM;
+
+        // the pairs of head values are offset by n_rot/2
+        case LLM_ARCH_FALCON:
+        case LLM_ARCH_GROK:
+        case LLM_ARCH_DBRX:
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_BITNET:
+        case LLM_ARCH_QWEN:
+        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO2:
+        case LLM_ARCH_OLMOE:
+        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
+        case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_OPENELM:
+        case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_NEMOTRON:
+        case LLM_ARCH_EXAONE:
+        case LLM_ARCH_MINICPM3:
+            return LLAMA_ROPE_TYPE_NEOX;
+
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
+        // all model arches should be listed explicitly here
+        case LLM_ARCH_UNKNOWN:
+            GGML_ABORT("unknown architecture");
+    }
+
+    return LLAMA_ROPE_TYPE_NONE;
+}
+
+float llama_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s %s %s",
+            llama_model_arch_name(*model).c_str(),
+            llama_model_type_name(*model).c_str(),
+            llama_model_ftype_name(*model).c_str());
+}
+
+uint64_t llama_model_size(const struct llama_model * model) {
+    return model->n_bytes;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    return model->n_elements;
+}
+
+bool llama_model_has_encoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5:        return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default:                 return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default:                 return true;
+    }
+}
+
+llama_token llama_model_decoder_start_token(const struct llama_model * model) {
+    return model->hparams.dec_start_token_id;
+}
+
+bool llama_model_is_recurrent(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_MAMBA: return true;
+        case LLM_ARCH_RWKV6: return true;
+        default:             return false;
+    }
+}
+
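For reference, below is a minimal sketch (not part of this commit) of how a caller might exercise the accessors added above: llama_model_default_params, llama_model_desc, llama_model_n_params, llama_model_size, and the llama_model_meta_* functions. It assumes the public llama.h header and the existing llama_backend_init / llama_load_model_from_file / llama_free_model entry points; the "model.gguf" path is a placeholder.

// Usage sketch (assumption: built against the same llama.cpp tree as this commit).
// Prints a model summary and dumps all GGUF key/value metadata pairs.
#include "llama.h"

#include <cstdint>
#include <cstdio>

int main() {
    llama_backend_init();

    // start from the defaults defined in llama_model_default_params()
    struct llama_model_params mparams = llama_model_default_params();

    // "model.gguf" is a placeholder path for this sketch
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    char buf[256];
    llama_model_desc(model, buf, sizeof(buf));
    printf("desc   : %s\n", buf);
    printf("params : %llu\n", (unsigned long long) llama_model_n_params(model));
    printf("size   : %llu bytes\n", (unsigned long long) llama_model_size(model));

    // enumerate all GGUF metadata key/value pairs by index
    const int32_t n_meta = llama_model_meta_count(model);
    for (int32_t i = 0; i < n_meta; ++i) {
        char key[128];
        char val[256];
        llama_model_meta_key_by_index(model, i, key, sizeof(key));
        llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        printf("%s = %s\n", key, val);
    }

    llama_free_model(model);
    llama_backend_free();

    return 0;
}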