@@ -676,22 +676,21 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab   = 32000;
-    uint32_t n_ctx     = 512;
-    uint32_t n_embd    = 4096;
-    uint32_t n_head    = 32;
-    uint32_t n_head_kv = 32;
-    uint32_t n_layer   = 32;
-    uint32_t n_rot     = 64;
-    uint32_t n_ff      = 11008;
+    uint32_t n_vocab     = 32000;
+    uint32_t n_ctx_train = 2048; // the context size used during training
+    uint32_t n_ctx       = 512;  // the context size used during inference
+    uint32_t n_embd      = 4096;
+    uint32_t n_head      = 32;
+    uint32_t n_head_kv   = 32;
+    uint32_t n_layer     = 32;
+    uint32_t n_rot       = 64;
+    uint32_t n_ff        = 11008;
 
     float f_norm_rms_eps = 1e-5;
 
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;
 
-    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
-
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
     }
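Not part of this commit, but it illustrates why the split matters: once both fields exist, a loader can warn when the requested inference context exceeds the trained one. A minimal hypothetical helper (the function name and warning text are made up; LLAMA_LOG_WARN is assumed to be available in llama.cpp's logging macros):

// hypothetical sketch, not from this diff
static void check_ctx_size(const llama_hparams & hparams) {
    if (hparams.n_ctx > hparams.n_ctx_train) {
        // beyond the trained context, generation quality typically degrades
        LLAMA_LOG_WARN("requested n_ctx (%u) is larger than n_ctx_train (%u)\n",
                       hparams.n_ctx, hparams.n_ctx_train);
    }
}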
@@ -1325,7 +1324,7 @@ static void llama_model_load_internal(
     }
 
     GGUF_GET(hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, "tokenizer.ggml.tokens");
-    GGUF_GET(hparams.n_ctx,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
+    GGUF_GET(hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
     GGUF_GET(hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
     GGUF_GET(hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
     GGUF_GET(hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
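For reference, the same metadata key can also be read with the plain gguf C API rather than the GGUF_GET macro. This is a standalone sketch, not code from this diff, and it assumes the gguf_* functions exposed by ggml at the time (gguf_init_from_file, gguf_find_key, gguf_get_val_u32, gguf_free):

// standalone sketch: read llama.context_length directly from a GGUF file
#include "ggml.h"

static uint32_t read_n_ctx_train(const char * fname) {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    uint32_t n_ctx_train = 0;
    const int kid = gguf_find_key(ctx, "llama.context_length");
    if (kid >= 0) {
        n_ctx_train = gguf_get_val_u32(ctx, kid);
    }
    gguf_free(ctx);
    return n_ctx_train;
}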
@@ -1399,21 +1398,23 @@ static void llama_model_load_internal(
     }
 
     {
-        LLAMA_LOG_INFO("%s: format      = %s\n",   __func__, llama_file_version_name(ml->file_version));
-        LLAMA_LOG_INFO("%s: n_vocab     = %u\n",   __func__, hparams.n_vocab);
-        LLAMA_LOG_INFO("%s: n_ctx       = %u\n",   __func__, hparams.n_ctx);
-        LLAMA_LOG_INFO("%s: n_embd      = %u\n",   __func__, hparams.n_embd);
-        LLAMA_LOG_INFO("%s: n_head      = %u\n",   __func__, hparams.n_head);
-        LLAMA_LOG_INFO("%s: n_head_kv   = %u\n",   __func__, hparams.n_head_kv);
-        LLAMA_LOG_INFO("%s: n_layer     = %u\n",   __func__, hparams.n_layer);
-        LLAMA_LOG_INFO("%s: n_rot       = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-        LLAMA_LOG_INFO("%s: n_gqa       = %u\n",   __func__, hparams.n_gqa());
-        LLAMA_LOG_INFO("%s: f_norm_eps  = %.1e\n", __func__, hparams.f_norm_rms_eps);
-        LLAMA_LOG_INFO("%s: n_ff        = %u\n",   __func__, hparams.n_ff);
-        LLAMA_LOG_INFO("%s: freq_base   = %.1f\n", __func__, hparams.rope_freq_base);
-        LLAMA_LOG_INFO("%s: freq_scale  = %g\n",   __func__, hparams.rope_freq_scale);
-        LLAMA_LOG_INFO("%s: ftype       = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        LLAMA_LOG_INFO("%s: model size  = %s\n",   __func__, llama_model_type_name(model.type));
+        LLAMA_LOG_INFO("%s: format      = %s\n",   __func__, llama_file_version_name(ml->file_version));
+        LLAMA_LOG_INFO("%s: n_vocab     = %u\n",   __func__, hparams.n_vocab);
+        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n",   __func__, hparams.n_ctx_train);
+        LLAMA_LOG_INFO("%s: n_ctx       = %u\n",   __func__, hparams.n_ctx);
+        LLAMA_LOG_INFO("%s: n_embd      = %u\n",   __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_head      = %u\n",   __func__, hparams.n_head);
+        LLAMA_LOG_INFO("%s: n_head_kv   = %u\n",   __func__, hparams.n_head_kv);
+        LLAMA_LOG_INFO("%s: n_layer     = %u\n",   __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_rot       = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        LLAMA_LOG_INFO("%s: n_gqa       = %u\n",   __func__, hparams.n_gqa());
+        LLAMA_LOG_INFO("%s: f_norm_eps  = %.1e\n", __func__, hparams.f_norm_rms_eps);
+        LLAMA_LOG_INFO("%s: n_ff        = %u\n",   __func__, hparams.n_ff);
+        LLAMA_LOG_INFO("%s: freq_base   = %.1f\n", __func__, hparams.rope_freq_base);
+        LLAMA_LOG_INFO("%s: freq_scale  = %g\n",   __func__, hparams.rope_freq_scale);
+        LLAMA_LOG_INFO("%s: model size  = %s\n",   __func__, llama_model_type_name(model.type));
+
+        // TODO: print number of tensors for each quantization
     }
 
     if (vocab_only) {
@@ -3365,7 +3366,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
     llama_ftype ftype = params->ftype;
-    int nthread = params->nthread;
 
     switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
@@ -3391,6 +3391,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
+    int nthread = params->nthread;
+
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
     }
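The relocated nthread block is the usual fallback pattern for a user-supplied thread count; a self-contained sketch using only standard C++ (the helper name resolve_nthread is made up for illustration):

#include <thread>

// a non-positive thread count falls back to the number of hardware threads
static int resolve_nthread(int nthread) {
    if (nthread <= 0) {
        nthread = (int) std::thread::hardware_concurrency();
        if (nthread == 0) {
            nthread = 1; // hardware_concurrency() may return 0 when unknown
        }
    }
    return nthread;
}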
@@ -3661,6 +3663,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
+// TODO: after the GGUF PR, this likely won't work and needs to be updated
 int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);