@@ -830,6 +830,7 @@ struct llama_hparams {
     uint32_t n_rot = 64;
     uint32_t n_ff  = 11008;
 
+    float f_norm_eps     = 1e-5;
     float f_norm_rms_eps = 1e-5;
 
     float rope_freq_base  = 10000.0f;
@@ -1557,6 +1558,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
             } break;
         default: (void)0;
     };
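Note: hparams now carries two separate epsilons. The new f_norm_eps feeds the standard LayerNorm used by Falcon, while the pre-existing f_norm_rms_eps feeds the RMSNorm used by LLaMA-style models; in the graph both are followed by the learned scale (and, for LayerNorm, bias) via the ggml_add(ggml_mul(...)) calls further down. As a rough plain-C sketch of where each epsilon enters (illustrative only, not the ggml kernels themselves):

    #include <math.h>
    #include <stddef.h>

    // LayerNorm: subtract the mean, divide by sqrt(variance + eps).
    // This is where hparams.f_norm_eps ends up for Falcon.
    static void layer_norm_sketch(float * x, size_t n, float eps) {
        float mean = 0.0f, var = 0.0f;
        for (size_t i = 0; i < n; i++) mean += x[i];
        mean /= n;
        for (size_t i = 0; i < n; i++) var += (x[i] - mean)*(x[i] - mean);
        var /= n;
        for (size_t i = 0; i < n; i++) x[i] = (x[i] - mean)/sqrtf(var + eps);
    }

    // RMSNorm: divide by sqrt(mean of squares + eps), no mean subtraction.
    // This is where hparams.f_norm_rms_eps ends up for LLaMA-style models.
    static void rms_norm_sketch(float * x, size_t n, float eps) {
        float ss = 0.0f;
        for (size_t i = 0; i < n; i++) ss += x[i]*x[i];
        ss /= n;
        for (size_t i = 0; i < n; i++) x[i] /= sqrtf(ss + eps);
    }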
@@ -1672,28 +1674,29 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     const auto & vocab = model.vocab;
 
     // hparams
-    LLAMA_LOG_INFO("%s: format       = %s\n",     __func__, llama_file_version_name(ml.fver));
-    LLAMA_LOG_INFO("%s: arch         = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
-    LLAMA_LOG_INFO("%s: vocab type   = %s\n",     __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
-    LLAMA_LOG_INFO("%s: n_vocab      = %u\n",     __func__, hparams.n_vocab);
-    LLAMA_LOG_INFO("%s: n_ctx_train  = %u\n",     __func__, hparams.n_ctx_train);
-    LLAMA_LOG_INFO("%s: n_ctx        = %u\n",     __func__, hparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_embd       = %u\n",     __func__, hparams.n_embd);
-    LLAMA_LOG_INFO("%s: n_head       = %u\n",     __func__, hparams.n_head);
-    LLAMA_LOG_INFO("%s: n_head_kv    = %u\n",     __func__, hparams.n_head_kv);
-    LLAMA_LOG_INFO("%s: n_layer      = %u\n",     __func__, hparams.n_layer);
-    LLAMA_LOG_INFO("%s: n_rot        = %u\n",     __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-    LLAMA_LOG_INFO("%s: n_gqa        = %u\n",     __func__, hparams.n_gqa());
-    LLAMA_LOG_INFO("%s: f_norm_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
-    LLAMA_LOG_INFO("%s: n_ff         = %u\n",     __func__, hparams.n_ff);
-    LLAMA_LOG_INFO("%s: freq_base    = %.1f\n",   __func__, hparams.rope_freq_base);
-    LLAMA_LOG_INFO("%s: freq_scale   = %g\n",     __func__, hparams.rope_freq_scale);
-    LLAMA_LOG_INFO("%s: model type   = %s\n",     __func__, llama_model_type_name(model.type));
-    LLAMA_LOG_INFO("%s: model ftype  = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model size   = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: format         = %s\n",     __func__, llama_file_version_name(ml.fver));
+    LLAMA_LOG_INFO("%s: arch           = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+    LLAMA_LOG_INFO("%s: vocab type     = %s\n",     __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+    LLAMA_LOG_INFO("%s: n_vocab        = %u\n",     __func__, hparams.n_vocab);
+    LLAMA_LOG_INFO("%s: n_ctx_train    = %u\n",     __func__, hparams.n_ctx_train);
+    LLAMA_LOG_INFO("%s: n_ctx          = %u\n",     __func__, hparams.n_ctx);
+    LLAMA_LOG_INFO("%s: n_embd         = %u\n",     __func__, hparams.n_embd);
+    LLAMA_LOG_INFO("%s: n_head         = %u\n",     __func__, hparams.n_head);
+    LLAMA_LOG_INFO("%s: n_head_kv      = %u\n",     __func__, hparams.n_head_kv);
+    LLAMA_LOG_INFO("%s: n_layer        = %u\n",     __func__, hparams.n_layer);
+    LLAMA_LOG_INFO("%s: n_rot          = %u\n",     __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+    LLAMA_LOG_INFO("%s: n_gqa          = %u\n",     __func__, hparams.n_gqa());
+    LLAMA_LOG_INFO("%s: f_norm_eps     = %.1e\n",   __func__, hparams.f_norm_eps);
+    LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n",   __func__, hparams.f_norm_rms_eps);
+    LLAMA_LOG_INFO("%s: n_ff           = %u\n",     __func__, hparams.n_ff);
+    LLAMA_LOG_INFO("%s: freq_base      = %.1f\n",   __func__, hparams.rope_freq_base);
+    LLAMA_LOG_INFO("%s: freq_scale     = %g\n",     __func__, hparams.rope_freq_scale);
+    LLAMA_LOG_INFO("%s: model type     = %s\n",     __func__, llama_model_type_name(model.type));
+    LLAMA_LOG_INFO("%s: model ftype    = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
+    LLAMA_LOG_INFO("%s: model size     = %.2f B\n", __func__, ml.n_elements*1e-9);
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name = %s\n",    __func__, model.name.c_str());
+    LLAMA_LOG_INFO("%s: general.name   = %s\n",  __func__, model.name.c_str());
 
     // special tokens
     if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
@@ -1899,8 +1902,7 @@ static void llm_load_tensors(
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
         // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*hparams.kv_size();
+        const size_t mem_required_state = scale*hparams.kv_size();
 
         LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
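Note: mem_required_state is the per-state KV-cache budget, taken from hparams.kv_size(). As a back-of-the-envelope sketch of what that figure looks like (the numbers below are illustrative assumptions for a 7B-class model, not values from this commit, and this is not the kv_size() implementation), the cache holds K and V for every layer and context position:

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
        // Illustrative 7B-class settings; the real values come from the GGUF hparams.
        const uint64_t n_layer    = 32;
        const uint64_t n_ctx      = 2048;
        const uint64_t n_embd_gqa = 4096;   // per-token width of the KV entries
        const uint64_t type_size  = 2;      // f16 cache entries

        // K and V each need n_layer * n_ctx * n_embd_gqa elements.
        const uint64_t kv_bytes = 2 * n_layer * n_ctx * n_embd_gqa * type_size;
        printf("approx KV cache: %.2f MB\n", kv_bytes / 1024.0 / 1024.0);   // ~1024 MB here
        return 0;
    }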
@@ -2383,6 +2385,10 @@ static struct ggml_cgraph * llm_build_falcon(
 
     GGML_ASSERT(n_embd_head == hparams.n_rot);
 
+    const float freq_base  = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+    const float norm_eps   = hparams.f_norm_eps;
+
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
@@ -2436,7 +2442,7 @@ static struct ggml_cgraph * llm_build_falcon(
 
         // self-attention
         {
-            attn_norm = ggml_norm(ctx0, inpL);
+            attn_norm = ggml_norm(ctx0, inpL, norm_eps);
 
             attn_norm = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -2445,7 +2451,7 @@ static struct ggml_cgraph * llm_build_falcon(
                     ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm));
 
             if (model.layers[il].attn_norm_2) { // Falcon-40B
-                cur = ggml_norm(ctx0, inpL);
+                cur = ggml_norm(ctx0, inpL, norm_eps);
 
                 cur = ggml_add(ctx0,
                         ggml_mul(ctx0,
@@ -2490,8 +2496,8 @@ static struct ggml_cgraph * llm_build_falcon(
                         wsize * n_embd_head * (n_head + n_head_kv));
 
             // using mode = 2 for neox mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_embd_head, 2, 0);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_embd_head, 2, 0);
+            Qcur = ggml_rope_custom_inplace(ctx0, Qcur, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+            Kcur = ggml_rope_custom_inplace(ctx0, Kcur, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
 
             // store key and value to memory
             {
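Note: switching to ggml_rope_custom_inplace lets the NeoX-style rotation use the freq_base / freq_scale read from hparams instead of the fixed RoPE defaults (10000.0 and 1.0). Roughly, each dimension pair i of a head at position pos is rotated by theta_i = freq_scale * pos * freq_base^(-2i/d). A standalone sketch of that angle computation (not ggml's implementation; NeoX mode also pairs elements i and i + d/2 rather than adjacent ones, which this sketch leaves to the caller):

    #include <math.h>

    // Rotate one (even, odd) dimension pair of a head at position `pos`.
    // freq_base / freq_scale correspond to the model hyperparameters above.
    static void rope_pair_sketch(float * x0, float * x1, int pos, int i, int n_dims,
                                 float freq_base, float freq_scale) {
        const float theta = freq_scale * pos * powf(freq_base, -2.0f*i/n_dims);
        const float c = cosf(theta), s = sinf(theta);
        const float a = *x0, b = *x1;
        *x0 = a*c - b*s;
        *x1 = a*s + b*c;
    }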
@@ -2522,8 +2528,6 @@ static struct ggml_cgraph * llm_build_falcon(
 
             // K * Q
 
-            // K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy));
-
             struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
@@ -2549,7 +2553,6 @@ static struct ggml_cgraph * llm_build_falcon(
                         n_embd_head, n_head_kv, n_past + N),
                     0, 2, 1, 3);
 
-            // V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy)));
             V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
 
             // KQV = transpose(V) * KQ_soft_max
@@ -2589,7 +2592,7 @@ static struct ggml_cgraph * llm_build_falcon(
 
     // norm
     {
-        cur = ggml_norm(ctx0, inpL);
+        cur = ggml_norm(ctx0, inpL, norm_eps);
 
         cur = ggml_add(ctx0,
                 ggml_mul(ctx0,