@@ -186,6 +186,7 @@ struct llama_hparams {
     // LLaMAv2
     // TODO: load from model data hparams
     float f_ffn_mult = 1.0f;
+    float f_rms_norm_eps = 1e-6f;
 
     float rope_freq_base  = 10000.0f;
     float rope_freq_scale = 1.0f;
@@ -869,6 +870,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_ctx                        =*/ 512,
         /*.n_batch                      =*/ 512,
         /*.n_gqa                        =*/ 1,
+        /*.rms_norm_eps                 =*/ 1e-6f,
         /*.gpu_layers                   =*/ 0,
         /*.main_gpu                     =*/ 0,
         /*.tensor_split                 =*/ nullptr,
@@ -1000,6 +1002,7 @@ static void llama_model_load_internal(
         int n_ctx,
         int n_batch,
         int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1024,6 +1027,9 @@ static void llama_model_load_internal(
 
     auto & hparams = model.hparams;
 
+    // TODO: read from file
+    hparams.f_rms_norm_eps = rms_norm_eps;
+
     {
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
@@ -1072,6 +1078,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
         fprintf(stderr, "%s: n_gqa      = %u\n",  __func__, hparams.n_gqa());
+        fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
         fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
         fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
         fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
@@ -1330,6 +1337,7 @@ static bool llama_model_load(
         int n_ctx,
         int n_batch,
         int n_gqa,
+        float rms_norm_eps,
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
@@ -1343,7 +1351,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1401,9 +1409,7 @@ static bool llama_eval_internal(
 
     const float freq_base  = hparams.rope_freq_base;
     const float freq_scale = hparams.rope_freq_scale;
-
-    // TODO: read from hparams
-    const float rms_norm_eps = 1e-6f;
+    const float rms_norm_eps = hparams.f_rms_norm_eps;
 
     const int n_gpu_layers = model.n_gpu_layers;
 
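For context, the epsilon threaded through above is the stabilizer inside RMS normalization: each activation vector is divided by the root mean square of its elements, and eps is added under the square root so the division stays finite when activations are near zero. The following is only a plain reference sketch of that formula, not the ggml kernel that llama_eval_internal actually calls; the name rms_norm_ref and its vector-based signature are made up for illustration.

#include <cmath>
#include <cstddef>
#include <vector>

// Reference RMS norm: y[i] = weight[i] * x[i] / sqrt(mean(x^2) + eps).
// 'eps' plays the same role as hparams.f_rms_norm_eps in the change above.
static void rms_norm_ref(const std::vector<float> & x,
                         const std::vector<float> & weight,
                         std::vector<float> & y,
                         float eps) {
    double sum_sq = 0.0;
    for (float v : x) {
        sum_sq += (double) v * v;            // accumulate x^2 in double for accuracy
    }
    const float mean_sq = (float) (sum_sq / x.size());
    const float scale   = 1.0f / std::sqrt(mean_sq + eps);
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = weight[i] * x[i] * scale;     // normalize, then apply the learned gain
    }
}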
@@ -3088,7 +3094,7 @@ struct llama_model * llama_load_model_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers,
+    if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
                 params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale, params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
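With the plumbing above in place, callers can override the epsilon through the new llama_context_params field instead of relying on the previously hard-coded 1e-6f. Below is a minimal usage sketch, assuming the public llama.h API as it stands in this tree (llama_load_model_from_file and llama_new_context_with_model); the model path and the 5e-6f value are illustrative placeholders, not values prescribed by this change.

#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    params.rms_norm_eps = 5e-6f;  // example override of the 1e-6f default introduced above

    // placeholder model path for illustration
    struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (model == NULL) {
        return 1;
    }

    struct llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... evaluate tokens as usual; every RMS-norm op now uses the eps set above ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}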