Skip to content

Commit 039f1ce

Browse files
committed
train-text-from-scratch: rename ff tensors
This commit renames the feed-forward tensors w1, w2 and w3 to ffn_gate, ffn_down and ffn_up respectively. The motivation for this change is to make it easier to understand the purpose of the tensors. This also seems to be in line with the names used in the llama_layer struct in llama.cpp.

Signed-off-by: Daniel Bevenius <[email protected]>
1 parent dad9208 commit 039f1ce

File tree

1 file changed

+33
-33
lines changed

1 file changed

+33
-33
lines changed

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ struct my_llama_layer {
5151
struct ggml_tensor * ffn_norm;
5252

5353
// ff
54-
struct ggml_tensor * w1;
55-
struct ggml_tensor * w2;
56-
struct ggml_tensor * w3;
54+
struct ggml_tensor * ffn_gate; // w1
55+
struct ggml_tensor * ffn_down; // w2
56+
struct ggml_tensor * ffn_up; // w3
5757
};
5858

5959
struct my_llama_model {
@@ -141,9 +141,9 @@ static void set_param_model(struct my_llama_model * model) {
141141
ggml_set_param(ctx, layer.wv);
142142
ggml_set_param(ctx, layer.wo);
143143
ggml_set_param(ctx, layer.ffn_norm);
144-
ggml_set_param(ctx, layer.w1);
145-
ggml_set_param(ctx, layer.w2);
146-
ggml_set_param(ctx, layer.w3);
144+
ggml_set_param(ctx, layer.ffn_gate);
145+
ggml_set_param(ctx, layer.ffn_down);
146+
ggml_set_param(ctx, layer.ffn_up);
147147
}
148148
}
149149

@@ -159,9 +159,9 @@ static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * mode
159159
ggml_allocr_alloc(alloc, layer.wv);
160160
ggml_allocr_alloc(alloc, layer.wo);
161161
ggml_allocr_alloc(alloc, layer.ffn_norm);
162-
ggml_allocr_alloc(alloc, layer.w1);
163-
ggml_allocr_alloc(alloc, layer.w2);
164-
ggml_allocr_alloc(alloc, layer.w3);
162+
ggml_allocr_alloc(alloc, layer.ffn_gate);
163+
ggml_allocr_alloc(alloc, layer.ffn_down);
164+
ggml_allocr_alloc(alloc, layer.ffn_up);
165165
}
166166
ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
167167
ggml_allocr_alloc(alloc, model->norm->grad);
@@ -174,9 +174,9 @@ static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * mode
174174
ggml_allocr_alloc(alloc, layer.wv->grad);
175175
ggml_allocr_alloc(alloc, layer.wo->grad);
176176
ggml_allocr_alloc(alloc, layer.ffn_norm->grad);
177-
ggml_allocr_alloc(alloc, layer.w1->grad);
178-
ggml_allocr_alloc(alloc, layer.w2->grad);
179-
ggml_allocr_alloc(alloc, layer.w3->grad);
177+
ggml_allocr_alloc(alloc, layer.ffn_gate->grad);
178+
ggml_allocr_alloc(alloc, layer.ffn_down->grad);
179+
ggml_allocr_alloc(alloc, layer.ffn_up->grad);
180180
}
181181
}
182182

@@ -232,9 +232,9 @@ static void init_model(struct my_llama_model * model) {
232232

233233
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
234234

235-
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
236-
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
237-
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
235+
layer.ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
236+
layer.ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
237+
layer.ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
238238

239239
ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i));
240240

@@ -245,9 +245,9 @@ static void init_model(struct my_llama_model * model) {
245245

246246
ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i));
247247

248-
ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i));
249-
ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i));
250-
ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i));
248+
ggml_set_name(layer.ffn_gate, tni(LLM_TENSOR_FFN_GATE, i));
249+
ggml_set_name(layer.ffn_down, tni(LLM_TENSOR_FFN_DOWN, i));
250+
ggml_set_name(layer.ffn_up, tni(LLM_TENSOR_FFN_UP, i));
251251
}
252252

253253
set_param_model(model);
@@ -288,9 +288,9 @@ static void randomize_model(struct my_llama_model * model, int seed, float mean,
288288

289289
randomize_tensor_normal(layer.ffn_norm, rnd);
290290

291-
randomize_tensor_normal(layer.w1, rnd);
292-
randomize_tensor_normal(layer.w2, rnd);
293-
randomize_tensor_normal(layer.w3, rnd);
291+
randomize_tensor_normal(layer.ffn_gate, rnd);
292+
randomize_tensor_normal(layer.ffn_down, rnd);
293+
randomize_tensor_normal(layer.ffn_up, rnd);
294294
}
295295

296296
free_random_normal_distribution(rnd);
@@ -405,11 +405,11 @@ static struct ggml_tensor * llama_build_train_graphs(
405405
struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch);
406406
struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch);
407407
struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch);
408-
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
409-
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
408+
struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch);
409+
struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch);
410410
struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch);
411411
struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch);
412-
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
412+
struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch);
413413
struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch);
414414
cur = t30;
415415
checkpoints.push_back(cur);
@@ -560,9 +560,9 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
560560
copy_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i));
561561
copy_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i));
562562
copy_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i));
563-
copy_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
564-
copy_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
565-
copy_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
563+
copy_tensor_by_name(layer.ffn_gate, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i));
564+
copy_tensor_by_name(layer.ffn_down, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i));
565+
copy_tensor_by_name(layer.ffn_up, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i));
566566
}
567567
}
568568

@@ -703,9 +703,9 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
703703
gguf_add_tensor(fctx, layer.wv);
704704
gguf_add_tensor(fctx, layer.wo);
705705
gguf_add_tensor(fctx, layer.ffn_norm);
706-
gguf_add_tensor(fctx, layer.w1);
707-
gguf_add_tensor(fctx, layer.w2);
708-
gguf_add_tensor(fctx, layer.w3);
706+
gguf_add_tensor(fctx, layer.ffn_gate);
707+
gguf_add_tensor(fctx, layer.ffn_down);
708+
gguf_add_tensor(fctx, layer.ffn_up);
709709
}
710710
}
711711

@@ -954,9 +954,9 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
954954
nx += ggml_nelements(layer.wv);
955955
nx += ggml_nelements(layer.wo);
956956
nx += ggml_nelements(layer.ffn_norm);
957-
nx += ggml_nelements(layer.w1);
958-
nx += ggml_nelements(layer.w2);
959-
nx += ggml_nelements(layer.w3);
957+
nx += ggml_nelements(layer.ffn_gate);
958+
nx += ggml_nelements(layer.ffn_down);
959+
nx += ggml_nelements(layer.ffn_up);
960960
}
961961
return nx;
962962
}

0 commit comments

Comments (0)