Skip to content

Commit 199f6bd

Browse files
committed
ggml : change ggml_scale to take a float instead of tensor
1 parent 8fe03ff commit 199f6bd

File tree

10 files changed

+68
-186
lines changed

10 files changed

+68
-186
lines changed

examples/baby-llama/baby-llama.cpp

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -575,10 +575,7 @@ static struct ggml_tensor * forward(
575575

576576
// KQ_scaled = KQ / sqrt(n_embd/n_head)
577577
// KQ_scaled shape [n_past + N, N, n_head, 1]
578-
struct ggml_tensor * KQ_scaled =
579-
ggml_scale(ctx0,
580-
KQ,
581-
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
578+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
582579

583580
// KQ_masked = mask_past(KQ_scaled)
584581
// KQ_masked shape [n_past + N, N, n_head, 1]
@@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch(
844841

845842
// KQ_scaled = KQ / sqrt(n_embd/n_head)
846843
// KQ_scaled shape [n_past + N, N, n_head, n_batch]
847-
struct ggml_tensor * KQ_scaled =
848-
ggml_scale(ctx0,
849-
KQ,
850-
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
844+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
851845
assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
852846

853847
// KQ_masked = mask_past(KQ_scaled)
@@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora(
11311125

11321126
// KQ_scaled = KQ / sqrt(n_embd/n_head)
11331127
// KQ_scaled shape [n_past + N, N, n_head, 1]
1134-
struct ggml_tensor * KQ_scaled =
1135-
ggml_scale(ctx0,
1136-
KQ,
1137-
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
1128+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
11381129

11391130
// KQ_masked = mask_past(KQ_scaled)
11401131
// KQ_masked shape [n_past + N, N, n_head, 1]

examples/export-lora/export-lora.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
309309
) {
310310
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
311311
if (scaling != 1.0f) {
312-
ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
312+
ab = ggml_scale(ctx, ab, scaling);
313313
}
314314
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
315315

examples/finetune/finetune.cpp

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
269269
float rope_freq_scale = 1.0f;
270270
GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
271271
GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
272-
GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
272+
GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
273273
if (rope_freq_scale != 1.0f) {
274274
hparams->rope_freq_scale = 1.0f / rope_freq_scale;
275275
}
@@ -612,6 +612,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
612612
const int n_rot = hparams.n_embd_head();
613613
const int n_embd_head = hparams.n_embd_head();
614614
const int n_embd_gqa = hparams.n_embd_gqa();
615+
615616
const float rms_norm_eps = hparams.f_norm_rms_eps;
616617
const float rope_freq_base = hparams.rope_freq_base;
617618
const float rope_freq_scale = hparams.rope_freq_scale;
@@ -680,10 +681,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
680681
checkpoints.push_back(t01);
681682
}
682683

683-
struct ggml_tensor * kv_scale = NULL;
684-
if (!enable_flash_attn) {
685-
kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
686-
}
684+
const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
687685

688686
for (int il = 0; il < n_layer; ++il) {
689687
struct my_llama_layer & layer = model->layers[il];
@@ -781,32 +779,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
781779
// make sure some tensors are not reallocated by inserting new temporary nodes depending on them
782780
int n_leafs_before = gb->n_leafs;
783781
int n_nodes_before = gb->n_nodes;
784-
struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
782+
785783
// output tensors
786-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
787-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
784+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
785+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
788786
// input gradient
789-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
787+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
790788
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
791789
ggml_allocr_alloc(alloc, t36->grad);
792790
// KQ_pos
793-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
791+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
794792

795793
// make sure base model tensors data cannot be used in viewable operations
796-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
797-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
798-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
794+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
795+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
796+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
799797
for (int il = 0; il < n_layer; ++il) {
800798
struct my_llama_layer & layer = model->layers[il];
801-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
802-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
803-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
804-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
805-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
806-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
807-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
808-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
809-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
799+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
800+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
801+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
802+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
803+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
804+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
805+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
806+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
807+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
810808
}
811809

812810
// allocating checkpoints in one block to reduce memory fragmentation

examples/llava/clip.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -330,12 +330,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
330330
ggml_repeat(ctx0, model.pre_ln_b, embeddings));
331331
}
332332

333-
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
334-
ggml_allocr_alloc(ctx->alloc, KQ_scale);
335-
if (!ggml_allocr_is_measure(ctx->alloc)) {
336-
ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head));
337-
}
338-
339333
// loop over layers
340334
for (int il = 0; il < n_layer - 1; il++) {
341335
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
@@ -356,7 +350,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
356350
struct ggml_tensor * Q =
357351
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));
358352

359-
Q = ggml_scale_inplace(ctx0, Q, KQ_scale);
353+
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
360354
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
361355
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
362356
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -369,10 +369,7 @@ static struct ggml_tensor * llama_build_train_graphs(
369369
checkpoints.push_back(t00);
370370
checkpoints.push_back(t01);
371371

372-
struct ggml_tensor * kv_scale = NULL;
373-
if (!enable_flash_attn) {
374-
kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
375-
}
372+
const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
376373

377374
for (int il = 0; il < n_layer; ++il) {
378375
struct my_llama_layer & layer = model->layers[il];
@@ -444,14 +441,13 @@ static struct ggml_tensor * llama_build_train_graphs(
444441
// make sure some tensors are not reallocated by inserting new temporary nodes depending on them
445442
int n_leafs_before = gb->n_leafs;
446443
int n_nodes_before = gb->n_nodes;
447-
struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
448444
// output tensors
449-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
450-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
445+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
446+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
451447
// input gradient
452-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
448+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
453449
// KQ_pos
454-
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
450+
ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
455451
GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
456452

457453
ggml_allocr_alloc(alloc, t36->grad);

ggml-cuda.cu

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7694,17 +7694,9 @@ inline void ggml_cuda_op_scale(
76947694
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
76957695

76967696
GGML_ASSERT(src0->type == GGML_TYPE_F32);
7697-
GGML_ASSERT(src1->type == GGML_TYPE_F32);
76987697
GGML_ASSERT( dst->type == GGML_TYPE_F32);
76997698

7700-
float scale;
7701-
// HACK: support for ggml backend interface
7702-
if (src1->backend == GGML_BACKEND_CPU) {
7703-
scale = ((float *) src1->data)[0];
7704-
} else {
7705-
// TODO: pass pointer to kernel instead of copying to host
7706-
CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
7707-
}
7699+
const float scale = ((float *) dst->op_params)[0];
77087700

77097701
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
77107702
CUDA_CHECK(cudaGetLastError());
@@ -7751,8 +7743,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
77517743
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
77527744
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;
77537745

7754-
const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
7755-
77567746
// dd = data device
77577747
float * src0_ddf = nullptr;
77587748
float * src1_ddf = nullptr;
@@ -7773,7 +7763,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
77737763
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
77747764
}
77757765

7776-
if (use_src1 && !src1_stays_on_host) {
7766+
if (use_src1) {
77777767
if (src1_on_device) {
77787768
src1_ddf = (float *) src1_extra->data_device[g_main_device];
77797769
} else {

ggml-metal.m

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1261,7 +1261,7 @@ void ggml_metal_graph_compute(
12611261
{
12621262
GGML_ASSERT(ggml_is_contiguous(src0));
12631263

1264-
const float scale = *(const float *) src1->data;
1264+
const float scale = *(const float *) dst->op_params;
12651265

12661266
int64_t n = ggml_nelements(dst);
12671267

@@ -1272,8 +1272,8 @@ void ggml_metal_graph_compute(
12721272
[encoder setComputePipelineState:ctx->pipeline_scale];
12731273
}
12741274

1275-
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1276-
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
1275+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
1276+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
12771277
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
12781278

12791279
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];

ggml.c

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -4183,39 +4183,39 @@ struct ggml_tensor * ggml_out_prod(
41834183
static struct ggml_tensor * ggml_scale_impl(
41844184
struct ggml_context * ctx,
41854185
struct ggml_tensor * a,
4186-
struct ggml_tensor * b,
4186+
float s,
41874187
bool inplace) {
4188-
GGML_ASSERT(ggml_is_scalar(b));
41894188
GGML_ASSERT(ggml_is_padded_1d(a));
41904189

41914190
bool is_node = false;
41924191

4193-
if (a->grad || b->grad) {
4192+
if (a->grad) {
41944193
is_node = true;
41954194
}
41964195

41974196
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
41984197

4198+
ggml_set_op_params(result, &s, sizeof(s));
4199+
41994200
result->op = GGML_OP_SCALE;
42004201
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
42014202
result->src[0] = a;
4202-
result->src[1] = b;
42034203

42044204
return result;
42054205
}
42064206

42074207
struct ggml_tensor * ggml_scale(
42084208
struct ggml_context * ctx,
42094209
struct ggml_tensor * a,
4210-
struct ggml_tensor * b) {
4211-
return ggml_scale_impl(ctx, a, b, false);
4210+
float s) {
4211+
return ggml_scale_impl(ctx, a, s, false);
42124212
}
42134213

42144214
struct ggml_tensor * ggml_scale_inplace(
42154215
struct ggml_context * ctx,
42164216
struct ggml_tensor * a,
4217-
struct ggml_tensor * b) {
4218-
return ggml_scale_impl(ctx, a, b, true);
4217+
float s) {
4218+
return ggml_scale_impl(ctx, a, s, true);
42194219
}
42204220

42214221
// ggml_set
@@ -14851,7 +14851,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
1485114851

1485214852
static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
1485314853
if (ggml_hash_contains(zero_table, a)) {
14854-
struct ggml_tensor * a_zero = ggml_scale(ctx, a, ggml_new_f32(ctx, 0));
14854+
struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
1485514855
return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
1485614856
} else {
1485714857
return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
@@ -14987,7 +14987,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
1498714987
src0->grad,
1498814988
ggml_scale(ctx,
1498914989
ggml_mul(ctx, src0, tensor->grad),
14990-
ggml_new_f32(ctx, 2.0f)),
14990+
2.0f),
1499114991
zero_table);
1499214992
}
1499314993
} break;
@@ -15001,7 +15001,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
1500115001
ggml_div(ctx,
1500215002
tensor->grad,
1500315003
tensor),
15004-
ggml_new_f32(ctx, 0.5f)),
15004+
0.5f),
1500515005
zero_table);
1500615006
}
1500715007
} break;
@@ -15167,17 +15167,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
1516715167
{
1516815168
// necessary for llama
1516915169
if (src0->grad) {
15170+
const float s = ((float *) tensor->op_params)[0];
15171+
1517015172
src0->grad =
1517115173
ggml_add_or_set(ctx,
1517215174
src0->grad,
15173-
ggml_scale_impl(ctx, tensor->grad, src1, false),
15174-
zero_table);
15175-
}
15176-
if (src1->grad) {
15177-
src1->grad =
15178-
ggml_add_or_set(ctx,
15179-
src1->grad,
15180-
ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)),
15175+
ggml_scale_impl(ctx, tensor->grad, s, false),
1518115176
zero_table);
1518215177
}
1518315178
} break;

ggml.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,13 +1094,13 @@ extern "C" {
10941094
GGML_API struct ggml_tensor * ggml_scale(
10951095
struct ggml_context * ctx,
10961096
struct ggml_tensor * a,
1097-
struct ggml_tensor * b);
1097+
float s);
10981098

10991099
// in-place, returns view(a)
11001100
GGML_API struct ggml_tensor * ggml_scale_inplace(
11011101
struct ggml_context * ctx,
11021102
struct ggml_tensor * a,
1103-
struct ggml_tensor * b);
1103+
float s);
11041104

11051105
// b -> view(a,offset,nb1,nb2,3), return modified a
11061106
GGML_API struct ggml_tensor * ggml_set(

0 commit comments

Comments (0)