
Commit 4c0f243

offload KQ_mask with all models
1 parent 488c1fc

File tree

1 file changed (+14 -4 lines)


llama.cpp

Lines changed: 14 additions & 4 deletions
@@ -2692,15 +2692,16 @@ static struct ggml_cgraph * llm_build_llama(
 
     // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
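
Taken together, the hunks apply one pattern to KQ_scale, KQ_mask, KQ_pos and K_shift in every builder: call ggml_set_name right after creating the tensor, before the allocator and any offload hook see it, instead of after the measure-guarded init; and route KQ_mask through offload_func_kq in the builders that were missing it. Presumably this guarantees the name is already in place when anything later walks the graph, though the diff itself records only the reordering. A minimal sketch of the ordering, assuming the ggml/ggml-alloc API of this era; make_kq_scale is a hypothetical helper, not llama.cpp code:

#include <math.h>
#include "ggml.h"
#include "ggml-alloc.h"

// Hypothetical helper illustrating the ordering this commit enforces:
// name first, then allocate, and only write data outside measure mode,
// when the tensor actually has backing memory.
static struct ggml_tensor * make_kq_scale(struct ggml_context * ctx0,
                                          struct ggml_allocr  * alloc,
                                          int                   n_embd_head) {
    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // before alloc, as in the diff
    ggml_allocr_alloc(alloc, KQ_scale);
    if (!ggml_allocr_is_measure(alloc)) {
        ggml_set_f32(KQ_scale, 1.0f/sqrtf((float) n_embd_head));
    }
    return KQ_scale;
}

Offloading the mask itself is the commit's headline: offload_func_kq(KQ_mask) was already present in llm_build_llama and is added to the baichuan and falcon builders below, while starcoder's builder takes no offload path in this diff.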
@@ -3081,14 +3082,16 @@ static struct ggml_cgraph * llm_build_baichaun(
 
     // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
@@ -3111,6 +3114,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
     ggml_allocr_alloc(lctx.alloc, KQ_pos);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         int * data = (int *) KQ_pos->data;
@@ -3123,6 +3127,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     if (do_rope_shift) {
         struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
         offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
         ggml_allocr_alloc(lctx.alloc, K_shift);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             int * data = (int *) K_shift->data;
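
K_shift appears only under do_rope_shift: it carries one integer per context slot, used to re-rotate cached K values after the KV cache has been shifted. The fill loop sits just below the hunk's context window; a sketch of its shape, with get_cell_delta standing in (hypothetically) for the KV-cache bookkeeping the diff does not show:

#include "ggml.h"

// Sketch only, not llama.cpp code: K_shift holds n_ctx position deltas,
// one per KV cache cell; cells that were never shifted contribute 0.
static void fill_k_shift(struct ggml_tensor * K_shift, int n_ctx,
                         int (*get_cell_delta)(int cell)) {
    int * data = (int *) K_shift->data;
    for (int i = 0; i < n_ctx; ++i) {
        data[i] = get_cell_delta(i); // hypothetical accessor
    }
}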
@@ -3487,14 +3492,16 @@ static struct ggml_cgraph * llm_build_falcon(
 
     // KQ_scale
     struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
@@ -3517,6 +3524,7 @@ static struct ggml_cgraph * llm_build_falcon(
     // KQ_pos - contains the positions
     struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
     offload_func_kq(KQ_pos);
+    ggml_set_name(KQ_pos, "KQ_pos");
     ggml_allocr_alloc(lctx.alloc, KQ_pos);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         int * data = (int *) KQ_pos->data;
@@ -3529,6 +3537,7 @@ static struct ggml_cgraph * llm_build_falcon(
     if (do_rope_shift) {
         struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
         offload_func_kq(K_shift);
+        ggml_set_name(K_shift, "K_shift");
         ggml_allocr_alloc(lctx.alloc, K_shift);
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             int * data = (int *) K_shift->data;
@@ -3835,14 +3844,15 @@ static struct ggml_cgraph * llm_build_starcoder(
 
     // KQ_scale
    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
     ggml_allocr_alloc(lctx.alloc, KQ_scale);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
     }
-    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
 
     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    ggml_set_name(KQ_mask, "KQ_mask");
     ggml_allocr_alloc(lctx.alloc, KQ_mask);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
         float * data = (float *) KQ_mask->data;
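
The comment above each KQ_mask is the key to its shape: the tensor holds values for a single head in an [n_kv, n_tokens, 1] layout and is broadcast across all heads by the attention op. The fill loop is again below the context window; a self-contained sketch of a causal fill under that layout, with kv_pos and tok_pos as hypothetical position arrays (llama.cpp tracks these inside its KV cache instead):

#include <math.h>

// Sketch only: mark every KV slot that lies in a token's future with
// -INFINITY so softmax zeroes it; allowed positions stay at 0.0f.
// n_kv is the fastest-varying dimension, hence the j*n_kv + i indexing.
static void fill_causal_mask(float * data, int n_kv, int n_tokens,
                             const int * kv_pos, const int * tok_pos) {
    for (int j = 0; j < n_tokens; ++j) {
        for (int i = 0; i < n_kv; ++i) {
            data[j*n_kv + i] = (kv_pos[i] > tok_pos[j]) ? -INFINITY : 0.0f;
        }
    }
}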
