
Commit dcbb182

fixed performance for CPU layers
1 parent 4900d3a commit dcbb182

File tree

1 file changed


llama.cpp

Lines changed: 52 additions & 11 deletions
@@ -174,6 +174,7 @@ struct llama_model {
     struct ggml_tensor * output;

     std::vector<llama_layer> layers;
+    int n_gpu_layers;

     // context
     struct ggml_context * ctx = NULL;
@@ -949,6 +950,7 @@ static void llama_model_load_internal(
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
     uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
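Note: the first two hunks only plumb the requested GPU-layer count through — llama_model gains an n_gpu_layers field and llama_model_load_internal records the value on the loaded model so that llama_eval_internal can read it back. A minimal standalone sketch of that plumbing, using hypothetical stand-in types rather than the real llama.cpp structs:

// Hypothetical, simplified stand-ins (not the real llama.cpp structs) that
// mirror the plumbing above: the loader records the requested GPU layer
// count on the model so evaluation code can consult it later.
#include <vector>

struct fake_layer {};

struct fake_model {
    std::vector<fake_layer> layers;
    int n_gpu_layers = 0; // mirrors the new llama_model::n_gpu_layers field
};

static void fake_load(fake_model & model, int n_layer, int n_gpu_layers) {
    model.layers.resize(n_layer);
    model.n_gpu_layers = n_gpu_layers; // mirrors llama_model_load_internal
}

int main() {
    fake_model m;
    fake_load(m, 32, 20); // e.g. a 32-layer model with 20 layers offloaded
    return m.n_gpu_layers == 20 ? 0 : 1;
}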
@@ -1253,12 +1255,13 @@ static bool llama_eval_internal(

     LLAMA_ASSERT(!!kv_self.ctx);

-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_embd/hparams.n_head;
+    const int n_embd       = hparams.n_embd;
+    const int n_layer      = hparams.n_layer;
+    const int n_ctx        = hparams.n_ctx;
+    const int n_head       = hparams.n_head;
+    const int n_vocab      = hparams.n_vocab;
+    const int n_rot        = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;

     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -1289,31 +1292,50 @@ static bool llama_eval_internal(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

+    const int i_gpu_start = n_layer - n_gpu_layers;
+
     for (int il = 0; il < n_layer; ++il) {
+        ggml_backend backend_offload = GGML_BACKEND_CPU;
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            backend_offload = GGML_BACKEND_GPU;
+        }
+#endif // GGML_USE_CUBLAS
+
         struct ggml_tensor * inpSA = inpL;

         lctx.use_buf(ctx0, 0);

         // norm
         {
-            ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+            ggml_set_default_backend(ctx0, backend_offload);
             cur = ggml_rms_norm(ctx0, inpL);
             ggml_set_name(cur, "rms_norm_0");

             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            ggml_set_name(cur, "attention_norm_0");
         }

         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            ggml_set_name(tmpq, "tmpq");
             struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            ggml_set_name(tmpk, "tmpk");
             ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);

 #ifdef GGML_USE_CUBLAS
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            if (backend_offload == GGML_BACKEND_GPU) {
+                Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
+                Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
+            } else {
+                Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+                Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
+            }
 #else
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
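Note: this is the core of the change — instead of unconditionally defaulting every layer to the GPU, each iteration of the layer loop now picks its backend from the requested offload count, and layers that stay on the CPU go back to the in-place RoPE variant (the out-of-place ggml_rope is kept only for offloaded layers), which is presumably what "fixed performance for CPU layers" refers to. A standalone sketch of the selection logic, with stand-in enums and example values in place of the real ggml types:

// Standalone illustration of the per-layer backend choice added above.
// The enum and the layer counts are stand-ins, not the real ggml types.
#include <cstdio>

enum class backend { cpu, gpu };

int main() {
    const int n_layer      = 8; // example values only
    const int n_gpu_layers = 3;

    // Same split as the diff: the last n_gpu_layers layers are offloaded,
    // everything before i_gpu_start stays on the CPU.
    const int i_gpu_start = n_layer - n_gpu_layers;

    for (int il = 0; il < n_layer; ++il) {
        backend backend_offload = backend::cpu;
        if (il >= i_gpu_start) {
            backend_offload = backend::gpu;
        }
        // CPU layers take the in-place RoPE variant, GPU layers the
        // out-of-place one, mirroring the if/else in the diff.
        const char * rope = backend_offload == backend::gpu ? "ggml_rope"
                                                            : "ggml_rope_inplace";
        std::printf("layer %d -> %s (%s)\n", il,
                    backend_offload == backend::gpu ? "GPU" : "CPU", rope);
    }
    return 0;
}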
@@ -1328,9 +1350,11 @@ static bool llama_eval_internal(
             ggml_set_name(Vcur, "Vcur");

             struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+            ggml_set_name(k, "k");
             struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                     (   n_ctx)*ggml_element_size(kv_self.v),
                     (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            ggml_set_name(v, "v");

             // important: storing RoPE-ed version of K in the KV cache!
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1401,17 +1425,19 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");

-            ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+            ggml_set_default_backend(ctx0, backend_offload);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            ggml_set_name(cur, "result_wo");
         }

         lctx.use_buf(ctx0, 1);
         //ggml_cuda_set_scratch(1);

         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        ggml_set_name(inpFF, "inpFF");

         // feed-forward network
         {
@@ -1422,27 +1448,34 @@ static bool llama_eval_internal(

                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                ggml_set_name(cur, "ffn_norm");
             }

             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
+            ggml_set_name(cur, "result_w3");

             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            ggml_set_name(cur, "result_w2");

             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            ggml_set_name(cur, "silu");

             cur = ggml_mul(ctx0, cur, tmp);
+            ggml_set_name(cur, "silu_x_result_w3");

             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            ggml_set_name(cur, "result_w2");
         }

         cur = ggml_add(ctx0, cur, inpFF);
+        ggml_set_name(cur, "inpFF_+_result_w2");

         // input for next layer
         inpL = cur;
@@ -1456,16 +1489,23 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;

-    ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+    }
+#endif // GGML_USE_CUBLAS

     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
+        ggml_set_name(cur, "rms_norm_inpL");

         cur = ggml_rms_norm(ctx0, cur);
+        ggml_set_name(cur, "rms_norm_after");

         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
+        ggml_set_name(cur, "result_norm");

         embeddings = cur;
     }
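Note: the GPU default for the final norm was previously set unconditionally here; it is now only applied when n_gpu_layers exceeds the number of repeating layers (presumably signalling "offload everything, including the non-repeating tail"), so the tail stays on the CPU otherwise. A tiny sketch of that guard, with made-up example values rather than the real API:

// Tiny illustration of the guard added above: the tail of the graph
// (final norm + lm_head) only defaults to the GPU backend when more
// layers were requested than the model has. Example values only.
#include <cstdio>

static const char * output_backend(int n_gpu_layers, int n_layer) {
    return n_gpu_layers > n_layer ? "GPU" : "CPU";
}

int main() {
    std::printf("%s\n", output_backend(32, 32)); // exactly n_layer -> CPU
    std::printf("%s\n", output_backend(33, 32)); // n_layer + 1     -> GPU
    return 0;
}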
@@ -1474,6 +1514,7 @@ static bool llama_eval_internal(

     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");

     lctx.use_buf(ctx0, -1);
