@@ -174,6 +174,7 @@ struct llama_model {
     struct ggml_tensor * output;
 
     std::vector<llama_layer> layers;
+    int n_gpu_layers;
 
     // context
     struct ggml_context * ctx = NULL;
@@ -949,6 +950,7 @@ static void llama_model_load_internal(
     lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
     auto & model = lctx.model;
     model.hparams = ml->file_loaders.at(0)->hparams;
+    model.n_gpu_layers = n_gpu_layers;
     llama_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
     uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
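The loader simply copies the requested layer count onto the model so that `llama_eval_internal` can read it later. A minimal caller-side sketch of how that value would typically be supplied, assuming the `llama_context_params::n_gpu_layers` field and the `llama_init_from_file` API of this era of llama.cpp (names outside the diff are assumptions, not part of the patch):

```cpp
// Hypothetical usage sketch: ask for the last 20 layers to be offloaded.
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.n_gpu_layers = 20;   // copied into model.n_gpu_layers during load

    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (ctx == NULL) {
        return 1;
    }
    // ... tokenize, llama_eval(), sample ...
    llama_free(ctx);
    return 0;
}
```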
@@ -1253,12 +1255,13 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_embd/hparams.n_head;
+    const int n_embd       = hparams.n_embd;
+    const int n_layer      = hparams.n_layer;
+    const int n_ctx        = hparams.n_ctx;
+    const int n_head       = hparams.n_head;
+    const int n_vocab      = hparams.n_vocab;
+    const int n_rot        = hparams.n_embd/hparams.n_head;
+    const int n_gpu_layers = model.n_gpu_layers;
 
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
@@ -1289,31 +1292,50 @@ static bool llama_eval_internal(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
+    const int i_gpu_start = n_layer - n_gpu_layers;
+
     for (int il = 0; il < n_layer; ++il) {
+        ggml_backend backend_offload = GGML_BACKEND_CPU;
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            backend_offload = GGML_BACKEND_GPU;
+        }
+#endif // GGML_USE_CUBLAS
+
         struct ggml_tensor * inpSA = inpL;
 
         lctx.use_buf(ctx0, 0);
 
         // norm
         {
-            ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+            ggml_set_default_backend(ctx0, backend_offload);
            cur = ggml_rms_norm(ctx0, inpL);
            ggml_set_name(cur, "rms_norm_0");
 
            // cur = cur*attention_norm(broadcasted)
            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            ggml_set_name(tmpq, "tmpq");
             struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            ggml_set_name(tmpk, "tmpk");
             ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
 #ifdef GGML_USE_CUBLAS
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            if (backend_offload == GGML_BACKEND_GPU) {
+                Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
+                Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
+            } else {
+                Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+                Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
+            }
 #else
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
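The hunk above carries the core of the change: each repeating layer picks its backend from `i_gpu_start = n_layer - n_gpu_layers`, so `n_gpu_layers` counts from the top of the stack. On offloaded layers the non-inplace `ggml_rope` is used instead of `ggml_rope_inplace`, presumably because the in-place variant is not handled by the CUDA path. A small standalone illustration of the mapping (not part of the patch; `pick_backend` and the `backend` enum are stand-ins for illustration):

```cpp
// Illustration only: how the per-layer backend choice behaves for a
// 32-layer model with n_gpu_layers = 8. Layers 0..23 stay on the CPU,
// layers 24..31 are offloaded.
#include <cstdio>

enum backend { BACKEND_CPU, BACKEND_GPU };   // stand-in for ggml_backend

static backend pick_backend(int il, int n_layer, int n_gpu_layers) {
    const int i_gpu_start = n_layer - n_gpu_layers;   // same formula as the diff
    return il >= i_gpu_start ? BACKEND_GPU : BACKEND_CPU;
}

int main() {
    const int n_layer = 32, n_gpu_layers = 8;
    for (int il = 0; il < n_layer; ++il) {
        printf("layer %2d -> %s\n", il,
               pick_backend(il, n_layer, n_gpu_layers) == BACKEND_GPU ? "GPU" : "CPU");
    }
    return 0;
}
```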
@@ -1328,9 +1350,11 @@ static bool llama_eval_internal(
             ggml_set_name(Vcur, "Vcur");
 
             struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+            ggml_set_name(k, "k");
             struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
                     (   n_ctx)*ggml_element_size(kv_self.v),
                     (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+            ggml_set_name(v, "v");
 
             // important: storing RoPE-ed version of K in the KV cache!
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
@@ -1401,17 +1425,19 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");
 
-            ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+            ggml_set_default_backend(ctx0, backend_offload);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
         //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
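Most of the additions in this and the following hunks are `ggml_set_name` calls on intermediate tensors; at minimum they make the compute graph easier to inspect and debug. A sketch of how such names can be used once the graph is built, assuming the ggml build in use provides `ggml_graph_get_tensor` (an assumption, not shown in this diff):

```cpp
// Illustration: looking up a named intermediate result in the built graph.
#include "ggml.h"
#include <cstdio>

static void dump_named(struct ggml_cgraph * gf, const char * name) {
    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
    if (t == NULL) {
        printf("%s: not found\n", name);
        return;
    }
    printf("%s: ne = [%lld, %lld], type = %d\n",
           name, (long long) t->ne[0], (long long) t->ne[1], (int) t->type);
}

// e.g. dump_named(&gf, "result_wo"); dump_named(&gf, "inpFF");
```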
@@ -1422,27 +1448,34 @@ static bool llama_eval_internal(
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            ggml_set_name(cur, "result_w1");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
@@ -1456,16 +1489,23 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
-    ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+    }
+#endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
+        ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
+        ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
+        ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
     }
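The guard `n_gpu_layers > n_layer` only triggers when more layers are requested than the model has; the diff treats that as a request to also run the final RMS norm (and whatever is created afterwards under the same default backend) on the GPU. A rough summary, inferred from the checks in the diff rather than taken from it:

```cpp
// Illustration: how a requested n_gpu_layers value maps to placement for a
// model with n_layer transformer blocks (boundary behaviour inferred from
// the conditions in the diff, not part of the patch).
static const char * offload_summary(int n_gpu_layers, int n_layer) {
    if (n_gpu_layers <= 0) {
        return "everything on the CPU";
    }
    if (n_gpu_layers <= n_layer) {
        return "last n_gpu_layers blocks on the GPU, rest on the CPU";
    }
    return "all blocks plus the final norm (and subsequent ops) on the GPU";
}
```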
@@ -1474,6 +1514,7 @@ static bool llama_eval_internal(
 
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
 
     lctx.use_buf(ctx0, -1);
 