
Commit 4900d3a — ggml_context default_backend
Parent: 32a2095

File tree: 5 files changed (+42, −29 lines)

ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
@@ -600,7 +600,7 @@ void ggml_init_cublas() {
     }
 }
 
-void ggml_cuda_set_tensor_split(float * tensor_split) {
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -1295,12 +1295,12 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
 
     tensor->backend = GGML_BACKEND_GPU;
     struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
-    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
 
-    bool inplace = tensor->src0->data == tensor->data;
+    bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data;
 
     CUDA_CHECK(cudaSetDevice(g_main_device));
     if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+        struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
         extra->data_device[g_main_device] = src0_extra->data_device;
         GGML_ASSERT(false);
     } else {
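The const qualifier on tensor_split simply documents that the splits are only read. The src0 guard matters because, with this commit, ggml_cuda_assign_buffers() is also reached from ggml_new_tensor_impl() in ggml.c (see below) with freshly created leaf tensors whose src0 is NULL; the old code dereferenced tensor->src0 unconditionally, and src0->extra is now only looked up on the branch where src0 is known to be non-null. A minimal sketch of the guarded check, with a hypothetical struct and helper name:

    #include <stdbool.h>
    #include <stddef.h>

    struct tensor { struct tensor * src0; void * data; };

    /* Hypothetical helper: the NULL test must come first so that &&
     * short-circuits before src0 is dereferenced; a leaf tensor fresh out
     * of ggml_new_tensor_impl() has src0 == NULL. */
    static bool is_inplace(const struct tensor * t) {
        return t->src0 != NULL && t->src0->data == t->data;
    }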

ggml-cuda.h

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 void ggml_init_cublas(void);
-void ggml_cuda_set_tensor_split(float * tensor_split);
+void ggml_cuda_set_tensor_split(const float * tensor_split);
 
 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);

ggml.c

Lines changed: 17 additions & 1 deletion
@@ -3639,6 +3639,8 @@ struct ggml_context {
 
     struct ggml_scratch scratch;
     struct ggml_scratch scratch_save;
+
+    enum ggml_backend default_backend;
 };
 
 struct ggml_context_container {
@@ -3965,6 +3967,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.objects_end     =*/ NULL,
         /*.scratch         =*/ { 0, 0, NULL, },
         /*.scratch_save    =*/ { 0, 0, NULL, },
+        /*.default_backend =*/ GGML_BACKEND_CPU,
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -4023,6 +4026,10 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
+void ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend) {
+    ctx->default_backend = backend;
+}
+
 void * ggml_get_mem_buffer(struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
@@ -4134,7 +4141,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *result = (struct ggml_tensor) {
         /*.type    =*/ type,
-        /*.backend =*/ GGML_BACKEND_CPU,
+        /*.backend =*/ ctx->default_backend,
         /*.n_dims  =*/ n_dims,
         /*.ne      =*/ { 1, 1, 1, 1 },
         /*.nb      =*/ { 0, 0, 0, 0 },
@@ -4167,6 +4174,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
 
+#ifdef GGML_USE_CUBLAS
+    if (result->backend == GGML_BACKEND_GPU) {
+        ggml_cuda_assign_buffers(result);
+    }
+#else
+    GGML_ASSERT(result->backend == GGML_BACKEND_CPU);
+#endif // GGML_USE_CUBLAS
+    GGML_ASSERT(result->backend != GGML_BACKEND_GPU_SPLIT);
+
     ctx->n_objects++;
 
     return result;
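Together these hunks add a per-context default backend: ggml_init() starts it at GGML_BACKEND_CPU, ggml_set_default_backend() changes it, and every tensor created afterwards inherits it, with CUDA buffers assigned automatically in GGML_USE_CUBLAS builds. A hedged usage sketch (the buffer size and tensor shape are illustrative assumptions, not from the commit):

    #include "ggml.h"

    void example(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        /* Tensors created here inherit GGML_BACKEND_GPU; in GGML_USE_CUBLAS
         * builds ggml_new_tensor_impl() also calls ggml_cuda_assign_buffers()
         * on them. */
        ggml_set_default_backend(ctx, GGML_BACKEND_GPU);
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

        /* Restore the CPU default for the rest of the graph. */
        ggml_set_default_backend(ctx, GGML_BACKEND_CPU);

        (void) a;
        ggml_free(ctx);
    }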

ggml.h

Lines changed: 1 addition & 0 deletions
@@ -479,6 +479,7 @@ extern "C" {
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+    GGML_API void   ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend);
 
     GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
     GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);

llama.cpp

Lines changed: 20 additions & 24 deletions
@@ -934,7 +934,7 @@ static void llama_model_load_internal(
         llama_context & lctx,
         int n_ctx,
         int n_gpu_layers,
-        float * tensor_split,
+        const float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1293,30 +1293,31 @@ static bool llama_eval_internal(
         struct ggml_tensor * inpSA = inpL;
 
         lctx.use_buf(ctx0, 0);
-        //ggml_cuda_set_scratch(0);
 
         // norm
         {
+            ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
            cur = ggml_rms_norm(ctx0, inpL);
             ggml_set_name(cur, "rms_norm_0");
-            ggml_cuda_assign_buffers(cur);
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
-            ggml_cuda_assign_buffers(cur);
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            ggml_cuda_assign_buffers(tmpq);
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            Qcur->backend = GGML_BACKEND_CPU;
-            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            ggml_cuda_assign_buffers(tmpk);
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            Kcur->backend = GGML_BACKEND_CPU;
+            struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
+
+#ifdef GGML_USE_CUBLAS
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
+#else
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
+#endif // GGML_USE_CUBLAS
             ggml_set_name(Qcur, "Qcur");
             ggml_set_name(Kcur, "Kcur");
 
@@ -1400,60 +1401,53 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");
 
+            ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
-            ggml_cuda_assign_buffers(cur);
         }
 
         lctx.use_buf(ctx0, 1);
         //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-        ggml_cuda_assign_buffers(inpFF);
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
                 ggml_set_name(cur, "rms_norm_1");
-                ggml_cuda_assign_buffers(cur);
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
-                ggml_cuda_assign_buffers(cur);
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
-            ggml_cuda_assign_buffers(tmp);
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
-            ggml_cuda_assign_buffers(cur);
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
-            ggml_cuda_assign_buffers(cur);
 
             cur = ggml_mul(ctx0, cur, tmp);
-            ggml_cuda_assign_buffers(cur);
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
-            ggml_cuda_assign_buffers(cur);
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
-        ggml_cuda_assign_buffers(cur);
 
         // input for next layer
         inpL = cur;
+
+        ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1462,20 +1456,22 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
+
     // norm
     {
        cur = ggml_rms_norm(ctx0, inpL);
 
         cur = ggml_rms_norm(ctx0, cur);
-        ggml_cuda_assign_buffers(cur);
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
-        ggml_cuda_assign_buffers(cur);
 
         embeddings = cur;
     }
 
+    ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
+
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
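The net effect in llama_eval_internal() is that the scattered per-tensor ggml_cuda_assign_buffers() calls give way to paired backend switches that bracket each GPU-resident region of the graph, and in GGML_USE_CUBLAS builds RoPE is applied out of place rather than in place. A condensed sketch of the bracketing pattern (layer_norm_weight is a placeholder for model.layers[il].attention_norm in the real code):

    #include "ggml.h"

    /* Hypothetical condensed view of the pattern used above: switch the
     * context default to GPU, build the ops that should live there, then
     * switch back so later ops default to the CPU again. */
    static struct ggml_tensor * norm_block(struct ggml_context * ctx0,
                                           struct ggml_tensor * inpL,
                                           struct ggml_tensor * layer_norm_weight) {
        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);      // GPU region begins
        struct ggml_tensor * cur = ggml_rms_norm(ctx0, inpL);  // created as a GPU tensor
        cur = ggml_mul(ctx0, cur, layer_norm_weight);          // still in the GPU region
        ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);      // back to the CPU default
        return cur;
    }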
