@@ -1245,8 +1245,7 @@ struct llama_cparams {
     float yarn_beta_slow;
 
     bool mul_mat_q;
-    bool offload_k;
-    bool offload_v;
+    bool offload_kqv;
 
 };
 
@@ -1526,8 +1525,7 @@ static bool llama_kv_cache_init(
         ggml_type   wtype,
          uint32_t   n_ctx,
               int   n_gpu_layers,
-             bool   offload_k,
-             bool   offload_v) {
+             bool   offload) {
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
 
@@ -1574,11 +1572,9 @@ static bool llama_kv_cache_init(
         cache.v_l.push_back(v);
 #ifdef GGML_USE_CUBLAS
         if (i >= i_gpu_start) {
-            if (offload_k) {
+            if (offload) {
                 ggml_cuda_assign_buffers_no_scratch(k);
                 vram_kv_cache += ggml_nbytes(k);
-            }
-            if (offload_v) {
                 ggml_cuda_assign_buffers_no_scratch(v);
                 vram_kv_cache += ggml_nbytes(v);
             }
@@ -5101,6 +5097,7 @@ enum llm_offload_func_e {
     OFFLOAD_FUNC_NOP,
     OFFLOAD_FUNC,
     OFFLOAD_FUNC_FRC, // force offload
+    OFFLOAD_FUNC_KQV,
     OFFLOAD_FUNC_NR,
     OFFLOAD_FUNC_EMB,
     OFFLOAD_FUNC_OUT,
@@ -5204,38 +5201,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "attn_norm",         OFFLOAD_FUNC     },
     { "attn_norm_2",       OFFLOAD_FUNC     },
 
-    { "wqkv",              OFFLOAD_FUNC     },
-    { "bqkv",              OFFLOAD_FUNC     },
-    { "wqkv_clamped",      OFFLOAD_FUNC     },
-
-    { "tmpk",              OFFLOAD_FUNC     },
-    { "tmpq",              OFFLOAD_FUNC     },
-    { "tmpv",              OFFLOAD_FUNC     },
-    { "Kcur",              OFFLOAD_FUNC     },
-    { "Qcur",              OFFLOAD_FUNC     },
-    { "Vcur",              OFFLOAD_FUNC     },
-
-    { "krot",              OFFLOAD_FUNC     },
-    { "qrot",              OFFLOAD_FUNC     },
-    { "kpass",             OFFLOAD_FUNC     },
-    { "qpass",             OFFLOAD_FUNC     },
-    { "krotated",          OFFLOAD_FUNC     },
-    { "qrotated",          OFFLOAD_FUNC     },
-
-    { "q",                 OFFLOAD_FUNC     },
-    { "k",                 OFFLOAD_FUNC     },
-    { "kq",                OFFLOAD_FUNC     },
-    { "kq_scaled",         OFFLOAD_FUNC     },
-    { "kq_scaled_alibi",   OFFLOAD_FUNC     },
-    { "kq_masked",         OFFLOAD_FUNC     },
-    { "kq_soft_max",       OFFLOAD_FUNC     },
-    { "kq_soft_max_ext",   OFFLOAD_FUNC     },
-    { "v",                 OFFLOAD_FUNC     },
-    { "kqv",               OFFLOAD_FUNC     },
-    { "kqv_merged",        OFFLOAD_FUNC     },
-    { "kqv_merged_cont",   OFFLOAD_FUNC     },
-    { "kqv_wo",            OFFLOAD_FUNC     },
-    { "kqv_out",           OFFLOAD_FUNC     },
+    { "wqkv",              OFFLOAD_FUNC_KQV },
+    { "bqkv",              OFFLOAD_FUNC_KQV },
+    { "wqkv_clamped",      OFFLOAD_FUNC_KQV },
+
+    { "tmpk",              OFFLOAD_FUNC_KQV },
+    { "tmpq",              OFFLOAD_FUNC_KQV },
+    { "tmpv",              OFFLOAD_FUNC_KQV },
+    { "Kcur",              OFFLOAD_FUNC_KQV },
+    { "Qcur",              OFFLOAD_FUNC_KQV },
+    { "Vcur",              OFFLOAD_FUNC_KQV },
+
+    { "krot",              OFFLOAD_FUNC_KQV },
+    { "qrot",              OFFLOAD_FUNC_KQV },
+    { "kpass",             OFFLOAD_FUNC_KQV },
+    { "qpass",             OFFLOAD_FUNC_KQV },
+    { "krotated",          OFFLOAD_FUNC_KQV },
+    { "qrotated",          OFFLOAD_FUNC_KQV },
+
+    { "q",                 OFFLOAD_FUNC_KQV },
+    { "k",                 OFFLOAD_FUNC_KQV },
+    { "kq",                OFFLOAD_FUNC_KQV },
+    { "kq_scaled",         OFFLOAD_FUNC_KQV },
+    { "kq_scaled_alibi",   OFFLOAD_FUNC_KQV },
+    { "kq_masked",         OFFLOAD_FUNC_KQV },
+    { "kq_soft_max",       OFFLOAD_FUNC_KQV },
+    { "kq_soft_max_ext",   OFFLOAD_FUNC_KQV },
+    { "v",                 OFFLOAD_FUNC_KQV },
+    { "kqv",               OFFLOAD_FUNC_KQV },
+    { "kqv_merged",        OFFLOAD_FUNC_KQV },
+    { "kqv_merged_cont",   OFFLOAD_FUNC_KQV },
+    { "kqv_wo",            OFFLOAD_FUNC_KQV },
+    { "kqv_out",           OFFLOAD_FUNC_KQV },
 
     { "ffn_inp",           OFFLOAD_FUNC     },
     { "ffn_norm",          OFFLOAD_FUNC     },
@@ -5429,11 +5426,13 @@ static struct ggml_cgraph * llama_build_graph(
 #ifdef GGML_USE_CUBLAS
         { OFFLOAD_FUNC,     "GPU (CUDA)" },
         { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+        { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
         { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR" },
         { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
 #else
         { OFFLOAD_FUNC,     "CPU" },
         { OFFLOAD_FUNC_FRC, "CPU" },
+        { OFFLOAD_FUNC_KQV, "CPU" },
         { OFFLOAD_FUNC_NR,  "CPU" },
         { OFFLOAD_FUNC_EMB, "CPU" },
 #endif // GGML_USE_CUBLAS
@@ -5458,7 +5457,6 @@ static struct ggml_cgraph * llama_build_graph(
         switch (func_e) {
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT:
-            case OFFLOAD_FUNC_FRC:
                 break;
             case OFFLOAD_FUNC:
                 if (n_gpu_layers < n_layer) {
@@ -5467,6 +5465,21 @@ static struct ggml_cgraph * llama_build_graph(
                     }
                 }
                 break;
+            case OFFLOAD_FUNC_FRC:
+                if (!lctx.cparams.offload_kqv) {
+                    func_e = OFFLOAD_FUNC_NOP;
+                } break;
+            case OFFLOAD_FUNC_KQV:
+                if (!lctx.cparams.offload_kqv) {
+                    func_e = OFFLOAD_FUNC_NOP;
+                } else {
+                    if (n_gpu_layers < n_layer) {
+                        if (il < i_gpu_start) {
+                            func_e = OFFLOAD_FUNC_NOP;
+                        }
+                    }
+                }
+                break;
             case OFFLOAD_FUNC_NR:
                 if (n_gpu_layers <= n_layer + 0) {
                     func_e = OFFLOAD_FUNC_NOP;
@@ -5493,6 +5506,7 @@ static struct ggml_cgraph * llama_build_graph(
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
             case OFFLOAD_FUNC:
+            case OFFLOAD_FUNC_KQV:
             case OFFLOAD_FUNC_FRC:
             case OFFLOAD_FUNC_NR:
             case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
@@ -8567,8 +8581,7 @@ struct llama_context_params llama_context_default_params() {
         /*.f16_kv      =*/ true,
         /*.logits_all  =*/ false,
         /*.embedding   =*/ false,
-        /*.offload_k   =*/ true,
-        /*.offload_q   =*/ true,
+        /*.offload_kqv =*/ true,
     };
 
     return result;
@@ -8685,8 +8698,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.mul_mat_q      = params.mul_mat_q;
-    cparams.offload_k      = params.offload_k;
-    cparams.offload_v      = params.offload_v;
+    cparams.offload_kqv    = params.offload_kqv;
 
     cparams.n_ctx          = params.n_ctx          == 0    ? hparams.n_ctx_train          : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8724,7 +8736,7 @@ struct llama_context * llama_new_context_with_model(
 
     // reserve memory for context buffers
     if (!hparams.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_k, cparams.offload_v)) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
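
For reference, a minimal caller-side sketch of the consolidated flag (not part of the patch): after this change, the former offload_k/offload_v pair on llama_context_params is a single offload_kqv field. The sketch assumes the llama.h API of this period (llama_model_default_params, llama_load_model_from_file, llama_new_context_with_model); the model path and the layer count are placeholders.

// sketch.cpp -- toggling KV/KQV offload via the new offload_kqv flag
#include "llama.h"

int main() {
    llama_backend_init(/*numa =*/ false);

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 32;   // placeholder: offload weights for 32 layers

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx       = 2048;
    cparams.offload_kqv = false; // keep the KV cache and the KQV attention ops
                                 // on the CPU even though weights are offloaded

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
    }

    // ... evaluate tokens with llama_decode(), etc. ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}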