@@ -1025,7 +1025,7 @@ struct llama_kv_cache {
     uint32_t size = 0;
 
     // computed before each graph build
-    uint32_t cell_max = 0;
+    uint32_t n = 0;
 
     std::vector<llama_kv_cell> cells;
 
@@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : std::max(1, (int) kv_self.cell_max);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3011,7 +3011,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3418,7 +3418,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3783,7 +3783,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     const float norm_eps = hparams.f_norm_eps;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     auto & buf_compute = lctx.buf_compute;
@@ -4115,8 +4115,10 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
-    //printf("kv_self.cell_max = %d\n", kv_self.cell_max);
+    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
+    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+
+    //printf("kv_self.n = %d\n", kv_self.n);
 
     ggml_allocr_reset(lctx.alloc);
 
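
The renamed field caps attention to the used prefix of the cache instead of the full context. A minimal standalone sketch of that heuristic, using simplified, hypothetical types (kv_cache_sketch, cell_max_sketch) rather than the actual llama.cpp structures:

// Sketch of the "attend only to the used part of the KV cache" idea from this
// patch. The types and helper below are simplified stand-ins, not the real
// llama_kv_cache / llama_kv_cache_cell_max from llama.cpp.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct kv_cell_sketch {
    int32_t pos = -1; // -1 means the cell holds no token
};

struct kv_cache_sketch {
    uint32_t size = 0;                 // total number of cells (n_ctx)
    uint32_t n    = 0;                 // computed before each graph build
    std::vector<kv_cell_sketch> cells;
};

// index one past the last occupied cell (0 if the cache is empty)
static uint32_t cell_max_sketch(const kv_cache_sketch & cache) {
    for (uint32_t i = cache.size; i > 0; --i) {
        if (cache.cells[i - 1].pos >= 0) {
            return i;
        }
    }
    return 0;
}

int main() {
    kv_cache_sketch cache;
    cache.size = 512;
    cache.cells.resize(cache.size);

    // pretend the first 100 cells are in use
    for (uint32_t i = 0; i < 100; ++i) {
        cache.cells[i].pos = (int32_t) i;
    }

    // same clamp as the patch: never build the graph for fewer than 32 KV positions
    cache.n = std::max(32u, cell_max_sketch(cache));

    printf("attend over %u of %u cells\n", cache.n, cache.size);
    return 0;
}

The std::max(32, ...) clamp mirrors the new line in llama_decode_internal; the commented-out GGML_PAD variant in the patch would instead round the count up to a multiple of 32, which the TODO suggests might suit CUDA better.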