@@ -1025,7 +1025,7 @@ struct llama_kv_cache {
     uint32_t size = 0;
 
     // computed before each graph build
-    uint32_t cell_max = 0;
+    uint32_t n = 0;
 
     std::vector<llama_kv_cell> cells;
 
@@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3007,7 +3007,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3410,7 +3410,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3771,7 +3771,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     const float norm_eps = hparams.f_norm_eps;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     auto & buf_compute = lctx.buf_compute;
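Note: the four graph builders above all make the same change: `n_kv`, the number of KV cache cells the graph attends over, now comes from `kv_self.n` instead of `kv_self.cell_max`, and falls back to the full `n_ctx` while the allocator is only measuring, so the measured graph (used to size the allocator's buffer) covers the largest attention span that can occur later. Below is a minimal sketch of the effect in plain C++, not the actual ggml graph code; `build_kq_mask`, `cache_pos`, and `token_pos` are illustrative names, not llama.cpp symbols.

#include <cmath>
#include <cstdint>
#include <vector>

// Build a causal attention mask over only the first n_kv cache cells instead of
// all n_ctx cells: tokens never look past the highest cell that is in use.
static std::vector<float> build_kq_mask(int32_t n_kv, int32_t n_tokens,
                                        const std::vector<int32_t> & cache_pos,  // pos of each KV cell, -1 if empty
                                        const std::vector<int32_t> & token_pos)  // pos of each token in the batch
{
    std::vector<float> mask(size_t(n_kv) * n_tokens, -INFINITY);
    for (int32_t t = 0; t < n_tokens; ++t) {
        for (int32_t c = 0; c < n_kv; ++c) {
            if (cache_pos[c] >= 0 && cache_pos[c] <= token_pos[t]) {
                mask[size_t(t)*n_kv + c] = 0.0f; // this token may attend to this cell
            }
        }
    }
    return mask;
}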
@@ -4102,8 +4102,10 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
-    // printf("kv_self.cell_max = %d\n", kv_self.cell_max);
+    // kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
+    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+
+    // printf("kv_self.n = %d\n", kv_self.n);
 
     ggml_allocr_reset(lctx.alloc);
 
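The comment block in the hunk above describes the heuristic: attend only as far into the cache as the highest cell that is actually in use, but never over fewer than 32 cells; the commented-out `GGML_PAD` variant would additionally round that span up to a multiple of 32, which the TODO suggests might suit CUDA better. Below is a minimal sketch of the idea under assumed cell semantics (a `pos` of -1 marks an empty cell); `kv_cell`, `cell_max_used`, and `choose_n` are illustrative, not the actual llama.cpp helpers.

#include <algorithm>
#include <cstdint>
#include <vector>

struct kv_cell {
    int32_t pos = -1; // -1 means the cell is empty
};

// index of the highest used cell + 1, i.e. how far into the cache attention must reach
static uint32_t cell_max_used(const std::vector<kv_cell> & cells) {
    for (uint32_t i = (uint32_t) cells.size(); i > 0; --i) {
        if (cells[i - 1].pos >= 0) {
            return i;
        }
    }
    return 0;
}

// clamp to at least 32 so the attention span never becomes degenerately small
static uint32_t choose_n(const std::vector<kv_cell> & cells) {
    return std::max<uint32_t>(32, cell_max_used(cells));
}

Compared to always attending over all `n_ctx` cells, this keeps early-generation graphs much smaller; as the cache fills up (or once it starts being defragmented, as the comment notes), the benefit shrinks.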