
Commit 2e92aef

Merge branch 'custom-attention-mask' into cam-cuda-2
2 parents: 4c0f243 + e1067ef

File tree

2 files changed: 11 additions & 8 deletions


examples/parallel/parallel.cpp

Lines changed: 2 additions & 1 deletion
@@ -326,9 +326,10 @@ int main(int argc, char ** argv) {
 
             const auto t_main_end = ggml_time_us();
 
-            LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, cache miss %d \033[0m: \n\nInput: %s\nResponse: %s\n\n",
+            LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
                     client.id, client.seq_id, client.n_prompt, client.n_decoded,
                     (t_main_end - client.t_start_prompt) / 1e6,
+                    (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                     n_cache_miss,
                     ::trim(client.input).c_str(),
                     ::trim(client.response).c_str());
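The new "speed %5.2f t/s" column is fed by the added argument: total tokens for the request (prompt plus generated) divided by the elapsed wall time, which ggml_time_us() reports in microseconds, hence the trailing * 1e6. Below is a minimal standalone sketch of the same arithmetic; the function name and the sample values are illustrative, not part of the patch.

// Throughput sketch: tokens per second from microsecond timestamps.
// Names are illustrative; only the formula mirrors the new LOG_TEE argument above.
#include <cstdint>
#include <cstdio>

static double tokens_per_second(int n_prompt, int n_decoded, int64_t t_start_us, int64_t t_end_us) {
    return (double) (n_prompt + n_decoded) / (t_end_us - t_start_us) * 1e6;
}

int main() {
    // 100 prompt tokens + 50 generated tokens in 2.5 s -> 60.00 t/s
    printf("%5.2f t/s\n", tokens_per_second(100, 50, 0, 2500000));
    return 0;
}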

llama.cpp

Lines changed: 9 additions & 7 deletions
@@ -1025,7 +1025,7 @@ struct llama_kv_cache {
     uint32_t size = 0;
 
     // computed before each graph build
-    uint32_t cell_max = 0;
+    uint32_t n = 0;
 
     std::vector<llama_kv_cell> cells;
 
@@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : std::max(1, (int)kv_self.cell_max);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
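The measure pass still builds the graph for the full context, while the regular path now reads kv_self.n directly; the old std::max(1, ...) guard is no longer needed because llama_decode_internal (last hunk below) already keeps kv_self.n at 32 or more. A simplified sketch of the selection, using an illustrative stand-in struct rather than the real llama_kv_cache:

// Simplified sketch of the n_kv choice above (illustrative, not the llama.cpp code).
#include <cstdint>

struct kv_cache_view {      // stand-in for the relevant llama_kv_cache fields
    uint32_t n    = 0;      // number of KV cells the graph should attend to
    uint32_t head = 0;      // where the next batch of tokens will be written
};

// During the allocator's measure pass the worst case (n_ctx) must be assumed;
// otherwise only the first kv.n cells of the cache are attended.
static int32_t pick_n_kv(bool is_measure, int32_t n_ctx, const kv_cache_view & kv) {
    return is_measure ? n_ctx : (int32_t) kv.n;
}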
@@ -3011,7 +3011,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3418,7 +3418,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3783,7 +3783,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     const float norm_eps = hparams.f_norm_eps;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     auto & buf_compute = lctx.buf_compute;
@@ -4115,8 +4115,10 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
-    //printf("kv_self.cell_max = %d\n", kv_self.cell_max);
+    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
+    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+
+    //printf("kv_self.n = %d\n", kv_self.n);
 
     ggml_allocr_reset(lctx.alloc);
 
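Here kv_self.n is recomputed before every graph build: llama_kv_cache_cell_max() gives the index just past the highest occupied cell, and clamping it to at least 32 avoids building the graph over a near-empty slice of the cache. The commented-out variant would additionally round up to a multiple of 32, which the TODO suggests might suit the CUDA kernels better. A small sketch of both variants, assuming GGML_PAD(x, n) rounds x up to the next multiple of n (pad_to below is a local stand-in, not the ggml macro):

// Sketch of the heuristic above. pad_to() is a local stand-in for GGML_PAD,
// assumed to round up to the next multiple; cell_max stands in for
// llama_kv_cache_cell_max(kv_self), i.e. one past the last used KV cell.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

static uint32_t pad_to(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

int main() {
    for (uint32_t cell_max : {0u, 5u, 33u, 100u}) {
        const uint32_t n_plain  = std::max(32u, cell_max);              // what the patch enables
        const uint32_t n_padded = std::max(32u, pad_to(cell_max, 32));  // the commented-out CUDA-leaning variant
        printf("cell_max = %3u -> n = %3u (padded: %3u)\n", cell_max, n_plain, n_padded);
    }
    return 0;
}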
