
Commit 05b0621

llama : more consistent names of count variables (ggml-org#5994)
* llama : more consistent names of count variables

ggml-ci

* llama : n_parallel -> n_seq_max
* common : fix param name
* examples : fix param name
1 parent 83796e6 commit 05b0621

8 files changed, +35 -34 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

-- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
+- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
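As a rough illustration of how callers might combine the two APIs mentioned in that README entry (this sketch is not part of the commit; the helper `clear_all_sequences` is hypothetical):

```cpp
#include "llama.h"

#include <cstdio>

// Hypothetical helper: drop every sequence from the KV cache of a context.
static void clear_all_sequences(llama_context * ctx) {
    // llama_n_seq_max() (formerly llama_n_max_seq()) is the upper limit of
    // acceptable seq_id values for this context
    const uint32_t n_seq_max = llama_n_seq_max(ctx);

    for (uint32_t s = 0; s < n_seq_max; ++s) {
        // since PR #5328, llama_kv_cache_seq_rm() returns a bool instead of void,
        // so a failed removal can be detected by the caller
        if (!llama_kv_cache_seq_rm(ctx, (llama_seq_id) s, -1, -1)) {
            fprintf(stderr, "failed to remove sequence %u from the KV cache\n", s);
        }
    }
}
```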

common/common.cpp

Lines changed: 9 additions & 9 deletions
@@ -1288,7 +1288,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

     cparams.n_ctx = params.n_ctx;
     cparams.n_batch = params.n_batch;
-    cparams.n_parallel = params.n_parallel;
+    cparams.n_seq_max = params.n_parallel;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;

@@ -1786,17 +1786,17 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;

-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
         int seq_count = 0;
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) { seq_count++; }
         }
         putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);

@@ -1809,14 +1809,14 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

     std::unordered_map<llama_seq_id, size_t> seqs;
     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;

-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        for (int j = 0; j < view.n_max_seq; j++) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }

@@ -1835,11 +1835,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {

     c_curr = view.cells;
     cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) {
                 const auto & it = seqs.find(cs_curr[j]);
                 putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
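For reference, a small sketch (not from this commit) of how these common.cpp dump helpers are typically driven after the rename; the wrapper name `debug_dump_kv_cache` is illustrative:

```cpp
#include "common.h"
#include "llama.h"

// Illustrative wrapper: inspect KV cache occupancy per sequence for debugging.
static void debug_dump_kv_cache(llama_context * ctx) {
    // size the view by the context's sequence limit (the renamed n_seq_max)
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, (int32_t) llama_n_seq_max(ctx));

    llama_kv_cache_view_update(ctx, &view); // refresh cells and cells_sequences
    dump_kv_cache_view_seqs(view, 40);      // print 40 cells per row

    llama_kv_cache_view_free(&view);
}
```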

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

     // ensure enough sequences are available
-    ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_parallel = n_parallel;
+    ctx_params.n_seq_max = n_parallel;
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
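A hedged sketch of the same pattern outside the example (the n_ctx/n_batch sizing here is an assumption for illustration, not taken from the commit):

```cpp
#include "llama.h"

#include <algorithm>

// Illustrative: create a context able to hold n_parallel independent sequences,
// each generating up to n_len tokens.
static llama_context * make_parallel_context(llama_model * model, int n_parallel, int n_len) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.n_ctx     = n_len * n_parallel;           // total KV cache size (assumed sizing)
    ctx_params.n_batch   = std::max(n_len, n_parallel);  // prompt processing batch size
    ctx_params.n_seq_max = n_parallel;                   // was ctx_params.n_parallel before this commit

    return llama_new_context_with_model(model, ctx_params);
}
```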

examples/main/main.cpp

Lines changed: 1 addition & 0 deletions
@@ -878,6 +878,7 @@ int main(int argc, char ** argv) {
             const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
             const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
             const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

             embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());

examples/perplexity/perplexity.cpp

Lines changed: 3 additions & 3 deletions
@@ -841,7 +841,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;

     const int max_tasks_per_batch = 32;
-    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

@@ -1118,7 +1118,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;

     const int max_tasks_per_batch = 128;
-    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
+    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

@@ -1470,7 +1470,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     const int n_batch = params.n_batch;

     const int max_tasks_per_batch = 32;
-    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
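The pattern in these hunks amounts to: never ask `llama_batch_init()` for more sequence slots than the context supports. A hedged sketch of that idea in isolation (function and variable names are illustrative):

```cpp
#include "llama.h"

#include <algorithm>

// Illustrative: allocate a batch whose per-token seq_id capacity is clamped to
// the context's sequence limit, mirroring the perplexity benchmarks above.
// The caller remains responsible for llama_batch_free().
static llama_batch make_clamped_batch(llama_context * ctx, int n_ctx, int max_tasks_per_batch) {
    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

    // n_tokens = n_ctx, no embeddings, up to max_seq sequence ids per token
    return llama_batch_init(n_ctx, 0, max_seq);
}
```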

llama.cpp

Lines changed: 12 additions & 12 deletions
@@ -12538,7 +12538,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
-        /*.n_parallel =*/ 1,
+        /*.n_seq_max =*/ 1,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,

@@ -12700,7 +12700,7 @@ struct llama_context * llama_new_context_with_model(
     auto & cparams = ctx->cparams;

     cparams.n_batch = params.n_batch;
-    // TODO: maybe add n_parallel here too
+    // TODO: maybe add n_seq_max here too
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;
     cparams.yarn_ext_factor = params.yarn_ext_factor;

@@ -12767,7 +12767,7 @@ struct llama_context * llama_new_context_with_model(
     // Mamba only needs a constant number of KV cache cells per sequence
     if (model->arch == LLM_ARCH_MAMBA) {
         // Mamba needs at least as many KV cells as there are sequences kept at any time
-        kv_size = std::max((uint32_t) 1, params.n_parallel);
+        kv_size = std::max((uint32_t) 1, params.n_seq_max);
         // it's probably best to keep as much precision as possible for the states
         type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
         type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states

@@ -13024,7 +13024,7 @@ uint32_t llama_n_batch(const struct llama_context * ctx) {
     return ctx->cparams.n_batch;
 }

-uint32_t llama_n_max_seq(const struct llama_context * ctx) {
+uint32_t llama_n_seq_max(const struct llama_context * ctx) {
     return ctx->kv_self.size;
 }

@@ -13188,10 +13188,10 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }

-struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
-        /*.n_max_seq = */ n_max_seq,
+        /*.n_seq_max = */ n_seq_max,
         /*.token_count = */ 0,
         /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
         /*.max_contiguous = */ 0,

@@ -13219,7 +13219,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
         void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
         view->cells = (struct llama_kv_cache_view_cell *)p;
-        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
         GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
         view->cells_sequences = (llama_seq_id *)p;
     }

@@ -13233,7 +13233,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
     uint32_t max_contig = 0;
     int32_t max_contig_idx = -1;

-    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
         const size_t curr_size = kv_cells[i].seq_id.size();
         token_count += curr_size;
         c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;

@@ -13250,7 +13250,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k

         int seq_idx = 0;
         for (const llama_seq_id it : kv_cells[i].seq_id) {
-            if (seq_idx >= view->n_max_seq) {
+            if (seq_idx >= view->n_seq_max) {
                 break;
             }
             cs_curr[seq_idx] = it;

@@ -13259,7 +13259,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
         if (seq_idx != 0) {
             used_cells++;
         }
-        for (; seq_idx < view->n_max_seq; seq_idx++) {
+        for (; seq_idx < view->n_seq_max; seq_idx++) {
             cs_curr[seq_idx] = -1;
         }
     }

@@ -13921,12 +13921,12 @@ int32_t llama_tokenize(
         const char * text,
         int32_t text_len,
         llama_token * tokens,
-        int32_t n_max_tokens,
+        int32_t n_tokens_max,
         bool add_bos,
         bool special) {
     auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);

-    if (n_max_tokens < (int) res.size()) {
+    if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }

llama.h

Lines changed: 7 additions & 7 deletions
@@ -235,7 +235,7 @@ extern "C" {
         uint32_t seed; // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
         uint32_t n_batch; // prompt processing maximum batch size
-        uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models)
+        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads; // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing

@@ -377,7 +377,7 @@ extern "C" {

     LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
-    LLAMA_API uint32_t llama_n_max_seq (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);

     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
     LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);

@@ -456,7 +456,7 @@ extern "C" {
         // Maximum number of sequences that can exist in a cell. It's not an error
         // if there are more sequences in a cell than this value, however they will
         // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
+        int32_t n_seq_max;

         // Number of tokens in the cache. For example, if there are two populated
         // cells, the first with 1 sequence id in it and the second with 2 sequence

@@ -476,12 +476,12 @@ extern "C" {
         // Information for an individual cell.
         struct llama_kv_cache_view_cell * cells;

-        // The sequences for each cell. There will be n_max_seq items per cell.
+        // The sequences for each cell. There will be n_seq_max items per cell.
         llama_seq_id * cells_sequences;
     };

     // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);

     // Free a KV cache view. (use only for debugging purposes)
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

@@ -708,7 +708,7 @@ extern "C" {

     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
-    /// @return Returns the number of tokens on success, no more than n_max_tokens
+    /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
     /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
     /// Does not insert a leading space.

@@ -717,7 +717,7 @@ extern "C" {
         const char * text,
         int32_t text_len,
         llama_token * tokens,
-        int32_t n_max_tokens,
+        int32_t n_tokens_max,
         bool add_bos,
         bool special);
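To make the `n_tokens_max` contract concrete, a hedged sketch of the usual two-pass calling pattern (the wrapper itself is not part of the API; only `llama_tokenize` is):

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Illustrative wrapper: tokenize text, growing the buffer if n_tokens_max was
// too small (a negative return is minus the required token count).
static std::vector<llama_token> tokenize(const llama_model * model, const std::string & text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));

    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(), add_bos, /*special =*/ false);
    if (n < 0) {
        tokens.resize(-n);
        n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(), add_bos, /*special =*/ false);
    }

    tokens.resize(n);
    return tokens;
}
```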
