Skip to content

Commit 6f63d64

Browse files
authored
tokenize : add --show-count (token) option (#8299)
This commit adds a new option to the tokenize example, --show-count. When this is set, the total number of tokens is printed to stdout. This was added as an option because I was concerned that there might be scripts that use the output from this program, so it might be better not to print this information by default. The motivation for this is that it can be useful to find out how many tokens a file contains, for example when trying to determine prompt input file sizes for testing. Signed-off-by: Daniel Bevenius <[email protected]>
1 parent 51d2eba commit 6f63d64

File tree

1 file changed

+8
-0
lines changed

1 file changed

+8
-0
lines changed

examples/tokenize/tokenize.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
3030
fprintf(stream, " --stdin read prompt from standard input.\n");
3131
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
3232
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
33+
fprintf(stream, " --show-count print the total number of tokens.\n");
3334
}
3435

3536
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
195196
bool printing_ids = false;
196197
bool no_bos = false;
197198
bool disable_logging = false;
199+
bool show_token_count = false;
198200
const char * model_path = NULL;
199201
const char * prompt_path = NULL;
200202
const char * prompt_arg = NULL;
@@ -249,6 +251,9 @@ int main(int raw_argc, char ** raw_argv) {
249251
else if (arg == "--log-disable") {
250252
disable_logging = true;
251253
}
254+
else if (arg == "--show-count") {
255+
show_token_count = true;
256+
}
252257
else {
253258
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
254259
return 1;
@@ -384,6 +389,9 @@ int main(int raw_argc, char ** raw_argv) {
384389
printf("]\n");
385390
}
386391

392+
if (show_token_count) {
393+
// size() yields an unsigned size type (size_t, assuming tokens is a standard container — confirm at its declaration), so use %zu; %ld mismatches where long is narrower than size_t (e.g. 64-bit Windows).
printf("Total number of tokens: %zu\n", tokens.size());
394+
}
387395
// silence valgrind
388396
llama_free(ctx);
389397
llama_free_model(model);

0 commit comments

Comments
 (0)