Add an option to force the end-of-text token to appear even in interactive mode, and also show loading percentage #1058

Open · wants to merge 3 commits into master
8 changes: 8 additions & 0 deletions examples/common.cpp
@@ -199,6 +199,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.input_prefix = argv[i];
} else if (arg == "--forceendtoken") {
params.forceendtoken = true;
} else if (arg == "--eot_token") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.eot_token = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
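For reference, a hypothetical invocation exercising both new flags could look like this (the model path is a placeholder; the flag spellings come from the parsing code above):

./main -m ./models/7B/ggml-model-q4_0.bin -i --forceendtoken --eot_token "<|endoftext|>"

Note that --eot_token only changes the text printed when generation stops; it does not change which token id the model treats as end-of-sequence.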
2 changes: 2 additions & 0 deletions examples/common.h
@@ -36,6 +36,7 @@ struct gpt_params {

std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
std::string eot_token = "[end of text]"; // text printed when the end-of-text token is reached

bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
@@ -52,6 +53,7 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
bool forceendtoken = true; // force printing of the end-of-text token after generation
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
16 changes: 13 additions & 3 deletions examples/main/main.cpp
@@ -242,7 +242,8 @@ int main(int argc, char ** argv) {
" - Press Ctrl+C to interject at any time.\n"
#endif
" - Press Return to return control to LLaMa.\n"
" - If you want to submit another line, end your input in '\\'.\n\n");
" - If you want to submit another line, end your input in '\\'.\n"
"[model ready]\n");
is_interacting = params.interactive_start;
}

@@ -377,6 +378,9 @@ int main(int argc, char ** argv) {
is_antiprompt = true;
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
fflush(stdout);
if (params.forceendtoken) {
    fprintf(stderr, "%s\n", params.eot_token.c_str());
}
break;
}
}
@@ -459,17 +463,23 @@

// end of text token
if (!embd.empty() && embd.back() == llama_token_eos()) {
if (params.forceendtoken || !params.instruct) {
    fprintf(stderr, "%s\n", params.eot_token.c_str());
}
if (params.instruct) {
    is_interacting = true;
} else {
    break;
}
}

// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
n_remain = params.n_predict;
if (params.forceendtoken) {
    fprintf(stderr, "%s\n", params.eot_token.c_str());
}
is_interacting = true;
}
}
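A note on the printing added in the main.cpp hunks above: the end-of-text string is user-supplied via --eot_token, so it should never be passed to fprintf as the format argument, where any '%' in the token would be parsed as a conversion specifier. The hunks therefore use a fixed "%s\n" format and pass the token as data. A minimal sketch of the idiom:

#include <cstdio>
#include <string>

int main() {
    // User-controlled text: a '%' here must be treated as data, not as a
    // conversion specifier.
    std::string eot_token = "[end of text] (100% complete)";

    // Unsafe: fprintf(stderr, eot_token.c_str()) would try to read a
    // nonexistent argument for the '%' sequence.
    // Safe: keep the format string a fixed literal.
    fprintf(stderr, "%s\n", eot_token.c_str());
    return 0;
}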
3 changes: 2 additions & 1 deletion llama.cpp
@@ -931,6 +931,7 @@ static void llama_model_load_internal(

fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
fprintf(stderr, "[model loading]\n");
}

// create the ggml context
@@ -1711,7 +1712,7 @@ struct llama_context * llama_init_from_file(
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
++*cur_percentage_p;
fprintf(stderr, ".");
fprintf(stderr, "[porcentage] %u%%\n", *cur_percentage_p);
fflush(stderr);
if (percentage >= 100) {
fprintf(stderr, "\n");
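The progress-reporting change above replaces the old dot ticker with one "[percentage] N%" line per integer step. The enclosing callback in llama_init_from_file counts upward from the last printed value, so no step is skipped even when a single callback advances several percent at once. A self-contained sketch of that counting loop (the function name and the test values in main are illustrative only):

#include <cstdio>

// Mirrors the loop in llama_init_from_file's default progress callback:
// *cur_percentage_p remembers the last percentage already printed.
static void report_progress(float progress, unsigned * cur_percentage_p) {
    unsigned percentage = (unsigned) (100 * progress);
    while (percentage > *cur_percentage_p) {
        ++*cur_percentage_p;
        fprintf(stderr, "[percentage] %u%%\n", *cur_percentage_p);
        fflush(stderr);
    }
}

int main() {
    unsigned cur_percentage = 0;
    report_progress(0.40f, &cur_percentage); // prints 1% through 40%
    report_progress(0.41f, &cur_percentage); // prints 41%
    report_progress(1.00f, &cur_percentage); // prints 42% through 100%
    return 0;
}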