Skip to content

Commit 400dcce

Browse files
authored
Merge branch 'ggerganov:master' into master
2 parents 01f45e1 + 9ca4abe commit 400dcce

File tree

5 files changed

+25
-11
lines changed

5 files changed

+25
-11
lines changed

examples/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
554554
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
555555
fprintf(stdout, " -f FNAME, --file FNAME\n");
556556
fprintf(stdout, " prompt file to start generation.\n");
557-
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
557+
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
558558
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
559559
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
560560
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);

examples/console.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
#include <windows.h>
1111
#include <fcntl.h>
1212
#include <io.h>
13+
#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
14+
#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
15+
#endif
1316
#else
1417
#include <climits>
1518
#include <sys/ioctl.h>
@@ -68,9 +71,10 @@ namespace console {
6871
}
6972
}
7073
if (hConsole) {
71-
// Enable ANSI colors on Windows 10+
72-
if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
73-
SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
74+
// Check conditions combined to reduce nesting
75+
if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
76+
!SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
77+
advanced_display = false;
7478
}
7579
// Set console output codepage to UTF8
7680
SetConsoleOutputCP(CP_UTF8);

examples/main/README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,13 @@ The following options allow you to control the text generation process and fine-
160160

161161
### Number of Tokens to Predict
162162

163-
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
163+
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)
164164

165-
The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
165+
The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
166+
167+
A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.
168+
169+
If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
166170

167171
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
168172

examples/main/main.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,8 +431,12 @@ int main(int argc, char ** argv) {
431431
// - take the n_keep first tokens from the original prompt (via n_past)
432432
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
433433
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
434-
const int n_left = n_past - params.n_keep;
434+
if (params.n_predict == -2) {
435+
fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
436+
break;
437+
}
435438

439+
const int n_left = n_past - params.n_keep;
436440
// always keep the first token - BOS
437441
n_past = std::max(1, params.n_keep);
438442
n_past_guidance = std::max(1, params.n_keep + guidance_offset);

examples/server/server.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ struct llama_server_context
196196
llama_context *ctx = nullptr;
197197
gpt_params params;
198198

199+
grammar_parser::parse_state parsed_grammar;
199200
llama_grammar *grammar = nullptr;
200201

201202
bool truncated = false;
@@ -241,10 +242,13 @@ struct llama_server_context
241242
stopped_limit = false;
242243
stopping_word = "";
243244
multibyte_pending = 0;
244-
grammar = nullptr;
245-
246245
n_remain = 0;
247246
n_past = 0;
247+
248+
if (grammar != nullptr) {
249+
llama_grammar_free(grammar);
250+
grammar = nullptr;
251+
}
248252
}
249253

250254
bool loadModel(const gpt_params &params_)
@@ -265,8 +269,6 @@ struct llama_server_context
265269
bool loadGrammar()
266270
{
267271
if (!params.grammar.empty()) {
268-
grammar_parser::parse_state parsed_grammar;
269-
270272
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
271273
// will be empty (default) if there are parse errors
272274
if (parsed_grammar.rules.empty()) {

0 commit comments

Comments (0)