#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD        "\x1b[1m"

- static const int EOS_TOKEN_ID = 2;

// determine number of model parts based on the dimension
static const std::map<int, int> LLAMA_N_PARTS = {
@@ -55,6 +54,8 @@ void sigint_handler(int signo) {
#endif

+ void process_interactive_input(llama_context& ctx, const gpt_params& params);
+
int main(int argc, char ** argv) {
    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();
@@ -85,15 +86,18 @@ int main(int argc, char ** argv) {
    // params.prompt = R"(// this function checks if the number n is prime
    // bool is_prime(int n) {)";

-     int64_t t_load_us = 0;
-
    // load the model
-     llama_context* ctx_ptr = llama_init_from_params(params);
+     llama_context* ctx_ptr = nullptr;
+     {
+         ctx_ptr = llama_init_from_params(params);
+         if (!ctx_ptr) {
+             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+             return 1;
+         }
+     }
+
    llama_context & ctx = *ctx_ptr;
-     gpt_vocab & vocab = llama_context_get_vocab(ctx);
-
-     // print system information
-     llama_print_context_info(ctx);
+     const gpt_vocab & vocab = llama_context_get_vocab(ctx);

    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');
@@ -109,8 +113,9 @@ int main(int argc, char ** argv) {
    }

    // tokenize the reverse prompt
-     std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.prompt);
+     std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.antiprompt);

+     // Setup interactive mode
    if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
@@ -146,94 +151,69 @@ int main(int argc, char ** argv) {
        is_interacting = true;
    }

-     bool input_noecho = false;
-
-     int remaining_tokens = params.n_predict;
-
    // set the color for the prompt which will be output initially
    if (params.use_color) {
        printf(ANSI_COLOR_YELLOW);
    }

-     if (!llama_ingest_input(ctx, params.prompt))
+     // Prepare the context with input
+     // Send "beginning of string"
+     llama_add_bos(ctx);
+
+     // load the input
+     llama_update_input(ctx, params.prompt);
+
+     llama_print_startup_stats(ctx);
+
+     if (!llama_prepare_context(ctx))
    {
-         fprintf(stderr, "Failed to ingest prompt\n");
+         fprintf(stderr, "%s: failed to prepare context\n", __func__);
        return 1;
-     };
-
-     // display text
-     input_noecho = false;
-     const std::vector<gpt_vocab::id>& embd = llama_context_get_embedding(ctx);
-     if (!input_noecho) {
-         for (auto id : embd) {
-             printf("%s", vocab.id_to_token[id].c_str());
-         }
-         fflush(stdout);
    }

-     if (!input_noecho && params.use_color) {
-         printf(ANSI_COLOR_RESET);
-     }
-
-     const std::vector<gpt_vocab::id>& last_n_tokens = llama_context_get_last_n_tokens(ctx);
-
-     while (llama_context_is_finished(ctx) != true) {
-         gpt_vocab::id model_output = 0;
-         bool response = llama_infer(ctx, model_output);
-         if (response) {
-             printf("%s", vocab.id_to_token[model_output].c_str());
-             fflush(stdout);
+     bool input_noecho   = false;
+     bool is_end_of_text = false;
+     while (llama_context_is_finished(ctx) == false) {
+         std::string model_output{};
+
+         if (llama_has_unconsumed_input(ctx)) {
+             llama_ingest_all_pending_input(ctx, !input_noecho);
+             // reset color to default if there is no pending user input
+             if (!input_noecho && params.use_color) {
+                 printf(ANSI_COLOR_RESET);
+             }
+         } else {
+             // Run inference if we don't have any pending input
+             llama_infer(ctx, model_output, is_end_of_text);
+             // print the single token output
+             printf("%s", model_output.c_str());
+             input_noecho = false;
        }

        // in interactive mode, and not currently processing queued inputs;
        // check if we should prompt the user for more
-         if (params.interactive) {
+         if (params.interactive && !llama_has_unconsumed_input(ctx)) {
            // check for reverse prompt
-             if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
+             if (antiprompt_inp.size() && llama_is_anti_prompt_present(ctx, antiprompt_inp)) {
                // reverse prompt found
                is_interacting = true;
            }
            if (is_interacting) {
                if (params.instruct) {
-                     input_consumed = embd_inp.size();
-                     embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+                     llama_update_input(ctx, "\n\n### Instruction:\n\n");

                    printf("\n> ");
                }

                // currently being interactive
-                 bool another_line = true;
-                 while (another_line) {
-                     fflush(stdout);
-                     char buf[256] = {0};
-                     int n_read;
-                     if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                     if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
-                         // presumably empty line, consume the newline
-                         std::ignore = scanf("%*c");
-                         n_read = 0;
-                     }
-                     if (params.use_color) printf(ANSI_COLOR_RESET);
-
-                     if (n_read > 0 && buf[n_read-1] == '\\') {
-                         another_line = true;
-                         buf[n_read-1] = '\n';
-                         buf[n_read] = 0;
-                     } else {
-                         another_line = false;
-                         buf[n_read] = '\n';
-                         buf[n_read+1] = 0;
-                     }
-                     // Do not clear existing context in interactive mode
-                     llama_update_context_with_prompt(ctx, buf, false);
-                 }
-
+                 process_interactive_input(ctx, params);
+                 input_noecho = true; // do not echo this input again
                is_interacting = false;
            }
        }

        // end of text token
-         if (embd.back() == EOS_TOKEN_ID) {
+         if (is_end_of_text) {
            if (params.interactive) {
                is_interacting = true;
            } else {
@@ -243,23 +223,58 @@ int main(int argc, char ** argv) {
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-         if (params.interactive && remaining_tokens <= 0) {
-             remaining_tokens = params.n_predict;
+         if (params.interactive && llama_context_is_finished(ctx)) {
+             llama_reset_remaining_tokens(ctx);
            is_interacting = true;
        }
    }

-     // report timing from context
+
+ #if defined (_WIN32)
+     signal(SIGINT, SIG_DFL);
+ #endif
+
+     // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();
        llama_print_end_stats(ctx);
        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }
-     llama_free_context(ctx_ptr);
+
+     llama_free_context(ctx_ptr);

    if (params.use_color) {
        printf(ANSI_COLOR_RESET);
    }
-
    return 0;
}
+
+ void process_interactive_input(llama_context& ctx, const gpt_params& params)
+ {
+     bool another_line = true;
+     while (another_line) {
+         fflush(stdout);
+         char buf[256] = {0};
+         int n_read;
+         if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
+         if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
+             // presumably empty line, consume the newline
+             std::ignore = scanf("%*c");
+             n_read = 0;
+         }
+         if (params.use_color) printf(ANSI_COLOR_RESET);
+
+         if (n_read > 0 && buf[n_read-1] == '\\') {
+             another_line = true;
+             buf[n_read-1] = '\n';
+             buf[n_read] = 0;
+         } else {
+             another_line = false;
+             buf[n_read] = '\n';
+             buf[n_read+1] = 0;
+         }
+
+         // Do not clear existing context in interactive mode
+         llama_update_input(ctx, buf);
+     }
+ }
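For reference, the input-reading behaviour added in process_interactive_input() above (read a line with scanf, treat a trailing backslash as a line continuation) can be exercised on its own. The sketch below is illustrative only and not part of this patch: it keeps the same scanf pattern but collects the text into a std::string instead of feeding a llama_context, so it compiles without any llama.cpp headers.

// Standalone sketch of the backslash line-continuation reading loop.
// Assumption: mirrors the scanf pattern used above; the std::string
// accumulation and final printf are added here purely for demonstration.
#include <cstdio>
#include <string>
#include <tuple>

int main() {
    std::string input;
    bool another_line = true;
    while (another_line) {
        fflush(stdout);
        char buf[257] = {0};   // one spare byte so buf[n_read+1] stays in bounds
        int n_read;
        if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
            // presumably an empty line: consume the newline and keep n_read at 0
            std::ignore = scanf("%*c");
            n_read = 0;
        }
        if (n_read > 0 && buf[n_read-1] == '\\') {
            // trailing backslash: replace it with a newline and read another line
            buf[n_read-1] = '\n';
            buf[n_read]   = 0;
        } else {
            another_line = false;
            buf[n_read]   = '\n';
            buf[n_read+1] = 0;
        }
        input += buf;
    }
    printf("collected input:\n%s", input.c_str());
    return 0;
}

Build with any C++11 compiler and end a typed line with a backslash to see the continuation path collect multiple physical lines into one input.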