@@ -55,25 +55,6 @@ void sigint_handler(int signo) {
 }
 #endif
 
-const char * llama_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "FMA = "       + std::to_string(ggml_cpu_has_fma())       + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "ARM_FMA = "   + std::to_string(ggml_cpu_has_arm_fma())   + " | ";
-    s += "F16C = "      + std::to_string(ggml_cpu_has_f16c())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-    s += "SSE3 = "      + std::to_string(ggml_cpu_has_sse3())      + " | ";
-    s += "VSX = "       + std::to_string(ggml_cpu_has_vsx())       + " | ";
-
-    return s.c_str();
-}
 
 int main(int argc, char ** argv) {
     ggml_time_init();
@@ -107,41 +88,18 @@ int main(int argc, char ** argv) {
 
     int64_t t_load_us = 0;
 
-    gpt_vocab vocab;
-    llama_model model;
-
     // load the model
-    {
-        const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
-        const int64_t t_start_us = ggml_time_us();
-        if (!llama_model_load(params.model, model, vocab, params.n_ctx, memory_type)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-
-        t_load_us = ggml_time_us() - t_start_us;
-    }
+    llama_context* ctx_ptr = llama_init_from_params(params);
+    llama_context & ctx = *ctx_ptr;
+    gpt_vocab & vocab = llama_context_get_vocab(ctx);
 
     // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
-    }
-
-    int n_past = 0;
-
-    int64_t t_sample_us  = 0;
-    int64_t t_predict_us = 0;
-
-    std::vector<float> logits;
+    llama_print_context_info(ctx);
 
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
-
-    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+    std::vector<gpt_vocab::id> embd_inp = llama_tokenize_text(ctx, params.prompt);
 
     // prefix & suffix for instruct mode
     const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
@@ -154,24 +112,8 @@ int main(int argc, char ** argv) {
     }
 
     // tokenize the reverse prompt
-    std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
-
-    for (auto antiprompt : params.antiprompt) {
-        antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
-    }
-
-    // enable interactive mode if reverse prompt is specified
-    if (!antipromptv_inp.size()) {
-        params.interactive = true;
-    }
+    std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.prompt);
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-    for (int i = 0; i < (int) embd_inp.size(); i++) {
-        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
-    }
-    fprintf(stderr, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -200,16 +142,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
     fprintf(stderr, "\n\n");
 
-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-    int last_n_size = params.repeat_last_n;
-    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
     if (params.interactive) {
         fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -220,7 +152,6 @@ int main(int argc, char ** argv) {
         is_interacting = true;
     }
 
-    int input_consumed = 0;
     bool input_noecho = false;
 
     int remaining_tokens = params.n_predict;
@@ -230,85 +161,44 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
-    while (remaining_tokens > 0 || params.interactive) {
-        // predict
-        if (embd.size() > 0) {
-            const int64_t t_start_us = ggml_time_us();
-
-            if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                fprintf(stderr, "Failed to predict\n");
-                return 1;
-            }
-
-            t_predict_us += ggml_time_us() - t_start_us;
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        if (embd_inp.size() <= input_consumed) {
-            // out of user input, sample next token
-            const float top_k = params.top_k;
-            const float top_p = params.top_p;
-            const float temp  = params.temp;
-            const float repeat_penalty = params.repeat_penalty;
-
-            const int n_vocab = model.hparams.n_vocab;
-
-            gpt_vocab::id id = 0;
-
-            {
-                const int64_t t_start_sample_us = ggml_time_us();
-
-                if (params.ignore_eos) {
-                    // set the logit of the eos token to zero to avoid sampling it
-                    logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
-                }
-
-                id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(id);
+    if (!llama_injest_input(ctx, params.prompt))
+    {
+        fprintf(stderr, "Failed to injest prompt\n");
+        return 1;
+    };
 
-                t_sample_us += ggml_time_us() - t_start_sample_us;
-            }
+    // display text
+    input_noecho = false;
+    const std::vector<gpt_vocab::id>& embd = llama_context_get_embd(ctx);
+    if (!input_noecho) {
+        for (auto id : embd) {
+            printf("%s", vocab.id_to_token[id].c_str());
+        }
+        fflush(stdout);
+    }
 
-            // add it to the context
-            embd.push_back(id);
+    if (!input_noecho && params.use_color) {
+        printf(ANSI_COLOR_RESET);
+    }
 
-            // echo this to console
-            input_noecho = false;
+    const std::vector<gpt_vocab::id>& last_n_tokens = llama_context_get_last_n_tokens(ctx);
 
-            // decrement remaining sampling budget
-            --remaining_tokens;
-        } else {
-            // some user input remains from prompt or interaction, forward it to processing
-            while (embd_inp.size() > input_consumed) {
-                embd.push_back(embd_inp[input_consumed]);
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[input_consumed]);
-                ++input_consumed;
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-        }
-
-        // display text
-        if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
-            }
+    while (llama_context_not_finished(ctx) > 0) {
+        gpt_vocab::id model_output = 0;
+        bool response = llama_inference(ctx, model_output);
+        if (response) {
+            printf("%s", vocab.id_to_token[model_output].c_str());
             fflush(stdout);
         }
         // reset color to default if we there is no pending user input
         if (!input_noecho && params.use_color && (int)embd_inp.size() == input_consumed) {
            printf(ANSI_COLOR_RESET);
         }
+
         // in interactive mode, and not currently processing queued inputs;
         // check if we should prompt the user for more
-        if (params.interactive && embd_inp.size() <= input_consumed) {
+        if (params.interactive) {
            // check for reverse prompt
            for (auto antiprompt_inp : antipromptv_inp) {
                if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
@@ -337,15 +227,8 @@ int main(int argc, char ** argv) {
                 } else {
                     line.pop_back(); // Remove the continue character
                 }
-                buffer += line + '\n'; // Append the line to the result
-            } while (another_line);
-            if (params.use_color) printf(ANSI_COLOR_RESET);
-
-            std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
-            embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-
-            if (params.instruct) {
-                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                // Do not clear existing context in interactive mode
+                llama_init_context_with_prompt(ctx, buf, false);
             }
 
             remaining_tokens -= line_inp.size();
@@ -371,24 +254,14 @@ int main(int argc, char ** argv) {
             is_interacting = true;
         }
     }
-
-#if defined (_WIN32)
-    signal(SIGINT, SIG_DFL);
-#endif
-
-    // report timing
+
+    // report timing from context
     {
         const int64_t t_main_end_us = ggml_time_us();
-
-        fprintf(stderr, "\n\n");
-        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        fprintf(stderr, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        llama_print_end_stats(ctx);
         fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
-
-    ggml_free(model.ctx);
+    llama_free_context(ctx_ptr);
 
     if (params.use_color) {
         printf(ANSI_COLOR_RESET);
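
For orientation, a minimal sketch of how the context-style API introduced by this diff is meant to be driven end to end. The function names (llama_init_from_params, llama_context_get_vocab, llama_injest_input, llama_context_not_finished, llama_inference, llama_print_end_stats, llama_free_context) are taken verbatim from the added lines above; the header name and the exact declarations are assumptions made only for illustration, not the definitive interface of this change.

    // Sketch only: assumes a header from this change (name assumed here as
    // "llama.h") declares llama_context and the functions used in the diff above.
    #include "llama.h"

    #include <cstdio>

    int run_prompt(const gpt_params & params) {
        // Model, vocab, KV cache and sampling state all live behind one context.
        llama_context * ctx_ptr = llama_init_from_params(params);
        if (ctx_ptr == nullptr) {
            fprintf(stderr, "failed to initialize llama context\n");
            return 1;
        }
        llama_context & ctx   = *ctx_ptr;
        gpt_vocab     & vocab = llama_context_get_vocab(ctx);

        // Feed the prompt; the context tracks consumed input internally,
        // replacing the embd/embd_inp/input_consumed bookkeeping removed from main().
        if (!llama_injest_input(ctx, params.prompt)) {
            fprintf(stderr, "failed to ingest prompt\n");
            llama_free_context(ctx_ptr);
            return 1;
        }

        // Pull one sampled token per call until the context reports completion.
        while (llama_context_not_finished(ctx) > 0) {
            gpt_vocab::id id = 0;
            if (llama_inference(ctx, id)) {
                printf("%s", vocab.id_to_token[id].c_str());
                fflush(stdout);
            }
        }

        llama_print_end_stats(ctx);   // timing is now reported by the context
        llama_free_context(ctx_ptr);  // frees the underlying ggml/model state
        return 0;
    }

The point of the refactor is visible here: n_past, logits, last_n_tokens and the timing counters no longer live in main(); they are owned by llama_context and reached through accessors such as llama_context_get_last_n_tokens.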