@@ -161,8 +161,14 @@ Error Runner::generate(
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
+  timers_.model_load = util::time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(load());
+  timers_.model_load = util::time_in_ms() - timers_.model_load;

+  // First token time only measures the time it takes to encode the prompt and
+  // return a response token.
+
+  timers_.start = util::time_in_ms();
   shouldStop_ = false;

   // encode the (string) prompt into tokens sequence
@@ -173,12 +179,14 @@ Error Runner::generate(
   // Set the sequence length to the max seq length if not provided
   seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_;

+
   tokenizer_->encode(
       prompt.c_str(),
       n_bos_,
       append_eos_ ? n_eos_ : 0,
       prompt_tokens,
       &num_prompt_tokens);
+
   for (int i = 0; i < num_prompt_tokens; i++) {
     ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
   }
@@ -192,8 +200,6 @@ Error Runner::generate(
       "Sequence length exceeded - please increase the seq_len value passed to generate()");

   // start the main loop
-  long start =
-      0; // used to time our code, only initialized after first iteration
   int next; // will store the next token in the sequence
   int64_t pos = num_prompt_tokens - 1; // position in the sequence
   int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
@@ -255,6 +261,7 @@ Error Runner::generate(
           tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
     }
   }
+
   // create a 1xN int tensor with next as value
   while (pos < seq_len) {
     // ET_LOG(Info, "Generating step %d...", pos);
@@ -290,7 +297,12 @@ Error Runner::generate(
         outputs.size() > 0,
         "Expecting output to have at least one evalue. Got %zu",
         outputs.size());
-
+    if (pos == num_prompt_tokens) {
+      timers_.first_token = util::time_in_ms() - timers_.start;
+      timers_.remaining_tokens = util::time_in_ms();
+    } else if (pos == num_prompt_tokens - 1) {
+      timers_.prompt_eval = util::time_in_ms() - timers_.start;
+    }
     int32_t next_tok;
     exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor();
@@ -342,6 +354,7 @@ Error Runner::generate(
     if (pos >= num_prompt_tokens && next == eos_id_) {
       eos_counter++;
       if (eos_counter == n_eos_) {
+        printf("\n");
         ET_LOG(Info, "Reached to the end of generation");
         break;
       }
@@ -351,10 +364,6 @@ Error Runner::generate(

     token = next;

-    // init the timer here because the first iteration can be slower
-    if (start == 0) {
-      start = util::time_in_ms();
-    }
     if (use_kv_cache_) {
       // outputs: [k_cache, v_cache, logits, k_cache, v_cache]
       memcpy(
@@ -367,23 +376,68 @@ Error Runner::generate(
           v_data.size());
     }
   }
+  timers_.remaining_tokens = util::time_in_ms() - timers_.remaining_tokens;
+  timers_.end = util::time_in_ms();
   printf("\n");

   if (pos == seq_len) {
     ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
   }
-  // report achieved tok/s (pos-1 because the timer starts after first
-  // iteration)
-  if (pos >= 1) {
-    long end = util::time_in_ms();
-    ET_LOG(
-        Info, "Achieved tok/s: %f\n", (pos - 1) / (double)(end - start) * 1000);
-  }
+
+  printReport(num_prompt_tokens, pos - num_prompt_tokens);

   delete[] prompt_tokens;
   return Error::Ok;
 }

+void Runner::printReport(
+    int64_t num_prompt_tokens,
+    int64_t num_generated_tokens) {
+  printf("\n");
+  double net_eval_time = (double)(
+      timers_.first_token + timers_.remaining_tokens - timers_.prompt_eval);
+
+  ET_LOG(
+      Info,
+      "\tPrompt Tokens: %ld Generated Tokens: %ld",
+      num_prompt_tokens,
+      num_generated_tokens);
+
+  ET_LOG(
+      Info,
+      "\tModel Load Time:\t\t%f (seconds)",
+      ((double)(timers_.model_load) / 1000));
+  ET_LOG(
+      Info,
+      "\tTotal inference time:\t\t%f (seconds)\t\tToken Rate:\t%f (tokens/second)",
+      (double)(timers_.end - timers_.start) / 1000,
+      (num_generated_tokens) / (double)(timers_.end - timers_.start) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tTime to first token:\t%f (seconds)",
+      ((double)(timers_.first_token) / 1000));
+  ET_LOG(
+      Info,
+      "\t\t\tPrompt eval:\t%f (seconds)\t\tToken Rate:\t%f (tokens/second)",
+      ((double)(timers_.prompt_eval) / 1000),
+      (num_prompt_tokens) / (double)(timers_.prompt_eval) * 1000);
+
+  ET_LOG(
+      Info,
+      "\t\tRemaining %ld tokens:\t%f (seconds)\t\tToken Rate:\t%f (tokens/second)",
+      num_generated_tokens - 1,
+      (double)(timers_.remaining_tokens) / 1000,
+      (num_generated_tokens - 1) / (double)(timers_.remaining_tokens) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tNet evaluation time:\t%f (seconds)\t\tToken Rate:\t%f (tokens/second)",
+      (net_eval_time / 1000),
+      (num_generated_tokens) / net_eval_time * 1000);
+}
+
 void Runner::stop() {
   shouldStop_ = true;
 }
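
For reference: the diff reads a `timers_` member and the existing `util::time_in_ms()` helper, both presumably defined elsewhere in the commit. A minimal self-contained sketch of what they might look like, assuming millisecond wall-clock values throughout (the struct name, layout, and field comments are inferred from how the fields are used above, not shown in this diff):

#include <chrono>

namespace util {
// Monotonic wall-clock time in milliseconds; the diff computes every
// duration as the difference between two calls to a helper like this.
inline long time_in_ms() {
  return static_cast<long>(
      std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count());
}
} // namespace util

// Hypothetical shape of Runner::timers_. Each field holds either a start
// timestamp or an accumulated duration, both in milliseconds.
struct Timers {
  long model_load = 0;       // time spent inside load()
  long start = 0;            // timestamp taken just before prompt encoding
  long prompt_eval = 0;      // start -> prompt fully evaluated (prefill)
  long first_token = 0;      // start -> first generated token sampled
  long remaining_tokens = 0; // first token -> end of the generation loop
  long end = 0;              // timestamp when the generation loop exits
};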
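As a worked check of the report arithmetic (all numbers hypothetical): suppose timers_.start = 0, the prompt finishes evaluating at 180 ms (prompt_eval = 180), the first token is sampled at 210 ms (first_token = 210), and generation ends at end = 2210 ms with 50 generated tokens, so remaining_tokens = 2210 - 210 = 2000 ms. Total inference time is 2.21 s, a rate of 50 / 2.21 = 22.6 tokens/s. Net evaluation time is first_token + remaining_tokens - prompt_eval = 210 + 2000 - 180 = 2030 ms, giving 50 / 2.03 = 24.6 tokens/s, and the remaining 49 tokens decode at 49 / 2.0 = 24.5 tokens/s. Note that first_token already includes prompt evaluation, which is why prompt_eval is subtracted once, and why the "Remaining" row counts num_generated_tokens - 1: its window opens only after the first token has been sampled.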