
Commit 095524e

Varun Puri authored and facebook-github-bot committed
Add time to first token for llama runner (#2141)
Summary: Add time to first generated token & other timing features. Since we're measuring the first token time, the token rate is reported both including and excluding the first generated token.

* Model Load Time - just a timer around ET_CHECK_OK_OR_RETURN_ERROR(load());
* Total inference time - immediately after model load until the end of the inference loop
  * First token time - from immediately after model load until the first generated (not prompt) token is printed
    * Prompt eval - (comparable to llama.cpp prompt_eval_time) prompt array allocation and tokenization; ends right before the inference loop starts
  * Remaining tokens - from immediately after the first token is output until the end of the inference loop
  * Net eval time - (comparable to llama.cpp eval_time) total time spent generating tokens

To implement:
* Sample time - amount of time spent sampling per token (present in llama.cpp)

Differential Revision: D54223564
1 parent 0570294 commit 095524e
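For orientation, here is a minimal, self-contained sketch (not part of this commit) of how the report derives its token rates from the new timers. The Timers struct, the std::chrono-based time_in_ms(), and the sleep durations are illustrative stand-ins for the runner's util::time_in_ms() and real work; the arithmetic mirrors printReport in the runner.cpp diff below.

// Standalone illustration of the timing milestones and derived rates.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

static long time_in_ms() {
  // Stand-in for util::time_in_ms() in the runner.
  return static_cast<long>(
      std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count());
}

struct Timers {
  long start;            // set right after model load, before tokenization
  long model_load;       // duration of load()
  long prompt_eval;      // start -> logits for the last prompt token
  long first_token;      // start -> first generated token
  long remaining_tokens; // first generated token -> end of inference loop
  long end;              // set when the inference loop exits
};

int main() {
  Timers t{};
  int64_t num_prompt_tokens = 8;     // illustrative values
  int64_t num_generated_tokens = 32;

  // Simulate the milestones that generate() records.
  t.model_load = time_in_ms();
  std::this_thread::sleep_for(std::chrono::milliseconds(50)); // "load()"
  t.model_load = time_in_ms() - t.model_load;

  t.start = time_in_ms();
  std::this_thread::sleep_for(std::chrono::milliseconds(30)); // prompt prefill
  t.prompt_eval = time_in_ms() - t.start;
  std::this_thread::sleep_for(std::chrono::milliseconds(10)); // first token
  t.first_token = time_in_ms() - t.start;
  t.remaining_tokens = time_in_ms();
  std::this_thread::sleep_for(std::chrono::milliseconds(200)); // rest of loop
  t.remaining_tokens = time_in_ms() - t.remaining_tokens;
  t.end = time_in_ms();

  // Same arithmetic as printReport: net eval time excludes prompt processing.
  double net_eval_time =
      (double)(t.first_token + t.remaining_tokens - t.prompt_eval);

  printf("Total rate (incl. first token): %f tok/s\n",
         num_generated_tokens / (double)(t.end - t.start) * 1000);
  printf("Remaining-token rate (excl. first token): %f tok/s\n",
         (num_generated_tokens - 1) / (double)(t.remaining_tokens) * 1000);
  printf("Net eval rate: %f tok/s\n",
         num_generated_tokens / net_eval_time * 1000);
  return 0;
}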

2 files changed: +82 −14 lines changed


examples/models/llama2/runner/runner.cpp

Lines changed: 68 additions & 14 deletions
@@ -161,8 +161,14 @@ Error Runner::generate(
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
+  timers_.model_load = util::time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(load());
+  timers_.model_load = util::time_in_ms() - timers_.model_load;
 
+  // First token time only measures the time it takes to encode the prompt and
+  // return a response token.
+
+  timers_.start = util::time_in_ms();
   shouldStop_ = false;
 
   // encode the (string) prompt into tokens sequence
@@ -173,12 +179,14 @@ Error Runner::generate(
   // Set the sequence length to the max seq length if not provided
   seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_;
 
+
   tokenizer_->encode(
       prompt.c_str(),
       n_bos_,
       append_eos_ ? n_eos_ : 0,
       prompt_tokens,
       &num_prompt_tokens);
+
   for (int i = 0; i < num_prompt_tokens; i++) {
     ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
   }
@@ -192,8 +200,6 @@ Error Runner::generate(
       "Sequence length exceeded - please increase the seq_len value passed to generate()");
 
   // start the main loop
-  long start =
-      0; // used to time our code, only initialized after first iteration
   int next; // will store the next token in the sequence
   int64_t pos = num_prompt_tokens - 1; // position in the sequence
   int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
@@ -255,6 +261,7 @@ Error Runner::generate(
           tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
     }
   }
+
   // create a 1xN int tensor with next as value
   while (pos < seq_len) {
     // ET_LOG(Info, "Generating step %d...", pos);
@@ -290,7 +297,12 @@ Error Runner::generate(
         outputs.size() > 0,
         "Expecting output to have at least one evalue. Got %zu",
         outputs.size());
-
+    if (pos == num_prompt_tokens) {
+      timers_.first_token = util::time_in_ms() - timers_.start;
+      timers_.remaining_tokens = util::time_in_ms();
+    } else if (pos == num_prompt_tokens - 1) {
+      timers_.prompt_eval = util::time_in_ms() - timers_.start;
+    }
     int32_t next_tok;
     exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor();
 
@@ -342,6 +354,7 @@ Error Runner::generate(
     if (pos >= num_prompt_tokens && next == eos_id_) {
       eos_counter++;
       if (eos_counter == n_eos_) {
+        printf("\n");
         ET_LOG(Info, "Reached to the end of generation");
         break;
       }
@@ -351,10 +364,6 @@ Error Runner::generate(
 
     token = next;
 
-    // init the timer here because the first iteration can be slower
-    if (start == 0) {
-      start = util::time_in_ms();
-    }
     if (use_kv_cache_) {
       // outputs: [k_cache, v_cache, logits, k_cache, v_cache]
       memcpy(
@@ -367,23 +376,68 @@ Error Runner::generate(
           v_data.size());
     }
   }
+  timers_.remaining_tokens = util::time_in_ms() - timers_.remaining_tokens;
+  timers_.end = util::time_in_ms();
   printf("\n");
 
   if (pos == seq_len) {
     ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
   }
-  // report achieved tok/s (pos-1 because the timer starts after first
-  // iteration)
-  if (pos >= 1) {
-    long end = util::time_in_ms();
-    ET_LOG(
-        Info, "Achieved tok/s: %f\n", (pos - 1) / (double)(end - start) * 1000);
-  }
+
+  printReport(num_prompt_tokens, pos - num_prompt_tokens);
 
   delete[] prompt_tokens;
   return Error::Ok;
 }
 
+void Runner::printReport(
+    int64_t num_prompt_tokens,
+    int64_t num_generated_tokens) {
+  printf("\n");
+  double net_eval_time =
+      (double)(timers_.first_token + timers_.remaining_tokens - timers_.prompt_eval);
+
+  ET_LOG(
+      Info,
+      "\tPrompt Tokens: %ld Generated Tokens: %ld",
+      num_prompt_tokens,
+      num_generated_tokens);
+
+  ET_LOG(
+      Info,
+      "\tModel Load Time:\t\t%f (seconds)",
+      ((double)(timers_.model_load) / 1000));
+  ET_LOG(
+      Info,
+      "\tTotal inference time:\t\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)",
+      (double)(timers_.end - timers_.start) / 1000,
+      (num_generated_tokens) / (double)(timers_.end - timers_.start) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tTime to first token:\t%f (seconds)",
+      ((double)(timers_.first_token) / 1000));
+  ET_LOG(
+      Info,
+      "\t\t\tPrompt eval:\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)",
+      ((double)(timers_.prompt_eval) / 1000),
+      (num_prompt_tokens) / (double)(timers_.prompt_eval) * 1000);
+
+  ET_LOG(
+      Info,
+      "\t\tRemaining %ld tokens:\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)",
+      num_generated_tokens - 1,
+      (double)(timers_.remaining_tokens) / 1000,
+      (num_generated_tokens - 1) / (double)(timers_.remaining_tokens) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tNet evaluation time:\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)",
+      (net_eval_time / 1000),
+      (num_generated_tokens) / net_eval_time * 1000);
+}
+
 void Runner::stop() {
   shouldStop_ = true;
 }

examples/models/llama2/runner/runner.h

Lines changed: 14 additions & 0 deletions
@@ -63,6 +63,20 @@ class Runner {
   std::unique_ptr<Tokenizer> tokenizer_;
   std::unique_ptr<Sampler> sampler_;
   bool shouldStop_{false};
+
+  struct timers {
+    long start;
+    long model_load;
+    long prompt_eval;
+    long eval;
+    long first_token;
+    long remaining_tokens;
+    long end;
+  };
+
+  timers timers_;
+  void printReport(int64_t num_prompt_tokens, int64_t num_generated_tokens);
 };
 
 } // namespace torch::executor

0 commit comments
