Commit c7fa90f

Varun Puri authored and facebook-github-bot committed
Add time to first token for llama runner
Summary:
Add time to first generated token & other timing features. Since we're measuring the time to first token, the token rate is reported both for the total inference time and for the net evaluation time (excluding prompt eval).

* Model Load Time - just a timer around ET_CHECK_OK_OR_RETURN_ERROR(load());
* Total inference time - immediately after model load until the end of the inference loop
  * First token time - from immediately after the model load until the first generated (not prompt) token is printed
    * Prompt eval - (comparable to llama.cpp prompt_eval_time) prompt array allocation and tokenization; ends right before the inference loop starts
  * Remaining tokens - immediately after the first token is output until the end of the inference loop
  * Net eval time - (comparable to llama.cpp eval_time) total time spent generating tokens

To implement:
* Sample time - amount of time spent sampling per token (present in llama.cpp)

Differential Revision: D54223564
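For orientation, here is a minimal standalone sketch (not part of the commit; the millisecond values are hypothetical) of how the reported metrics relate: the total-inference token rate includes prompt evaluation, while the net-evaluation rate, like llama.cpp's eval_time, excludes it.

// Illustration only: hypothetical millisecond values, mirroring the arithmetic
// the commit's printReport() uses to derive its rates.
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical measurements, in milliseconds.
  long prompt_eval = 120;       // tokenization + prompt processing
  long first_token = 150;       // from start until the first generated token
  long remaining_tokens = 1850; // from the first generated token to loop end
  long total = first_token + remaining_tokens; // roughly end - start
  int64_t num_generated_tokens = 40;

  // Net evaluation time excludes prompt evaluation (comparable to llama.cpp eval_time).
  double net_eval = (double)(first_token + remaining_tokens - prompt_eval);

  printf(
      "Total inference: %.3f s, %.2f tok/s\n",
      total / 1000.0,
      num_generated_tokens / (double)total * 1000);
  printf(
      "Net evaluation:  %.3f s, %.2f tok/s\n",
      net_eval / 1000.0,
      num_generated_tokens / net_eval * 1000);
  return 0;
}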
1 parent 0570294 commit c7fa90f

File tree

examples/models/llama2/runner/runner.cpp
examples/models/llama2/runner/runner.h

2 files changed: +62 -14 lines changed

examples/models/llama2/runner/runner.cpp

Lines changed: 49 additions & 14 deletions
@@ -161,8 +161,12 @@ Error Runner::generate(
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
+  timers_.model_load = util::time_in_ms();
   ET_CHECK_OK_OR_RETURN_ERROR(load());
+  timers_.model_load = util::time_in_ms() - timers_.model_load;

+  // First token time only measures the time it takes to encode the prompt and return a response token.
+  timers_.start = util::time_in_ms();
   shouldStop_ = false;

   // encode the (string) prompt into tokens sequence
@@ -173,12 +177,14 @@
   // Set the sequence length to the max seq length if not provided
   seq_len = (seq_len > 0 && seq_len <= max_seq_len_) ? seq_len : max_seq_len_;

+
   tokenizer_->encode(
       prompt.c_str(),
       n_bos_,
       append_eos_ ? n_eos_ : 0,
       prompt_tokens,
       &num_prompt_tokens);
+
   for (int i = 0; i < num_prompt_tokens; i++) {
     ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
   }
@@ -192,8 +198,6 @@
       "Sequence length exceeded - please increase the seq_len value passed to generate()");

   // start the main loop
-  long start =
-      0; // used to time our code, only initialized after first iteration
   int next; // will store the next token in the sequence
   int64_t pos = num_prompt_tokens - 1; // position in the sequence
   int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
@@ -255,6 +259,7 @@
           tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
     }
   }
+
   // create a 1xN int tensor with next as value
   while (pos < seq_len) {
     // ET_LOG(Info, "Generating step %d...", pos);
@@ -290,7 +295,12 @@
         outputs.size() > 0,
         "Expecting output to have at least one evalue. Got %zu",
         outputs.size());
-
+    if (pos == num_prompt_tokens) {
+      timers_.first_token = util::time_in_ms() - timers_.start;
+      timers_.remaining_tokens = util::time_in_ms();
+    } else if (pos == num_prompt_tokens - 1) {
+      timers_.prompt_eval = util::time_in_ms() - timers_.start;
+    }
     int32_t next_tok;
     exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor();

@@ -342,6 +352,7 @@
     if (pos >= num_prompt_tokens && next == eos_id_) {
       eos_counter++;
       if (eos_counter == n_eos_) {
+        printf("\n");
         ET_LOG(Info, "Reached to the end of generation");
         break;
       }
@@ -351,10 +362,6 @@

     token = next;

-    // init the timer here because the first iteration can be slower
-    if (start == 0) {
-      start = util::time_in_ms();
-    }
     if (use_kv_cache_) {
       // outputs: [k_cache, v_cache, logits, k_cache, v_cache]
       memcpy(
@@ -367,23 +374,51 @@
           v_data.size());
     }
   }
+  timers_.remaining_tokens = util::time_in_ms() - timers_.remaining_tokens;
+  timers_.end = util::time_in_ms();
   printf("\n");

   if (pos == seq_len) {
     ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
   }
-  // report achieved tok/s (pos-1 because the timer starts after first
-  // iteration)
-  if (pos >= 1) {
-    long end = util::time_in_ms();
-    ET_LOG(
-        Info, "Achieved tok/s: %f\n", (pos - 1) / (double)(end - start) * 1000);
-  }
+
+  printReport(num_prompt_tokens, pos - num_prompt_tokens);

   delete[] prompt_tokens;
   return Error::Ok;
 }

+void Runner::printReport(int64_t num_prompt_tokens, int64_t num_generated_tokens) { printf("\n");
+  double net_eval_time = (double)(timers_.first_token + timers_.remaining_tokens - timers_.prompt_eval);
+  ET_LOG(
+      Info,
+      "\tPrompt Tokens: %ld Generated Tokens: %ld", num_prompt_tokens, num_generated_tokens);
+  ET_LOG(
+      Info,
+      "\tModel Load Time:\t\t%f (seconds)",
+      ((double)(timers_.model_load) / 1000));
+  ET_LOG(
+      Info,
+      "\tTotal inference time:\t\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)", (double)(timers_.end - timers_.start) / 1000,
+      (num_generated_tokens) / (double)(timers_.end - timers_.start) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tTime to first token:\t%f (seconds)",
+      ((double)(timers_.first_token) / 1000));
+  ET_LOG(
+      Info,
+      "\t\t\tPrompt eval:\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)",
+      ((double)(timers_.prompt_eval) / 1000), (num_prompt_tokens) / (double)(timers_.prompt_eval) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tRemaining %ld tokens:\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)", num_generated_tokens - 1, (double)(timers_.remaining_tokens) / 1000,
+      (num_generated_tokens - 1) / (double)(timers_.remaining_tokens) * 1000);
+  ET_LOG(
+      Info,
+      "\t\tNet evaluation time:\t%f (seconds)\t\t Token Rate: \t%f (tokens/second)",
+      (net_eval_time / 1000), (num_generated_tokens) / net_eval_time * 1000);
+}
+
 void Runner::stop() {
   shouldStop_ = true;
 }
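A note on the pattern used above: each timers_ field doubles as a stopwatch, first holding a start timestamp from util::time_in_ms() and later being overwritten with the elapsed duration. A minimal self-contained sketch of that pattern (time_in_ms below is a stand-in, not the runner's util::time_in_ms):

#include <chrono>
#include <cstdio>
#include <thread>

// Stand-in for the runner's util::time_in_ms().
static long time_in_ms() {
  return static_cast<long>(
      std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count());
}

int main() {
  long model_load = time_in_ms(); // field temporarily holds the start timestamp
  std::this_thread::sleep_for(std::chrono::milliseconds(50)); // placeholder for load()
  model_load = time_in_ms() - model_load; // same field now holds the elapsed ms

  printf("Model Load Time: %f (seconds)\n", (double)model_load / 1000);
  return 0;
}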

examples/models/llama2/runner/runner.h

Lines changed: 13 additions & 0 deletions
@@ -63,6 +63,19 @@ class Runner {
   std::unique_ptr<Tokenizer> tokenizer_;
   std::unique_ptr<Sampler> sampler_;
   bool shouldStop_{false};
+
+  struct timers{
+    long start;
+    long model_load;
+    long prompt_eval;
+    long eval;
+    long first_token;
+    long remaining_tokens;
+    long end;
+
+  };
+  timers timers_;
+  void printReport(int64_t num_prompt_tokens, int64_t num_generated_tokens);
 };

 } // namespace torch::executor
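For readability, here is an annotated copy of the struct; the comments are an interpretation of the hunks above, not part of the commit. After generate() finishes, start and end hold absolute timestamps while the other assigned fields hold elapsed durations; eval is declared but not yet assigned in this change.

struct timers {
  long start;            // absolute timestamp taken right after load()
  long model_load;       // elapsed ms spent in load()
  long prompt_eval;      // elapsed ms from start until the prompt has been consumed
  long eval;             // declared but not assigned in this commit
  long first_token;      // elapsed ms from start until the first generated token
  long remaining_tokens; // elapsed ms from the first generated token to loop end
  long end;              // absolute timestamp taken after the generation loop
};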
