Commit 01d12bf

fix comments
1 parent 734cf8b commit 01d12bf

1 file changed: +30 −30 lines changed

examples/models/llama2/runner/runner.h

Lines changed: 30 additions & 30 deletions
@@ -39,6 +39,36 @@ class Runner {
       std::function<void(const std::string&)> callback = {});
   void stop();

+  struct TimeStamps {
+    // Scaling factor for timestamps - in this case, we use ms.
+    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
+    // Time stamps for the different stages of the execution
+    // model_load_start_ms: Start of model loading.
+    long model_load_start_ms;
+    // model_load_end_ms: End of model loading.
+    long model_load_end_ms;
+    // inference_start_ms: Immediately after the model is loaded (or we check
+    // for model load), measure the inference time.
+    long inference_start_ms;
+    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
+    // before the inference loop starts
+    long prompt_eval_end_ms;
+    // first_token: Timestamp when the first generated token is emitted
+    long first_token_ms;
+    // inference_end_ms: End of inference/generation.
+    long inference_end_ms;
+    // Keep a running total of the time spent in sampling.
+    long aggregate_sampling_time_ms;
+
+    void printReport(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+    const std::string toJsonString(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+  };
+  TimeStamps timers_;
+
  private:
   // metadata
   template <typename T>
@@ -69,36 +99,6 @@ class Runner {
   std::unique_ptr<Sampler> sampler_;
   bool shouldStop_{false};

- public:
-  struct TimeStamps {
-    // Scaling factor for timestamps - in this case, we use ms.
-    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
-    // Time stamps for the different stages of the execution
-    // model_load_start_ms: Start of model loading.
-    long model_load_start_ms;
-    // model_load_end_ms: End of model loading.
-    long model_load_end_ms;
-    // inference_start_ms: Immediately after the model is loaded (or we check
-    // for model load), measure the inference time.
-    long inference_start_ms;
-    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
-    // before the inference loop starts
-    long prompt_eval_end_ms;
-    // first_token: Timestamp when the first generated token is emitted
-    long first_token_ms;
-    // inference_end_ms: End of inference/generation.
-    long inference_end_ms;
-    // Keep a running total of the time spent in sampling.
-    long aggregate_sampling_time_ms;
-
-    void printReport(
-        const int64_t& num_prompt_tokens,
-        const int64_t& num_generated_tokens);
-    const std::string toJsonString(
-        const int64_t& num_prompt_tokens,
-        const int64_t& num_generated_tokens);
-  };
-  TimeStamps timers_;
-
 };

 } // namespace torch::executor
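
For reference, below is a minimal, self-contained sketch of how these timestamp fields could be recorded around a load/tokenize/generate sequence and then reported. The now_ms() helper, the simplified standalone copy of TimeStamps, the printReport() body, and the stubbed-out load/tokenize/generate steps are illustrative assumptions, not the runner's actual implementation or part of this commit.

#include <chrono>
#include <cstdint>
#include <cstdio>

namespace {

// Assumed helper: millisecond wall-clock value, matching the ms scaling
// factor (SCALING_FACTOR_UNITS_PER_SECOND = 1000) used by the struct.
long now_ms() {
  return static_cast<long>(
      std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now().time_since_epoch())
          .count());
}

// Simplified standalone copy of the TimeStamps fields for illustration.
struct TimeStamps {
  const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
  long model_load_start_ms = 0;
  long model_load_end_ms = 0;
  long inference_start_ms = 0;
  long prompt_eval_end_ms = 0;
  long first_token_ms = 0;
  long inference_end_ms = 0;
  long aggregate_sampling_time_ms = 0;

  // Illustrative report: derives load time, time to first token, and a
  // tokens-per-second rate from the recorded timestamps.
  void printReport(
      const int64_t& num_prompt_tokens,
      const int64_t& num_generated_tokens) {
    const double gen_seconds =
        double(inference_end_ms - prompt_eval_end_ms) /
        SCALING_FACTOR_UNITS_PER_SECOND;
    std::printf(
        "load: %ld ms | prompt tokens: %lld | time to first token: %ld ms | "
        "generated: %lld tokens (%.2f tok/s) | sampling total: %ld ms\n",
        model_load_end_ms - model_load_start_ms,
        static_cast<long long>(num_prompt_tokens),
        first_token_ms - inference_start_ms,
        static_cast<long long>(num_generated_tokens),
        gen_seconds > 0 ? num_generated_tokens / gen_seconds : 0.0,
        aggregate_sampling_time_ms);
  }
};

}  // namespace

int main() {
  TimeStamps timers;

  timers.model_load_start_ms = now_ms();
  // ... load the model here ...
  timers.model_load_end_ms = now_ms();

  timers.inference_start_ms = now_ms();
  // ... allocate the prompt array and tokenize the prompt here ...
  timers.prompt_eval_end_ms = now_ms();

  // ... inference loop: set first_token_ms when the first token is emitted
  // and add each sampling call's duration to aggregate_sampling_time_ms ...
  timers.first_token_ms = now_ms();
  timers.inference_end_ms = now_ms();

  timers.printReport(/*num_prompt_tokens=*/8, /*num_generated_tokens=*/32);
  return 0;
}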
