@@ -39,6 +39,36 @@ class Runner {
       std::function<void(const std::string&)> callback = {});
   void stop();
 
+  struct TimeStamps {
+    // Scaling factor for timestamps - in this case, we use ms.
+    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
+    // Time stamps for the different stages of the execution
+    // model_load_start_ms: Start of model loading.
+    long model_load_start_ms;
+    // model_load_end_ms: End of model loading.
+    long model_load_end_ms;
+    // inference_start_ms: Immediately after the model is loaded (or we check
+    // for model load), measure the inference time.
+    long inference_start_ms;
+    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
+    // before the inference loop starts
+    long prompt_eval_end_ms;
+    // first_token: Timestamp when the first generated token is emitted
+    long first_token_ms;
+    // inference_end_ms: End of inference/generation.
+    long inference_end_ms;
+    // Keep a running total of the time spent in sampling.
+    long aggregate_sampling_time_ms;
+
+    void printReport(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+    const std::string toJsonString(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+  };
+  TimeStamps timers_;
+
  private:
   // metadata
   template <typename T>
@@ -69,36 +99,6 @@ class Runner {
   std::unique_ptr<Sampler> sampler_;
   bool shouldStop_{false};
 
- public:
-  struct TimeStamps {
-    // Scaling factor for timestamps - in this case, we use ms.
-    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
-    // Time stamps for the different stages of the execution
-    // model_load_start_ms: Start of model loading.
-    long model_load_start_ms;
-    // model_load_end_ms: End of model loading.
-    long model_load_end_ms;
-    // inference_start_ms: Immediately after the model is loaded (or we check
-    // for model load), measure the inference time.
-    long inference_start_ms;
-    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
-    // before the inference loop starts
-    long prompt_eval_end_ms;
-    // first_token: Timestamp when the first generated token is emitted
-    long first_token_ms;
-    // inference_end_ms: End of inference/generation.
-    long inference_end_ms;
-    // Keep a running total of the time spent in sampling.
-    long aggregate_sampling_time_ms;
-
-    void printReport(
-        const int64_t& num_prompt_tokens,
-        const int64_t& num_generated_tokens);
-    const std::string toJsonString(
-        const int64_t& num_prompt_tokens,
-        const int64_t& num_generated_tokens);
-  };
-  TimeStamps timers_;
 };
 
 } // namespace torch::executor
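
For context, a minimal usage sketch (not part of this commit) of how the relocated TimeStamps struct could be filled in during generation; time_in_ms() here is a hypothetical helper standing in for whatever millisecond clock the runner actually uses:

#include <chrono>
#include <cstdint>

// Hypothetical millisecond clock helper (assumption, not from this diff).
static long time_in_ms() {
  return std::chrono::duration_cast<std::chrono::milliseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

// Sketch of how a generate() implementation might stamp each stage:
//   timers_.model_load_start_ms = time_in_ms();
//   /* load the model */
//   timers_.model_load_end_ms = time_in_ms();
//   timers_.inference_start_ms = time_in_ms();
//   /* allocate the prompt array and tokenize the prompt */
//   timers_.prompt_eval_end_ms = time_in_ms();
//   /* generation loop; when the first token is emitted: */
//   timers_.first_token_ms = time_in_ms();
//   /* after the loop finishes: */
//   timers_.inference_end_ms = time_in_ms();
//   timers_.printReport(num_prompt_tokens, num_generated_tokens);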