@@ -29,11 +29,11 @@
 import triton_python_backend_utils as pb_utils
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
 from vllm.engine.metrics import Stats as VllmStats
-from vllm.engine.metrics import SupportsMetricsInfo
+from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets


 class TritonMetrics:
-    def __init__(self, labels):
+    def __init__(self, labels: List[str], max_model_len: int):
         # Initialize metric families
         # Iteration stats
         self.counter_prompt_tokens_family = pb_utils.MetricFamily(
@@ -56,6 +56,34 @@ def __init__(self, labels):
             description="Histogram of time per output token in seconds.",
             kind=pb_utils.MetricFamily.HISTOGRAM,
         )
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
+            name="vllm:e2e_request_latency_seconds",
+            description="Histogram of end to end request latency in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        # Metadata
+        self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
+            name="vllm:request_prompt_tokens",
+            description="Number of prefill tokens processed.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_num_generation_tokens_request_family = pb_utils.MetricFamily(
+            name="vllm:request_generation_tokens",
+            description="Number of generation tokens processed.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_best_of_request_family = pb_utils.MetricFamily(
+            name="vllm:request_params_best_of",
+            description="Histogram of the best_of request parameter.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_n_request_family = pb_utils.MetricFamily(
+            name="vllm:request_params_n",
+            description="Histogram of the n request parameter.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )

         # Initialize metrics
         # Iteration stats
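
For context on the `build_1_2_5_buckets` helper imported above and used in the next hunk: vLLM derives histogram bucket edges from the mantissas 1, 2, 5 scaled by increasing powers of ten, stopping once the next edge would exceed the given maximum. A minimal sketch of the expected behavior (not part of this patch):

```python
from typing import List


def build_1_2_5_buckets(max_value: int) -> List[int]:
    """Sketch: yields [1, 2, 5, 10, 20, 50, ...] up to max_value."""
    mantissas = [1, 2, 5]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1


# e.g. build_1_2_5_buckets(4096) -> [1, 2, 5, 10, ..., 1000, 2000]
```
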
@@ -110,16 +138,43 @@ def __init__(self, labels):
                 ],
             )
         )
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
+            labels=labels,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+        )
+        # Metadata
+        self.histogram_num_prompt_tokens_request = (
+            self.histogram_num_prompt_tokens_request_family.Metric(
+                labels=labels,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        )
+        self.histogram_num_generation_tokens_request = (
+            self.histogram_num_generation_tokens_request_family.Metric(
+                labels=labels,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        )
+        self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
+            labels=labels,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.histogram_n_request = self.histogram_n_request_family.Metric(
+            labels=labels,
+            buckets=[1, 2, 5, 10, 20],
+        )


 class VllmStatLogger(VllmStatLoggerBase):
     """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""

     # local_interval not used here. It's for vLLM logs to stdout.
-    def __init__(self, labels: Dict, local_interval: float = 0) -> None:
+    def __init__(self, labels: Dict, max_model_len: int) -> None:
         # Tracked stats over current local logging interval.
-        super().__init__(local_interval)
-        self.metrics = TritonMetrics(labels=labels)
+        super().__init__(local_interval=0)
+        self.metrics = TritonMetrics(labels, max_model_len)

     def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         pass
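
The `log()` additions below rely on the `_log_counter` and `_log_histogram` helpers already defined elsewhere in this file (outside the diff). A hedged sketch of what they are assumed to do with the Triton `pb_utils` metric objects (`increment` and `observe` on the counter and histogram metrics, respectively):

```python
from typing import List, Union


class VllmStatLoggerSketch:
    # Hypothetical stand-ins for the helpers defined outside this diff.
    def _log_counter(self, counter, data: Union[int, float]) -> None:
        # Counters only move forward; skip zero deltas.
        if data != 0:
            counter.increment(data)

    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
        # Each element becomes one histogram observation.
        for datum in data:
            histogram.observe(datum)
```
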
@@ -159,6 +214,7 @@ def log(self, stats: VllmStats) -> None:
         Returns:
             None
         """
+        # Iteration stats
         self._log_counter(
             self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
         )
@@ -172,3 +228,21 @@ def log(self, stats: VllmStats) -> None:
             self.metrics.histogram_time_per_output_token,
             stats.time_per_output_tokens_iter,
         )
+        # Request stats
+        # Latency
+        self._log_histogram(
+            self.metrics.histogram_e2e_time_request, stats.time_e2e_requests
+        )
+        # Metadata
+        self._log_histogram(
+            self.metrics.histogram_num_prompt_tokens_request,
+            stats.num_prompt_tokens_requests,
+        )
+        self._log_histogram(
+            self.metrics.histogram_num_generation_tokens_request,
+            stats.num_generation_tokens_requests,
+        )
+        self._log_histogram(
+            self.metrics.histogram_best_of_request, stats.best_of_requests
+        )
+        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
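
Taken together, the patch is used by constructing the logger with the model's context length and registering it with the vLLM engine so that vLLM pushes its `Stats` into Triton's metrics endpoint. A hypothetical registration sketch (the actual call site lives in the backend's `model.py`; the `add_logger` call and attribute paths are assumptions about the vLLM API in use, and the model and label values are examples):

```python
from vllm import AsyncEngineArgs, AsyncLLMEngine

engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model="facebook/opt-125m"))

# Triton metric labels identifying the served model (example values).
labels = {"model": "vllm_model", "version": "1"}
max_model_len = engine.engine.model_config.max_model_len

# Register the adapter so request- and iteration-level stats reach Triton.
engine.add_logger("triton", VllmStatLogger(labels, max_model_len))
```
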