29
29
import triton_python_backend_utils as pb_utils
30
30
from vllm .engine .metrics import StatLoggerBase as VllmStatLoggerBase
31
31
from vllm .engine .metrics import Stats as VllmStats
32
- from vllm .engine .metrics import SupportsMetricsInfo
32
+ from vllm .engine .metrics import SupportsMetricsInfo , build_1_2_5_buckets
33
33
34
34
35
35
class TritonMetrics :
36
- def __init__ (self , labels ):
36
+ def __init__ (self , labels : List [ str ], max_model_len : int ):
37
37
# Initialize metric families
38
38
# Iteration stats
39
39
self .counter_prompt_tokens_family = pb_utils .MetricFamily (
@@ -56,6 +56,34 @@ def __init__(self, labels):
56
56
description = "Histogram of time per output token in seconds." ,
57
57
kind = pb_utils .MetricFamily .HISTOGRAM ,
58
58
)
59
+ # Request stats
60
+ # Latency
61
+ self .histogram_e2e_time_request_family = pb_utils .MetricFamily (
62
+ name = "vllm:e2e_request_latency_seconds" ,
63
+ description = "Histogram of end to end request latency in seconds." ,
64
+ kind = pb_utils .MetricFamily .HISTOGRAM ,
65
+ )
66
+ # Metadata
67
+ self .histogram_num_prompt_tokens_request_family = pb_utils .MetricFamily (
68
+ name = "vllm:request_prompt_tokens" ,
69
+ description = "Number of prefill tokens processed." ,
70
+ kind = pb_utils .MetricFamily .HISTOGRAM ,
71
+ )
72
+ self .histogram_num_generation_tokens_request_family = pb_utils .MetricFamily (
73
+ name = "vllm:request_generation_tokens" ,
74
+ description = "Number of generation tokens processed." ,
75
+ kind = pb_utils .MetricFamily .HISTOGRAM ,
76
+ )
77
+ self .histogram_best_of_request_family = pb_utils .MetricFamily (
78
+ name = "vllm:request_params_best_of" ,
79
+ description = "Histogram of the best_of request parameter." ,
80
+ kind = pb_utils .MetricFamily .HISTOGRAM ,
81
+ )
82
+ self .histogram_n_request_family = pb_utils .MetricFamily (
83
+ name = "vllm:request_params_n" ,
84
+ description = "Histogram of the n request parameter." ,
85
+ kind = pb_utils .MetricFamily .HISTOGRAM ,
86
+ )
59
87
60
88
# Initialize metrics
61
89
# Iteration stats
@@ -65,7 +93,7 @@ def __init__(self, labels):
65
93
self .counter_generation_tokens = self .counter_generation_tokens_family .Metric (
66
94
labels = labels
67
95
)
68
- # Use the same bucket boundaries from vLLM sample metrics.
96
+ # Use the same bucket boundaries as the vLLM sample metrics, as an example.
69
97
# https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
70
98
self .histogram_time_to_first_token = (
71
99
self .histogram_time_to_first_token_family .Metric (
@@ -110,16 +138,43 @@ def __init__(self, labels):
110
138
],
111
139
)
112
140
)
141
+ # Request stats
142
+ # Latency
143
+ self .histogram_e2e_time_request = self .histogram_e2e_time_request_family .Metric (
144
+ labels = labels ,
145
+ buckets = [1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ],
146
+ )
147
+ # Metadata
148
+ self .histogram_num_prompt_tokens_request = (
149
+ self .histogram_num_prompt_tokens_request_family .Metric (
150
+ labels = labels ,
151
+ buckets = build_1_2_5_buckets (max_model_len ),
152
+ )
153
+ )
154
+ self .histogram_num_generation_tokens_request = (
155
+ self .histogram_num_generation_tokens_request_family .Metric (
156
+ labels = labels ,
157
+ buckets = build_1_2_5_buckets (max_model_len ),
158
+ )
159
+ )
160
+ self .histogram_best_of_request = self .histogram_best_of_request_family .Metric (
161
+ labels = labels ,
162
+ buckets = [1 , 2 , 5 , 10 , 20 ],
163
+ )
164
+ self .histogram_n_request = self .histogram_n_request_family .Metric (
165
+ labels = labels ,
166
+ buckets = [1 , 2 , 5 , 10 , 20 ],
167
+ )
113
168
114
169
115
170
class VllmStatLogger (VllmStatLoggerBase ):
116
171
"""StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
117
172
118
173
# local_interval is not used here; it is for vLLM logs to stdout.
119
- def __init__ (self , labels : Dict , local_interval : float = 0 ) -> None :
174
+ def __init__ (self , labels : Dict , max_model_len : int ) -> None :
120
175
# Tracked stats over current local logging interval.
121
- super ().__init__ (local_interval )
122
- self .metrics = TritonMetrics (labels = labels )
176
+ super ().__init__ (local_interval = 0 )
177
+ self .metrics = TritonMetrics (labels , max_model_len )
123
178
124
179
def info (self , type : str , obj : SupportsMetricsInfo ) -> None :
125
180
pass
@@ -159,16 +214,35 @@ def log(self, stats: VllmStats) -> None:
159
214
Returns:
160
215
None
161
216
"""
162
- self ._log_counter (
163
- self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter
164
- )
165
- self ._log_counter (
166
- self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter
167
- )
168
- self ._log_histogram (
169
- self .metrics .histogram_time_to_first_token , stats .time_to_first_tokens_iter
170
- )
171
- self ._log_histogram (
172
- self .metrics .histogram_time_per_output_token ,
173
- stats .time_per_output_tokens_iter ,
174
- )
217
+ # The list of vLLM metrics reported to Triton is also documented here:
218
+ # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics
219
+ counter_metrics = [
220
+ (self .metrics .counter_prompt_tokens , stats .num_prompt_tokens_iter ),
221
+ (self .metrics .counter_generation_tokens , stats .num_generation_tokens_iter ),
222
+ ]
223
+ histogram_metrics = [
224
+ (
225
+ self .metrics .histogram_time_to_first_token ,
226
+ stats .time_to_first_tokens_iter ,
227
+ ),
228
+ (
229
+ self .metrics .histogram_time_per_output_token ,
230
+ stats .time_per_output_tokens_iter ,
231
+ ),
232
+ (self .metrics .histogram_e2e_time_request , stats .time_e2e_requests ),
233
+ (
234
+ self .metrics .histogram_num_prompt_tokens_request ,
235
+ stats .num_prompt_tokens_requests ,
236
+ ),
237
+ (
238
+ self .metrics .histogram_num_generation_tokens_request ,
239
+ stats .num_generation_tokens_requests ,
240
+ ),
241
+ (self .metrics .histogram_best_of_request , stats .best_of_requests ),
242
+ (self .metrics .histogram_n_request , stats .n_requests ),
243
+ ]
244
+
245
+ for metric , data in counter_metrics :
246
+ self ._log_counter (metric , data )
247
+ for metric , data in histogram_metrics :
248
+ self ._log_histogram (metric , data )
0 commit comments