
Commit 94426cc

Add new vLLM histogram metrics
1 parent 507e4dc commit 94426cc

4 files changed (+152, -13 lines)


README.md

Lines changed: 45 additions & 2 deletions
@@ -224,6 +224,16 @@ counter_generation_tokens
 histogram_time_to_first_token
 # Histogram of time per output token in seconds.
 histogram_time_per_output_token
+# Histogram of end to end request latency in seconds.
+histogram_e2e_time_request
+# Number of prefill tokens processed.
+histogram_num_prompt_tokens_request
+# Number of generation tokens processed.
+histogram_num_generation_tokens_request
+# Histogram of the best_of request parameter.
+histogram_best_of_request
+# Histogram of the n request parameter.
+histogram_n_request
 ```
 Your output for these fields should look similar to the following:
 ```bash
@@ -238,17 +248,50 @@ vllm:generation_tokens_total{model="vllm_model",version="1"} 16
 vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
 vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0
 ...
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
 # TYPE vllm:time_per_output_token_seconds histogram
 vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
 vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15
 ...
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
+# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
+# TYPE vllm:e2e_request_latency_seconds histogram
+vllm:e2e_request_latency_seconds_count{model="vllm_model",version="1"} 1
+vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.08686184883117676
+vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_prompt_tokens Number of prefill tokens processed.
+# TYPE vllm:request_prompt_tokens histogram
+vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1
+vllm:request_prompt_tokens_sum{model="vllm_model",version="1"} 10
+vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="1"} 0
+...
+vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_generation_tokens Number of generation tokens processed.
+# TYPE vllm:request_generation_tokens histogram
+vllm:request_generation_tokens_count{model="vllm_model",version="1"} 1
+vllm:request_generation_tokens_sum{model="vllm_model",version="1"} 16
+vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="1"} 0
+...
+vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_params_best_of Histogram of the best_of request parameter.
+# TYPE vllm:request_params_best_of histogram
+vllm:request_params_best_of_count{model="vllm_model",version="1"} 1
+vllm:request_params_best_of_sum{model="vllm_model",version="1"} 1
+vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_params_n Histogram of the n request parameter.
+# TYPE vllm:request_params_n histogram
+vllm:request_params_n_count{model="vllm_model",version="1"} 1
+vllm:request_params_n_sum{model="vllm_model",version="1"} 1
+vllm:request_params_n_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_params_n_bucket{model="vllm_model",version="1",le="+Inf"} 1
 ```
 To enable vLLM engine metrics collection, the "disable_log_stats" option needs to be either false
 or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json).
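
To eyeball the new histograms outside of the CI test, the metrics endpoint can be scraped directly. The sketch below mirrors the idea behind the test's parse_vllm_metrics helper, assuming Triton's default metrics endpoint at http://localhost:8002/metrics; the function name and the simple line parser are illustrative only.

```python
# Minimal sketch: scrape Triton's Prometheus metrics and aggregate vLLM samples by name.
# Assumes the default metrics endpoint http://localhost:8002/metrics; adjust as needed.
import requests


def scrape_vllm_metrics(url: str = "http://localhost:8002/metrics") -> dict:
    metrics = {}
    for line in requests.get(url).text.splitlines():
        # Skip # HELP / # TYPE comments and anything that is not a vLLM metric.
        if line.startswith("#") or not line.startswith("vllm:"):
            continue
        # A sample looks like: vllm:request_prompt_tokens_sum{model="vllm_model",version="1"} 10
        name, value = line.rsplit(" ", 1)
        name = name.split("{")[0]
        # Histogram bucket samples share a name, so they are summed together here.
        metrics[name] = metrics.get(name, 0.0) + float(value)
    return metrics


if __name__ == "__main__":
    for name, value in sorted(scrape_vllm_metrics().items()):
        print(f"{name} = {value}")
```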

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py

Lines changed: 24 additions & 5 deletions
@@ -50,7 +50,7 @@ def setUp(self):
         ]
         self.sampling_parameters = {"temperature": "0", "top_p": "1"}
 
-    def get_vllm_metrics(self):
+    def parse_vllm_metrics(self):
         """
         Store vllm metrics in a dictionary.
         """
@@ -118,13 +118,12 @@ def test_vllm_metrics(self):
             sampling_parameters=self.sampling_parameters,
             model_name=self.vllm_model_name,
         )
-        metrics_dict = self.get_vllm_metrics()
+        metrics_dict = self.parse_vllm_metrics()
 
         # vllm:prompt_tokens_total
         self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18)
         # vllm:generation_tokens_total
         self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
-
         # vllm:time_to_first_token_seconds
         self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
         self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0)
@@ -133,6 +132,26 @@ def test_vllm_metrics(self):
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
         self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0)
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45)
+        # vllm:e2e_request_latency_seconds
+        self.assertEqual(metrics_dict["vllm:e2e_request_latency_seconds_count"], 3)
+        self.assertGreater(metrics_dict["vllm:e2e_request_latency_seconds_sum"], 0)
+        self.assertEqual(metrics_dict["vllm:e2e_request_latency_seconds_bucket"], 3)
+        # vllm:request_prompt_tokens
+        self.assertEqual(metrics_dict["vllm:request_prompt_tokens_count"], 3)
+        self.assertEqual(metrics_dict["vllm:request_prompt_tokens_sum"], 18)
+        self.assertEqual(metrics_dict["vllm:request_prompt_tokens_bucket"], 3)
+        # vllm:request_generation_tokens
+        self.assertEqual(metrics_dict["vllm:request_generation_tokens_count"], 3)
+        self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 48)
+        self.assertEqual(metrics_dict["vllm:request_generation_tokens_bucket"], 3)
+        # vllm:request_params_best_of
+        self.assertEqual(metrics_dict["vllm:request_params_best_of_count"], 3)
+        self.assertEqual(metrics_dict["vllm:request_params_best_of_sum"], 3)
+        self.assertEqual(metrics_dict["vllm:request_params_best_of_bucket"], 3)
+        # vllm:request_params_n
+        self.assertEqual(metrics_dict["vllm:request_params_n_count"], 3)
+        self.assertEqual(metrics_dict["vllm:request_params_n_sum"], 3)
+        self.assertEqual(metrics_dict["vllm:request_params_n_bucket"], 3)
 
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
@@ -141,7 +160,7 @@ def test_vllm_metrics_disabled(self):
             sampling_parameters=self.sampling_parameters,
             model_name=self.vllm_model_name,
         )
-        metrics_dict = self.get_vllm_metrics()
+        metrics_dict = self.parse_vllm_metrics()
 
         # No vLLM metric found
         self.assertEqual(len(metrics_dict), 0)
@@ -154,7 +173,7 @@ def test_vllm_metrics_refused(self):
             model_name=self.vllm_model_name,
         )
         with self.assertRaises(requests.exceptions.ConnectionError):
-            self.get_vllm_metrics()
+            self.parse_vllm_metrics()
 
     def tearDown(self):
         self.triton_client.close()

src/model.py

Lines changed: 4 additions & 1 deletion
@@ -173,7 +173,10 @@ def init_engine(self):
                 "version": self.args["model_version"],
             }
             # Add vLLM custom metrics
-            self.llm_engine.add_logger("triton", VllmStatLogger(labels=labels))
+            engine_config = self.llm_engine.engine.model_config
+            self.llm_engine.add_logger(
+                "triton", VllmStatLogger(labels, engine_config.max_model_len)
+            )
         except pb_utils.TritonModelException as e:
             if "metrics not supported" in str(e):
                 # Metrics are disabled at the server

src/utils/metrics.py

Lines changed: 79 additions & 5 deletions
@@ -29,11 +29,11 @@
 import triton_python_backend_utils as pb_utils
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
 from vllm.engine.metrics import Stats as VllmStats
-from vllm.engine.metrics import SupportsMetricsInfo
+from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets
 
 
 class TritonMetrics:
-    def __init__(self, labels):
+    def __init__(self, labels: List[str], max_model_len: int):
         # Initialize metric families
         # Iteration stats
         self.counter_prompt_tokens_family = pb_utils.MetricFamily(
@@ -56,6 +56,34 @@ def __init__(self, labels):
             description="Histogram of time per output token in seconds.",
             kind=pb_utils.MetricFamily.HISTOGRAM,
         )
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
+            name="vllm:e2e_request_latency_seconds",
+            description="Histogram of end to end request latency in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        # Metadata
+        self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
+            name="vllm:request_prompt_tokens",
+            description="Number of prefill tokens processed.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_num_generation_tokens_request_family = pb_utils.MetricFamily(
+            name="vllm:request_generation_tokens",
+            description="Number of generation tokens processed.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_best_of_request_family = pb_utils.MetricFamily(
+            name="vllm:request_params_best_of",
+            description="Histogram of the best_of request parameter.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_n_request_family = pb_utils.MetricFamily(
+            name="vllm:request_params_n",
+            description="Histogram of the n request parameter.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
 
         # Initialize metrics
         # Iteration stats
@@ -110,16 +138,43 @@ def __init__(self, labels):
                 ],
             )
         )
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
+            labels=labels,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+        )
+        # Metadata
+        self.histogram_num_prompt_tokens_request = (
+            self.histogram_num_prompt_tokens_request_family.Metric(
+                labels=labels,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        )
+        self.histogram_num_generation_tokens_request = (
+            self.histogram_num_generation_tokens_request_family.Metric(
+                labels=labels,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        )
+        self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
+            labels=labels,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.histogram_n_request = self.histogram_n_request_family.Metric(
+            labels=labels,
+            buckets=[1, 2, 5, 10, 20],
+        )
 
 
 class VllmStatLogger(VllmStatLoggerBase):
     """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
 
     # local_interval not used here. It's for vLLM logs to stdout.
-    def __init__(self, labels: Dict, local_interval: float = 0) -> None:
+    def __init__(self, labels: Dict, max_model_len: int) -> None:
         # Tracked stats over current local logging interval.
-        super().__init__(local_interval)
-        self.metrics = TritonMetrics(labels=labels)
+        super().__init__(local_interval=0)
+        self.metrics = TritonMetrics(labels, max_model_len)
 
     def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         pass
@@ -159,6 +214,7 @@ def log(self, stats: VllmStats) -> None:
         Returns:
             None
         """
+        # Iteration stats
         self._log_counter(
            self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
        )
@@ -172,3 +228,21 @@ def log(self, stats: VllmStats) -> None:
             self.metrics.histogram_time_per_output_token,
             stats.time_per_output_tokens_iter,
         )
+        # Request stats
+        # Latency
+        self._log_histogram(
+            self.metrics.histogram_e2e_time_request, stats.time_e2e_requests
+        )
+        # Metadata
+        self._log_histogram(
+            self.metrics.histogram_num_prompt_tokens_request,
+            stats.num_prompt_tokens_requests,
+        )
+        self._log_histogram(
+            self.metrics.histogram_num_generation_tokens_request,
+            stats.num_generation_tokens_requests,
+        )
+        self._log_histogram(
+            self.metrics.histogram_best_of_request, stats.best_of_requests
+        )
+        self._log_histogram(self.metrics.histogram_n_request, stats.n_requests)
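
Note on the bucket choices: the end-to-end latency histogram uses fixed buckets in seconds, while the two per-request token histograms size their buckets from the engine's max_model_len via vllm.engine.metrics.build_1_2_5_buckets, which is why model.py now threads max_model_len through to TritonMetrics. The snippet below is only an illustrative approximation of the 1-2-5 series such a helper produces, not the vLLM implementation itself.

```python
# Illustrative sketch of a 1-2-5 bucket series capped at max_value, approximating
# the kind of list vllm.engine.metrics.build_1_2_5_buckets returns.
from typing import List


def build_1_2_5_buckets_sketch(max_value: int) -> List[int]:
    buckets: List[int] = []
    exponent = 0
    while True:
        for mantissa in (1, 2, 5):
            value = mantissa * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1


# For a model with max_model_len=4096 this yields:
# [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
print(build_1_2_5_buckets_sketch(4096))
```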
