Commit 98947a7

feat: Report more histogram metrics (#61)
1 parent 507e4dc · commit 98947a7

4 files changed: +205, -31 lines

README.md

Lines changed: 45 additions & 2 deletions
````diff
@@ -224,6 +224,16 @@ counter_generation_tokens
 histogram_time_to_first_token
 # Histogram of time per output token in seconds.
 histogram_time_per_output_token
+# Histogram of end to end request latency in seconds.
+histogram_e2e_time_request
+# Number of prefill tokens processed.
+histogram_num_prompt_tokens_request
+# Number of generation tokens processed.
+histogram_num_generation_tokens_request
+# Histogram of the best_of request parameter.
+histogram_best_of_request
+# Histogram of the n request parameter.
+histogram_n_request
 ```
 Your output for these fields should look similar to the following:
 ```bash
@@ -238,17 +248,50 @@ vllm:generation_tokens_total{model="vllm_model",version="1"} 16
 vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
 vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
-vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0
 ...
 vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
 # HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
 # TYPE vllm:time_per_output_token_seconds histogram
 vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
 vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
-vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15
 ...
 vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
+# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
+# TYPE vllm:e2e_request_latency_seconds histogram
+vllm:e2e_request_latency_seconds_count{model="vllm_model",version="1"} 1
+vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.08686184883117676
+vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_prompt_tokens Number of prefill tokens processed.
+# TYPE vllm:request_prompt_tokens histogram
+vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1
+vllm:request_prompt_tokens_sum{model="vllm_model",version="1"} 10
+vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="1"} 0
+...
+vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_generation_tokens Number of generation tokens processed.
+# TYPE vllm:request_generation_tokens histogram
+vllm:request_generation_tokens_count{model="vllm_model",version="1"} 1
+vllm:request_generation_tokens_sum{model="vllm_model",version="1"} 16
+vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="1"} 0
+...
+vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_params_best_of Histogram of the best_of request parameter.
+# TYPE vllm:request_params_best_of histogram
+vllm:request_params_best_of_count{model="vllm_model",version="1"} 1
+vllm:request_params_best_of_sum{model="vllm_model",version="1"} 1
+vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_params_n Histogram of the n request parameter.
+# TYPE vllm:request_params_n histogram
+vllm:request_params_n_count{model="vllm_model",version="1"} 1
+vllm:request_params_n_sum{model="vllm_model",version="1"} 1
+vllm:request_params_n_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_params_n_bucket{model="vllm_model",version="1",le="+Inf"} 1
 ```
 To enable vLLM engine colleting metrics, "disable_log_stats" option need to be either false
 or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json).
````
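Note: as the README excerpt above says, the vLLM engine only reports these stats when `disable_log_stats` is false or unset. A minimal sketch for spot-checking that the new histogram families are exported, assuming a locally running Triton server with its default metrics port 8002; the helper name and URL are illustrative, not part of the backend:

```python
from typing import List

import requests  # the same HTTP client the metrics test depends on


def fetch_vllm_metric_lines(url: str = "http://localhost:8002/metrics") -> List[str]:
    """Fetch Triton's Prometheus text endpoint and keep only vLLM metric samples."""
    response = requests.get(url)
    response.raise_for_status()
    # Drop "# HELP" / "# TYPE" comment lines and non-vLLM families.
    return [line for line in response.text.splitlines() if line.startswith("vllm:")]


if __name__ == "__main__":
    for line in fetch_vllm_metric_lines():
        # Expect samples such as:
        # vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
        print(line)
```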

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py

Lines changed: 63 additions & 9 deletions
```diff
@@ -50,7 +50,7 @@ def setUp(self):
         ]
         self.sampling_parameters = {"temperature": "0", "top_p": "1"}
 
-    def get_vllm_metrics(self):
+    def parse_vllm_metrics(self):
         """
         Store vllm metrics in a dictionary.
         """
@@ -112,27 +112,81 @@ def vllm_infer(
         self.triton_client.stop_stream()
 
     def test_vllm_metrics(self):
+        # Adding sampling parameters for testing metrics.
+        # Definitions can be found here https://docs.vllm.ai/en/latest/dev/sampling_params.html
+        n, best_of = 2, 4
+        custom_sampling_parameters = self.sampling_parameters.copy()
+        # Changing "temperature" because "best_of" must be 1 when using greedy
+        # sampling, i.e. "temperature": "0".
+        custom_sampling_parameters.update(
+            {"n": str(n), "best_of": str(best_of), "temperature": "1"}
+        )
+
         # Test vLLM metrics
         self.vllm_infer(
             prompts=self.prompts,
-            sampling_parameters=self.sampling_parameters,
+            sampling_parameters=custom_sampling_parameters,
             model_name=self.vllm_model_name,
         )
-        metrics_dict = self.get_vllm_metrics()
+        metrics_dict = self.parse_vllm_metrics()
+        total_prompts = len(self.prompts)
 
         # vllm:prompt_tokens_total
         self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18)
         # vllm:generation_tokens_total
-        self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
-
+        self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 188)
         # vllm:time_to_first_token_seconds
-        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
+        self.assertEqual(
+            metrics_dict["vllm:time_to_first_token_seconds_count"], total_prompts
+        )
         self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0)
-        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3)
+        self.assertEqual(
+            metrics_dict["vllm:time_to_first_token_seconds_bucket"], total_prompts
+        )
         # vllm:time_per_output_token_seconds
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
         self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0)
         self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45)
+        # vllm:e2e_request_latency_seconds
+        self.assertEqual(
+            metrics_dict["vllm:e2e_request_latency_seconds_count"], total_prompts
+        )
+        self.assertGreater(metrics_dict["vllm:e2e_request_latency_seconds_sum"], 0)
+        self.assertEqual(
+            metrics_dict["vllm:e2e_request_latency_seconds_bucket"], total_prompts
+        )
+        # vllm:request_prompt_tokens
+        self.assertEqual(
+            metrics_dict["vllm:request_prompt_tokens_count"], total_prompts
+        )
+        self.assertEqual(metrics_dict["vllm:request_prompt_tokens_sum"], 18)
+        self.assertEqual(
+            metrics_dict["vllm:request_prompt_tokens_bucket"], total_prompts
+        )
+        # vllm:request_generation_tokens
+        self.assertEqual(
+            metrics_dict["vllm:request_generation_tokens_count"],
+            best_of * total_prompts,
+        )
+        self.assertEqual(metrics_dict["vllm:request_generation_tokens_sum"], 188)
+        self.assertEqual(
+            metrics_dict["vllm:request_generation_tokens_bucket"],
+            best_of * total_prompts,
+        )
+        # vllm:request_params_best_of
+        self.assertEqual(
+            metrics_dict["vllm:request_params_best_of_count"], total_prompts
+        )
+        self.assertEqual(
+            metrics_dict["vllm:request_params_best_of_sum"], best_of * total_prompts
+        )
+        self.assertEqual(
+            metrics_dict["vllm:request_params_best_of_bucket"], total_prompts
+        )
+        # vllm:request_params_n
+        self.assertEqual(metrics_dict["vllm:request_params_n_count"], total_prompts)
+        self.assertEqual(metrics_dict["vllm:request_params_n_sum"], n * total_prompts)
+        self.assertEqual(metrics_dict["vllm:request_params_n_bucket"], total_prompts)
 
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
@@ -141,7 +195,7 @@ def test_vllm_metrics_disabled(self):
             sampling_parameters=self.sampling_parameters,
             model_name=self.vllm_model_name,
         )
-        metrics_dict = self.get_vllm_metrics()
+        metrics_dict = self.parse_vllm_metrics()
 
         # No vLLM metric found
         self.assertEqual(len(metrics_dict), 0)
@@ -154,7 +208,7 @@ def test_vllm_metrics_refused(self):
             model_name=self.vllm_model_name,
         )
         with self.assertRaises(requests.exceptions.ConnectionError):
-            self.get_vllm_metrics()
+            self.parse_vllm_metrics()
 
     def tearDown(self):
         self.triton_client.close()
```
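Note on the updated expected values: with the three prompts this test sends (the previous hard-coded assertions used 3), `n=2`, and `best_of=4`, the request-level histograms record one observation per request, while `vllm:request_generation_tokens` records `best_of * total_prompts` observations, consistent with one observation per generated sequence. A small sketch of that arithmetic, with the values assumed from the assertions above:

```python
# Illustrative only, not part of the test suite: expected observation counts
# implied by the assertions in test_vllm_metrics, assuming 3 prompts, n=2,
# best_of=4.
total_prompts = 3
n, best_of = 2, 4

expected_counts = {
    # One observation per request.
    "vllm:e2e_request_latency_seconds_count": total_prompts,          # 3
    "vllm:request_prompt_tokens_count": total_prompts,                # 3
    "vllm:request_params_best_of_count": total_prompts,               # 3
    "vllm:request_params_n_count": total_prompts,                     # 3
    # One observation per generated sequence (best_of sequences per request).
    "vllm:request_generation_tokens_count": best_of * total_prompts,  # 12
    # Histogram sums accumulate the raw parameter values across requests.
    "vllm:request_params_best_of_sum": best_of * total_prompts,       # 12
    "vllm:request_params_n_sum": n * total_prompts,                   # 6
}
print(expected_counts)
```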

src/model.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -173,7 +173,10 @@ def init_engine(self):
                 "version": self.args["model_version"],
             }
             # Add vLLM custom metrics
-            self.llm_engine.add_logger("triton", VllmStatLogger(labels=labels))
+            engine_config = self.llm_engine.engine.model_config
+            self.llm_engine.add_logger(
+                "triton", VllmStatLogger(labels, engine_config.max_model_len)
+            )
         except pb_utils.TritonModelException as e:
             if "metrics not supported" in str(e):
                 # Metrics are disabled at the server
```
src/utils/metrics.py

Lines changed: 93 additions & 19 deletions
```diff
@@ -29,11 +29,11 @@
 import triton_python_backend_utils as pb_utils
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
 from vllm.engine.metrics import Stats as VllmStats
-from vllm.engine.metrics import SupportsMetricsInfo
+from vllm.engine.metrics import SupportsMetricsInfo, build_1_2_5_buckets
 
 
 class TritonMetrics:
-    def __init__(self, labels):
+    def __init__(self, labels: List[str], max_model_len: int):
         # Initialize metric families
         # Iteration stats
         self.counter_prompt_tokens_family = pb_utils.MetricFamily(
@@ -56,6 +56,34 @@ def __init__(self, labels):
             description="Histogram of time per output token in seconds.",
             kind=pb_utils.MetricFamily.HISTOGRAM,
         )
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request_family = pb_utils.MetricFamily(
+            name="vllm:e2e_request_latency_seconds",
+            description="Histogram of end to end request latency in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        # Metadata
+        self.histogram_num_prompt_tokens_request_family = pb_utils.MetricFamily(
+            name="vllm:request_prompt_tokens",
+            description="Number of prefill tokens processed.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_num_generation_tokens_request_family = pb_utils.MetricFamily(
+            name="vllm:request_generation_tokens",
+            description="Number of generation tokens processed.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_best_of_request_family = pb_utils.MetricFamily(
+            name="vllm:request_params_best_of",
+            description="Histogram of the best_of request parameter.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_n_request_family = pb_utils.MetricFamily(
+            name="vllm:request_params_n",
+            description="Histogram of the n request parameter.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
 
         # Initialize metrics
         # Iteration stats
@@ -65,7 +93,7 @@ def __init__(self, labels):
         self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
             labels=labels
         )
-        # Use the same bucket boundaries from vLLM sample metrics.
+        # Use the same bucket boundaries from vLLM sample metrics as an example.
         # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
         self.histogram_time_to_first_token = (
             self.histogram_time_to_first_token_family.Metric(
@@ -110,16 +138,43 @@ def __init__(self, labels):
                 ],
             )
         )
+        # Request stats
+        # Latency
+        self.histogram_e2e_time_request = self.histogram_e2e_time_request_family.Metric(
+            labels=labels,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0],
+        )
+        # Metadata
+        self.histogram_num_prompt_tokens_request = (
+            self.histogram_num_prompt_tokens_request_family.Metric(
+                labels=labels,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        )
+        self.histogram_num_generation_tokens_request = (
+            self.histogram_num_generation_tokens_request_family.Metric(
+                labels=labels,
+                buckets=build_1_2_5_buckets(max_model_len),
+            )
+        )
+        self.histogram_best_of_request = self.histogram_best_of_request_family.Metric(
+            labels=labels,
+            buckets=[1, 2, 5, 10, 20],
+        )
+        self.histogram_n_request = self.histogram_n_request_family.Metric(
+            labels=labels,
+            buckets=[1, 2, 5, 10, 20],
+        )
 
 
 class VllmStatLogger(VllmStatLoggerBase):
     """StatLogger is used as an adapter between vLLM stats collector and Triton metrics provider."""
 
     # local_interval not used here. It's for vLLM logs to stdout.
-    def __init__(self, labels: Dict, local_interval: float = 0) -> None:
+    def __init__(self, labels: Dict, max_model_len: int) -> None:
         # Tracked stats over current local logging interval.
-        super().__init__(local_interval)
-        self.metrics = TritonMetrics(labels=labels)
+        super().__init__(local_interval=0)
+        self.metrics = TritonMetrics(labels, max_model_len)
 
     def info(self, type: str, obj: SupportsMetricsInfo) -> None:
         pass
@@ -159,16 +214,35 @@ def log(self, stats: VllmStats) -> None:
         Returns:
             None
         """
-        self._log_counter(
-            self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter
-        )
-        self._log_counter(
-            self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
-        )
-        self._log_histogram(
-            self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
-        )
-        self._log_histogram(
-            self.metrics.histogram_time_per_output_token,
-            stats.time_per_output_tokens_iter,
-        )
+        # The list of vLLM metrics reporting to Triton is also documented here.
+        # https://github.com/triton-inference-server/vllm_backend/blob/main/README.md#triton-metrics
+        counter_metrics = [
+            (self.metrics.counter_prompt_tokens, stats.num_prompt_tokens_iter),
+            (self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter),
+        ]
+        histogram_metrics = [
+            (
+                self.metrics.histogram_time_to_first_token,
+                stats.time_to_first_tokens_iter,
+            ),
+            (
+                self.metrics.histogram_time_per_output_token,
+                stats.time_per_output_tokens_iter,
+            ),
+            (self.metrics.histogram_e2e_time_request, stats.time_e2e_requests),
+            (
+                self.metrics.histogram_num_prompt_tokens_request,
+                stats.num_prompt_tokens_requests,
+            ),
+            (
+                self.metrics.histogram_num_generation_tokens_request,
+                stats.num_generation_tokens_requests,
+            ),
+            (self.metrics.histogram_best_of_request, stats.best_of_requests),
+            (self.metrics.histogram_n_request, stats.n_requests),
+        ]
+
+        for metric, data in counter_metrics:
+            self._log_counter(metric, data)
+        for metric, data in histogram_metrics:
+            self._log_histogram(metric, data)
```
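Note: the request-token histograms reuse vLLM's `build_1_2_5_buckets` helper (imported in the first hunk) so their bucket boundaries scale with `max_model_len`. The sketch below is a rough re-creation for illustration only, inferred from the helper's name and how it is called here; see `vllm/engine/metrics.py` for the authoritative implementation.

```python
from typing import List


def build_1_2_5_buckets_sketch(max_value: int) -> List[int]:
    """Rough stand-in for vLLM's build_1_2_5_buckets, for illustration only:
    emit 1, 2, 5 scaled by powers of ten until max_value is exceeded."""
    mantissas = [1, 2, 5]
    exponent = 0
    buckets: List[int] = []
    while True:
        for m in mantissas:
            value = m * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1


# For max_model_len = 4096 this yields:
# [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]
print(build_1_2_5_buckets_sketch(4096))
```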
