Commit 507e4dc

feat: Report histogram metrics to Triton metrics server (#58)
1 parent 3829366 commit 507e4dc

File tree

3 files changed (+115, -5 lines)

README.md

Lines changed: 30 additions & 4 deletions
@@ -203,7 +203,7 @@ you need to specify a different `shm-region-prefix-name` for each server. See
 for more information.

 ## Triton Metrics
-Starting with the 24.08 release of Triton, users can now obtain partial
+Starting with the 24.08 release of Triton, users can now obtain specific
 vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics
 [here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be
 accomplished by launching a Triton server in any of the ways described above
@@ -213,16 +213,42 @@ the following:
 ```bash
 curl localhost:8002/metrics
 ```
-VLLM stats are reported by the metrics endpoint in fields that
-are prefixed with `vllm:`. Your output for these fields should look
-similar to the following:
+VLLM stats are reported by the metrics endpoint in fields that are prefixed with
+`vllm:`. Triton currently supports reporting of the following metrics from vLLM.
+```bash
+# Number of prefill tokens processed.
+counter_prompt_tokens
+# Number of generation tokens processed.
+counter_generation_tokens
+# Histogram of time to first token in seconds.
+histogram_time_to_first_token
+# Histogram of time per output token in seconds.
+histogram_time_per_output_token
+```
+Your output for these fields should look similar to the following:
 ```bash
 # HELP vllm:prompt_tokens_total Number of prefill tokens processed.
 # TYPE vllm:prompt_tokens_total counter
 vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
 # HELP vllm:generation_tokens_total Number of generation tokens processed.
 # TYPE vllm:generation_tokens_total counter
 vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE vllm:time_to_first_token_seconds histogram
+vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
+vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.005"} 0
+...
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE vllm:time_per_output_token_seconds histogram
+vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
+vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.025"} 15
+...
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
 ```
 To enable vLLM engine metrics collection, the "disable_log_stats" option needs to be either false
 or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json).
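As a quick programmatic companion to the `curl` example in the README section above, the sketch below fetches the metrics endpoint and prints only the `vllm:` samples. It is an illustration, not part of this commit, and assumes a Triton server is already running with metrics exposed on the default port 8002.

```python
# Sketch: print the vllm:* samples from a running Triton metrics endpoint,
# roughly equivalent to `curl localhost:8002/metrics | grep vllm:`.
# Assumes Triton is serving metrics on the default port 8002.
from urllib.request import urlopen

body = urlopen("http://localhost:8002/metrics").read().decode("utf-8")
for line in body.splitlines():
    if line.startswith("vllm:"):
        print(line)
```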

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py

Lines changed: 9 additions & 0 deletions
@@ -125,6 +125,15 @@ def test_vllm_metrics(self):
         # vllm:generation_tokens_total
         self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)

+        # vllm:time_to_first_token_seconds
+        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
+        self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0)
+        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_bucket"], 3)
+        # vllm:time_per_output_token_seconds
+        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
+        self.assertGreater(metrics_dict["vllm:time_per_output_token_seconds_sum"], 0)
+        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_bucket"], 45)
+
     def test_vllm_metrics_disabled(self):
         # Test vLLM metrics
         self.vllm_infer(
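One note on the `_bucket` assertions above: Prometheus histogram buckets are cumulative, so the `le="+Inf"` bucket always equals the `_count` sample, which is why the test can compare `..._bucket` against the same totals (3 and 45). The helper below is a hypothetical sketch of how a flat `metrics_dict` like the test's could be built from the exposition text by keeping only the terminal bucket; the test suite's actual parsing may differ.

```python
# Hypothetical sketch: flatten Prometheus exposition text into a dict like the
# test's metrics_dict, keeping only the cumulative le="+Inf" bucket sample
# (which by definition equals the _count sample).
def parse_metrics(exposition_text: str) -> dict:
    metrics_dict = {}
    for line in exposition_text.splitlines():
        if line.startswith("#") or not line.strip():
            continue  # skip HELP/TYPE comments and blank lines
        name_and_labels, _, value = line.rpartition(" ")
        name = name_and_labels.split("{", 1)[0]
        if name.endswith("_bucket") and 'le="+Inf"' not in name_and_labels:
            continue  # drop all finite-bound bucket samples
        metrics_dict[name] = float(value)
    return metrics_dict


sample = (
    'vllm:time_to_first_token_seconds_count{model="m",version="1"} 3\n'
    'vllm:time_to_first_token_seconds_bucket{model="m",version="1",le="0.001"} 0\n'
    'vllm:time_to_first_token_seconds_bucket{model="m",version="1",le="+Inf"} 3\n'
)
assert parse_metrics(sample)["vllm:time_to_first_token_seconds_bucket"] == 3.0
```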

src/utils/metrics.py

Lines changed: 76 additions & 1 deletion
@@ -24,7 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-from typing import Dict, Union
+from typing import Dict, List, Union

 import triton_python_backend_utils as pb_utils
 from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
@@ -46,6 +46,16 @@ def __init__(self, labels):
             description="Number of generation tokens processed.",
             kind=pb_utils.MetricFamily.COUNTER,
         )
+        self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
+            name="vllm:time_to_first_token_seconds",
+            description="Histogram of time to first token in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )
+        self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
+            name="vllm:time_per_output_token_seconds",
+            description="Histogram of time per output token in seconds.",
+            kind=pb_utils.MetricFamily.HISTOGRAM,
+        )

         # Initialize metrics
         # Iteration stats
@@ -55,6 +65,51 @@ def __init__(self, labels):
         self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
             labels=labels
         )
+        # Use the same bucket boundaries from vLLM sample metrics.
+        # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
+        self.histogram_time_to_first_token = (
+            self.histogram_time_to_first_token_family.Metric(
+                labels=labels,
+                buckets=[
+                    0.001,
+                    0.005,
+                    0.01,
+                    0.02,
+                    0.04,
+                    0.06,
+                    0.08,
+                    0.1,
+                    0.25,
+                    0.5,
+                    0.75,
+                    1.0,
+                    2.5,
+                    5.0,
+                    7.5,
+                    10.0,
+                ],
+            )
+        )
+        self.histogram_time_per_output_token = (
+            self.histogram_time_per_output_token_family.Metric(
+                labels=labels,
+                buckets=[
+                    0.01,
+                    0.025,
+                    0.05,
+                    0.075,
+                    0.1,
+                    0.15,
+                    0.2,
+                    0.3,
+                    0.4,
+                    0.5,
+                    0.75,
+                    1.0,
+                    2.5,
+                ],
+            )
+        )


 class VllmStatLogger(VllmStatLoggerBase):
@@ -82,6 +137,19 @@ def _log_counter(self, counter, data: Union[int, float]) -> None:
         if data != 0:
             counter.increment(data)

+    def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
+        """Convenience function for logging list to histogram.
+
+        Args:
+            histogram: A histogram metric instance.
+            data: A list of int or float data to observe into the histogram metric.
+
+        Returns:
+            None
+        """
+        for datum in data:
+            histogram.observe(datum)
+
     def log(self, stats: VllmStats) -> None:
         """Report stats to Triton metrics server.
@@ -97,3 +165,10 @@ def log(self, stats: VllmStats) -> None:
         self._log_counter(
             self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
         )
+        self._log_histogram(
+            self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
+        )
+        self._log_histogram(
+            self.metrics.histogram_time_per_output_token,
+            stats.time_per_output_tokens_iter,
+        )
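To make the histogram arithmetic concrete, here is a small self-contained sketch of the cumulative bucket semantics behind `histogram.observe()`. The `CumulativeHistogram` class is purely illustrative; the real metric object is created via `pb_utils.MetricFamily(...).Metric(...)` as in the diff above. It shows how fifteen per-token latencies would produce the `le="0.01"` / `le="0.025"` / `le="+Inf"` counts in the README sample.

```python
import math


# Illustrative stand-in for a Prometheus-style histogram; the real metric
# object comes from pb_utils.MetricFamily(...).Metric(...) as shown above.
class CumulativeHistogram:
    def __init__(self, buckets):
        self.bounds = list(buckets) + [math.inf]  # le="+Inf" is implicit
        self.counts = [0] * len(self.bounds)  # cumulative count per bound
        self.total = 0.0  # becomes the _sum sample
        self.n = 0  # becomes the _count sample

    def observe(self, value):
        self.n += 1
        self.total += value
        # An observation increments every bucket whose upper bound covers it.
        for i, bound in enumerate(self.bounds):
            if value <= bound:
                self.counts[i] += 1


h = CumulativeHistogram([0.01, 0.025, 0.05])
for latency in [0.003] * 14 + [0.012]:  # 15 observations, one above 0.01
    h.observe(latency)
print(h.n, h.counts)  # 15 [14, 15, 15, 15] -- le="+Inf" always equals _count
```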
