Skip to content

Commit 4b91f8c

Browse files
committed
Add histogram test
1 parent 07f2575 commit 4b91f8c

File tree

2 files changed

+89
-10
lines changed

2 files changed

+89
-10
lines changed

ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -112,21 +112,28 @@ def vllm_infer(
112112
self.triton_client.stop_stream()
113113

114114
def test_vllm_metrics(self):
    """Run the vLLM inference helper once and validate the reported metrics.

    Token counters and histogram observation counts are compared against
    fixed expected values. The histogram ``_sum`` entries are measured
    latencies in seconds, so they are only checked for sign rather than
    against a fixed ceiling.
    """
    # Test vLLM metrics
    self.vllm_infer(
        prompts=self.prompts,
        sampling_parameters=self.sampling_parameters,
        model_name=self.vllm_model_name,
    )
    metrics_dict = self.get_metrics()

    # vllm:prompt_tokens_total
    self.assertEqual(metrics_dict["vllm:prompt_tokens_total"], 18)
    # vllm:generation_tokens_total
    self.assertEqual(metrics_dict["vllm:generation_tokens_total"], 48)
    # vllm:time_to_first_token_seconds
    self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
    # Latency sums depend on hardware and load; an absolute upper bound
    # would make this test flaky, so only require a positive value.
    self.assertGreater(metrics_dict["vllm:time_to_first_token_seconds_sum"], 0)
    # vllm:time_per_output_token_seconds
    self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
    # Same reasoning as above: check the sign only (zero is accepted here,
    # matching the original inclusive lower bound).
    self.assertGreaterEqual(
        metrics_dict["vllm:time_per_output_token_seconds_sum"], 0
    )
130137

131138
def tearDown(self):
132139
self.triton_client.close()

src/utils/metrics.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626

27-
from typing import Dict, Union
27+
from typing import Dict, List, Union
2828

2929
import triton_python_backend_utils as pb_utils
3030
from vllm.engine.metrics import StatLoggerBase as VllmStatLoggerBase
@@ -46,6 +46,16 @@ def __init__(self, labels):
4646
description="Number of generation tokens processed.",
4747
kind=pb_utils.MetricFamily.COUNTER,
4848
)
49+
self.histogram_time_to_first_token_family = pb_utils.MetricFamily(
50+
name="vllm:time_to_first_token_seconds",
51+
description="Histogram of time to first token in seconds.",
52+
kind=pb_utils.MetricFamily.HISTOGRAM,
53+
)
54+
self.histogram_time_per_output_token_family = pb_utils.MetricFamily(
55+
name="vllm:time_per_output_token_seconds",
56+
description="Histogram of time per output token in seconds.",
57+
kind=pb_utils.MetricFamily.HISTOGRAM,
58+
)
4959

5060
# Initialize metrics
5161
# Iteration stats
@@ -55,6 +65,49 @@ def __init__(self, labels):
5565
self.counter_generation_tokens = self.counter_generation_tokens_family.Metric(
5666
labels=labels
5767
)
68+
self.histogram_time_to_first_token = (
69+
self.histogram_time_to_first_token_family.Metric(
70+
labels=labels,
71+
buckets=[
72+
0.001,
73+
0.005,
74+
0.01,
75+
0.02,
76+
0.04,
77+
0.06,
78+
0.08,
79+
0.1,
80+
0.25,
81+
0.5,
82+
0.75,
83+
1.0,
84+
2.5,
85+
5.0,
86+
7.5,
87+
10.0,
88+
],
89+
)
90+
)
91+
self.histogram_time_per_output_token = (
92+
self.histogram_time_per_output_token_family.Metric(
93+
labels=labels,
94+
buckets=[
95+
0.01,
96+
0.025,
97+
0.05,
98+
0.075,
99+
0.1,
100+
0.15,
101+
0.2,
102+
0.3,
103+
0.4,
104+
0.5,
105+
0.75,
106+
1.0,
107+
2.5,
108+
],
109+
)
110+
)
58111

59112

60113
class VllmStatLogger(VllmStatLoggerBase):
@@ -93,6 +146,19 @@ def _log_counter(self, counter, data: Union[int, float]) -> None:
93146
"""
94147
if data != 0:
95148
counter.increment(data)
149+
150+
def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None:
    """Convenience function for logging a list of values to a histogram.

    Each element of ``data`` is passed to ``histogram.observe`` as a separate
    observation, in list order. An empty list records nothing.

    Args:
        histogram: A histogram metric instance exposing ``observe(value)``.
        data: A list of int or float data to observe into the histogram metric.

    Returns:
        None
    """
    for datum in data:
        histogram.observe(datum)
96162

97163
def log(self, stats: VllmStats) -> None:
98164
"""Logs tracked stats to triton metrics server every iteration.
@@ -108,4 +174,10 @@ def log(self, stats: VllmStats) -> None:
108174
)
109175
self._log_counter(
110176
self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter
177+
self._log_histogram(
178+
self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter
179+
)
180+
self._log_histogram(
181+
self.metrics.histogram_time_per_output_token,
182+
stats.time_per_output_tokens_iter,
111183
)

0 commit comments

Comments
 (0)