Commit 4c74307: Add histogram test
Parent: 05c5a8b
File tree: 6 files changed, +390 −2 lines
README.md
Lines changed: 3 additions & 1 deletion

````diff
@@ -111,7 +111,9 @@ container with the following commands:

 ```
 mkdir -p /opt/tritonserver/backends/vllm
-wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py
+git clone https://github.com/triton-inference-server/vllm_backend.git /opt/tritonserver/backends/vllm/vllm_backend
+cp -r /opt/tritonserver/backends/vllm/vllm_backend/src/* /opt/tritonserver/backends/vllm
+rm -rf /opt/tritonserver/backends/vllm/vllm_backend
 ```

 ## Using the vLLM Backend
````
ci/L0_backend_vllm/metrics_test/test.sh (new file)
Lines changed: 98 additions & 0 deletions
```bash
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_opt --log-verbose=1"
SERVER_LOG="./vllm_metrics_server.log"
CLIENT_LOG="./vllm_metrics_client.log"
TEST_RESULT_FILE='test_results.txt'
CLIENT_PY="./vllm_metrics_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=1

# Helpers =======================================
# Expects the HTTP status of the most recent curl call in $code and its
# body in ./curl.out; flags the test run as failed on any non-200 response.
function assert_curl_success {
  message="${1}"
  if [ "$code" != "200" ]; then
    cat ./curl.out
    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
    RET=1
  fi
}

rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
# The `vllm_opt` model will be loaded on server start and stay loaded
# throughout unit testing. To ensure that vLLM's memory profiler will not
# error out on `vllm_load_test` load, we reduce "gpu_memory_utilization"
# for `vllm_opt`, so that at least 60% of GPU memory is available for
# other models.
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json

RET=0

run_server
if [ "$SERVER_PID" == "0" ]; then
    cat $SERVER_LOG
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    exit 1
fi

set +e
python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
    cat $CLIENT_LOG
    echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***"
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
        RET=1
    fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
rm -rf "./models"

if [ $RET -eq 1 ]; then
    cat $CLIENT_LOG
    cat $SERVER_LOG
    echo -e "\n***\n*** vLLM test FAILED. \n***"
else
    echo -e "\n***\n*** vLLM test PASSED. \n***"
fi

collect_artifacts_from_subdir
exit $RET
```
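The `sed` line in the script above rewrites a single vLLM engine argument in the copied model config. A minimal Python sketch of the same adjustment, assuming (as the sed pattern implies) that `model.json` is a flat JSON object with a top-level `"gpu_memory_utilization"` key:

```python
# Illustrative equivalent of the sed edit above; assumes model.json is a
# flat JSON object with a top-level "gpu_memory_utilization" key.
import json

path = "models/vllm_opt/1/model.json"
with open(path) as f:
    config = json.load(f)

config["gpu_memory_utilization"] = 0.4  # leave headroom for other models
with open(path, "w") as f:
    json.dump(config, f, indent=4)
```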
ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py (new file)
Lines changed: 141 additions & 0 deletions
```python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import sys
import unittest
from functools import partial

import requests
import tritonclient.grpc as grpcclient
from tritonclient.utils import *

sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request


class VLLMTritonMetricsTest(TestResultCollector):
    def setUp(self):
        self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
        self.tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")
        self.vllm_model_name = "vllm_opt"
        self.prompts = [
            "The most dangerous animal is",
            "The capital of France is",
            "The future of AI is",
        ]
        self.sampling_parameters = {"temperature": "0", "top_p": "1"}

    def get_metrics(self):
        """
        Store vllm metrics in a dictionary.
        """
        r = requests.get(f"http://{self.tritonserver_ipaddr}:8002/metrics")
        r.raise_for_status()

        # Match Prometheus-style lines such as
        # `vllm:metric_name{label="..."} 1.23`, capturing the metric
        # name and its numeric value.
        pattern = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$"
        vllm_dict = {}

        # Find all matches in the text
        matches = re.findall(pattern, r.text, re.MULTILINE)

        for match in matches:
            key, value = match
            vllm_dict[key] = float(value) if "." in value else int(value)

        return vllm_dict

    def vllm_async_stream_infer(
        self,
        prompts,
        sampling_parameters,
        stream,
        send_parameters_as_tensor,
        model_name,
    ):
        """
        Helper function to send async stream infer requests to vLLM.
        """
        user_data = UserData()
        number_of_vllm_reqs = len(prompts)

        self.triton_client.start_stream(callback=partial(callback, user_data))
        for i in range(number_of_vllm_reqs):
            request_data = create_vllm_request(
                prompts[i],
                i,
                stream,
                sampling_parameters,
                model_name,
                send_parameters_as_tensor,
            )
            self.triton_client.async_stream_infer(
                model_name=model_name,
                request_id=request_data["request_id"],
                inputs=request_data["inputs"],
                outputs=request_data["outputs"],
                parameters=sampling_parameters,
            )

        for _ in range(number_of_vllm_reqs):
            result = user_data._completed_requests.get()
            if type(result) is InferenceServerException:
                print(result.message())
            self.assertIsNot(type(result), InferenceServerException, str(result))

            output = result.as_numpy("text_output")
            self.assertIsNotNone(output, "`text_output` should not be None")

        self.triton_client.stop_stream()

    def test_vllm_metrics(self):
        # Test vLLM metrics
        self.vllm_async_stream_infer(
            prompts=self.prompts,
            sampling_parameters=self.sampling_parameters,
            stream=False,
            send_parameters_as_tensor=True,
            model_name=self.vllm_model_name,
        )
        metrics_dict = self.get_metrics()

        # One histogram observation per request for time-to-first-token,
        # and one per subsequent output token.
        self.assertEqual(metrics_dict["vllm:time_to_first_token_seconds_count"], 3)
        self.assertTrue(
            0.0001 < metrics_dict["vllm:time_to_first_token_seconds_sum"] < 0.0003
        )
        self.assertEqual(metrics_dict["vllm:time_per_output_token_seconds_count"], 45)
        self.assertTrue(
            0.001 <= metrics_dict["vllm:time_per_output_token_seconds_sum"] <= 0.003
        )

    def tearDown(self):
        self.triton_client.close()


if __name__ == "__main__":
    unittest.main()
```
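As a self-contained illustration of the parsing in `get_metrics`, the snippet below runs the same regular expression over a hypothetical `/metrics` excerpt. The metric names mirror the ones the test asserts on; the label set and the values are made up:

```python
import re

# Hypothetical /metrics excerpt; the names mirror the test's assertions,
# the labels and values are illustrative only.
sample = """\
vllm:time_to_first_token_seconds_count{model="vllm_opt",version="1"} 3
vllm:time_to_first_token_seconds_sum{model="vllm_opt",version="1"} 0.00022
vllm:time_per_output_token_seconds_count{model="vllm_opt",version="1"} 45
vllm:time_per_output_token_seconds_sum{model="vllm_opt",version="1"} 0.0021
"""

pattern = r"^(vllm:[^ {]+)(?:{.*})? ([0-9.-]+)$"
parsed = {
    key: float(value) if "." in value else int(value)
    for key, value in re.findall(pattern, sample, re.MULTILINE)
}
print(parsed["vllm:time_to_first_token_seconds_count"])  # -> 3 (an int)
print(parsed["vllm:time_per_output_token_seconds_sum"])  # -> 0.0021 (a float)
```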

ci/L0_backend_vllm/test.sh
Lines changed: 1 addition & 1 deletion

```diff
@@ -26,7 +26,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 RET=0
-SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend"
+SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

 python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc]
```
src/model.py
Lines changed: 10 additions & 0 deletions

```diff
@@ -39,6 +39,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid

+from utils.metrics import VllmStatLogger
+
 _VLLM_ENGINE_ARGS_FILENAME = "model.json"
 _MULTI_LORA_ARGS_FILENAME = "multi_lora.json"

@@ -151,6 +153,14 @@ def init_engine(self):
             AsyncEngineArgs(**self.vllm_engine_config)
         )

+        # Create vLLM custom metrics
+        labels = {
+            "model": self.args["model_name"],
+            "version": self.args["model_version"],
+        }
+        logger = VllmStatLogger(labels=labels)
+        self.llm_engine.add_logger("triton", logger)
+
     def setup_lora(self):
         self.enable_lora = False
```
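The imported `utils.metrics.VllmStatLogger` is among the six changed files, but its source is not shown in this excerpt. As a rough sketch of the shape such a logger takes, assuming vLLM's `StatLoggerBase`/`Stats` interface and per-iteration latency lists like `time_to_first_tokens_iter` (everything below is an illustration, not the actual file):

```python
# Illustrative sketch only: the real utils/metrics.py is not shown in this
# diff. Assumes vLLM exposes StatLoggerBase and Stats in vllm.engine.metrics,
# and that Stats carries per-iteration latency lists such as
# time_to_first_tokens_iter / time_per_output_tokens_iter.
from vllm.engine.metrics import StatLoggerBase, Stats


class VllmStatLogger(StatLoggerBase):
    def __init__(self, labels: dict) -> None:
        self.labels = labels  # e.g. {"model": "vllm_opt", "version": "1"}

    def info(self, type: str, obj) -> None:
        # One-time engine info; nothing to record for histograms.
        pass

    def log(self, stats: Stats) -> None:
        # Called once per engine iteration: forward each observed latency
        # to a histogram (a real backend would use Triton's metrics API).
        for ttft in stats.time_to_first_tokens_iter:
            self._observe("vllm:time_to_first_token_seconds", ttft)
        for tpot in stats.time_per_output_tokens_iter:
            self._observe("vllm:time_per_output_token_seconds", tpot)

    def _observe(self, name: str, value: float) -> None:
        print(name, self.labels, value)  # stand-in for a histogram observe
```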
