Commit a99d38b: "Add docs" (1 parent: 1158fee)

4 files changed: +42 -10 lines

README.md

Lines changed: 34 additions & 1 deletion
@@ -197,14 +197,47 @@ starting from 23.10 release.
 
 You can use `pip install ...` within the container to upgrade vLLM version.
 
-
 ## Running Multiple Instances of Triton Server
 
 If you are running multiple instances of Triton server with a Python-based backend,
 you need to specify a different `shm-region-prefix-name` for each server. See
 [here](https://github.com/triton-inference-server/python_backend#running-multiple-instances-of-triton-server)
 for more information.
 
+## Triton Metrics
+Starting with the 24.08 release of Triton, users can now obtain partial
+vLLM metrics by querying the Triton metrics endpoint (see the complete vLLM metrics
+[here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be
+accomplished by launching a Triton server in any of the ways described above
+(ensuring the build code / container is 24.08 or later) and querying the server.
+Upon receiving a successful response, you can query the metrics endpoint by entering
+the following:
+```bash
+curl localhost:8002/metrics
+```
+vLLM stats are reported by the metrics endpoint in fields that
+are prefixed with `vllm:`. Your output for these fields should look
+similar to the following:
+```bash
+# HELP vllm:prompt_tokens_total Number of prefill tokens processed.
+# TYPE vllm:prompt_tokens_total counter
+vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
+# HELP vllm:generation_tokens_total Number of generation tokens processed.
+# TYPE vllm:generation_tokens_total counter
+vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+```
+*Note:* vLLM metrics reporting is disabled by default due to potential
+performance slowdowns. To enable a vLLM model's metrics reporting, please add
+the following lines to its config.pbtxt:
+```bash
+parameters: {
+  key: "REPORT_CUSTOM_METRICS"
+  value: {
+    string_value:"yes"
+  }
+}
+```
+
 ## Referencing the Tutorial
 
 You can read further in the
ci/L0_backend_vllm/metrics_test/test.sh

Lines changed: 5 additions & 5 deletions
@@ -49,7 +49,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/v
 
 RET=0
 
-# Test disabling vLLM metrics reporting without parameter "REPORT_METRICS" in config.pbtxt
+# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt
 run_server
 if [ "$SERVER_PID" == "0" ]; then
 cat $SERVER_LOG
@@ -77,10 +77,10 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test disabling vLLM metrics reporting with parameter "REPORT_METRICS" set to "no" in config.pbtxt
+# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt
 echo -e "
 parameters: {
-key: \"REPORT_METRICS\"
+key: \"REPORT_CUSTOM_METRICS\"
 value: {
 string_value:\"no\"
 }
@@ -114,11 +114,11 @@ set -e
 kill $SERVER_PID
 wait $SERVER_PID
 
-# Test vLLM metrics reporting with parameter "REPORT_METRICS" set to "yes" in config.pbtxt
+# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt
 cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
 echo -e "
 parameters: {
-key: \"REPORT_METRICS\"
+key: \"REPORT_CUSTOM_METRICS\"
 value: {
 string_value:\"yes\"
 }

samples/model_repository/vllm_model/1/model.json

Lines changed: 1 addition & 2 deletions
@@ -2,6 +2,5 @@
 "model":"facebook/opt-125m",
 "disable_log_requests": true,
 "gpu_memory_utilization": 0.5,
-"enforce_eager": true,
-"disable_log_stats": false
+"enforce_eager": true
 }

src/model.py

Lines changed: 2 additions & 2 deletions
@@ -162,8 +162,8 @@ def init_engine(self):
 
         # Create vLLM custom metrics
         if (
-            "REPORT_METRICS" in self.model_config["parameters"]
-            and self.model_config["parameters"]["REPORT_METRICS"]["string_value"]
+            "REPORT_CUSTOM_METRICS" in self.model_config["parameters"]
+            and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"]
             == "yes"
         ):
             try: