Moved ASCII image to README.md, and added grafana image

keivenchang · keivenchang · commit ee91fd7f8056 · 2025-06-12T18:05:08.000-07:00
diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md
@@ -7,27 +7,81 @@ This directory contains configuration for visualizing metrics from the metrics a
 - **Prometheus**: Collects and stores metrics from the service
 - **Grafana**: Provides visualization dashboards for the metrics
 
+## Topology
+
+Default Service Relationship Diagram:
+```
+     ┌─────────────┐    ┌─────────────┐    ┌─────────────┐
+     │ nats-server │    │ etcd-server │    │dcgm-exporter│
+     │   :4222     │    │   :2379     │    │   :9400     │
+     │   :6222     │    │   :2380     │    │             │
+     │   :8222     │    │             │    │             │
+     └──────┬──────┘    └──────┬──────┘    └──────┬──────┘
+            │                  │                  │
+            │ :8222/varz       │ :2379/metrics    │ :9400/metrics
+            │                  │                  │
+            ▼                  │                  │
+     ┌─────────────┐           │                  │
+     │nats-prom-exp│           │                  │
+     │   :7777     │           │                  │
+     │             │           │                  │
+     │  /metrics   │           │                  │
+     └──────┬──────┘           │                  │
+            │                  │                  │
+            │ :7777/metrics    │                  │
+            │                  │                  │
+            ▼                  ▼                  ▼
+     ┌─────────────────────────────────────────────────┐
+     │                prometheus                       │
+     │                  :9090                          │
+     │                                                 │
+     │  scrapes: nats-prom-exp:7777/metrics            │
+     │           etcd-server:2379/metrics              │
+     │           dcgm-exporter:9400/metrics            │
+     └──────────────────┬──────────────────────────────┘
+                        │
+                        │ :9090/query API
+                        │
+                        ▼
+                ┌─────────────┐
+                │   grafana   │
+                │    :3001    │
+                │             │
+                └─────────────┘
+```
+
+Networks:
+  - monitoring: nats-prom-exp, etcd-server, dcgm-exporter, prometheus, grafana
+  - default: nats-server (accessible via host network)
+
 ## Getting Started
 
 1. Make sure Docker and Docker Compose are installed on your system
 
-2. Start the `components/metrics` application to begin monitoring for metric events from dynamo workers
-   and aggregating them on a prometheus metrics endpoint: `http://localhost:9091/metrics`.
+2. Start the visualization stack:
 
-3. Start worker(s) that publishes KV Cache metrics.
-  - For quick testing, `examples/rust/service_metrics/bin/server.rs` can populate dummy KV Cache metrics.
-  - For a real workflow with real data, see the KV Routing example in `examples/python_rs/llm/vllm`.
+   ```bash
+   docker compose --profile metrics up -d
+   ```
 
-4. Start the visualization stack:
+3. The following web servers are now running. Endpoints ending in `/metrics` serve metrics in Prometheus format:
+   - Grafana: `http://localhost:3001` (default login: dynamo/dynamo)
+   - Prometheus Server: `http://localhost:9090`
+   - NATS Server: `http://localhost:8222` (monitoring endpoints: /varz, /healthz, etc.)
+   - NATS Prometheus Exporter: `http://localhost:7777/metrics`
+   - etcd Server: `http://localhost:2379/metrics`
+   - DCGM Exporter: `http://localhost:9401/metrics`
 
-  ```bash
-  docker compose --profile metrics up -d
-  ```
+4. Optionally, to experiment further:
+   Start the `components/metrics` application, which monitors metric events from dynamo workers
+   and aggregates them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`.
+
+   Then uncomment the `metrics-aggregation-service` job in `prometheus.yml` so that Prometheus scrapes this endpoint.
+
+5. Optionally, start worker(s) that publish KV Cache metrics:
+   - For quick testing, `examples/rust/service_metrics/bin/server.rs` can populate dummy KV Cache metrics.
+   - For a real workflow with real data, see the KV Routing example in `examples/python_rs/llm/vllm`.
 
-5. Web servers started:
-   - Grafana: `http://localhost:3001` (default login: admin/admin) (started by docker compose)
-   - Prometheus Server: `http://localhost:9090` (started by docker compose)
-   - Prometheus Metrics Endpoint: `http://localhost:9091/metrics` (started by `components/metrics` application)
 
 ## Configuration
 
@@ -42,6 +96,7 @@ Note: You may need to adjust the target based on your host configuration and net
 Grafana is pre-configured with:
 - Prometheus datasource
 - Sample dashboard for visualizing service metrics
+![grafana image](./grafana1.png)
 
 ## Required Files
 
diff --git a/deploy/metrics/docker-compose.yml b/deploy/metrics/docker-compose.yml
@@ -13,51 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#
-# Service Relationship Diagram:
-#
-#     ┌─────────────┐    ┌─────────────┐    ┌─────────────┐
-#     │ nats-server │    │ etcd-server │    │dcgm-exporter│
-#     │   :4222     │    │   :2379     │    │   :9400     │
-#     │   :6222     │    │   :2380     │    │             │
-#     │   :8222     │    │             │    │             │
-#     └──────┬──────┘    └──────┬──────┘    └──────┬──────┘
-#            │                  │                  │
-#            │ :8222/varz       │ :2379/metrics    │ :9400/metrics
-#            │                  │                  │
-#            ▼                  │                  │
-#     ┌─────────────┐           │                  │
-#     │nats-prom-exp│           │                  │
-#     │   :7777     │           │                  │
-#     │             │           │                  │
-#     │  /metrics   │           │                  │
-#     └──────┬──────┘           │                  │
-#            │                  │                  │
-#            │ :7777/metrics    │                  │
-#            │                  │                  │
-#            ▼                  ▼                  ▼
-#     ┌─────────────────────────────────────────────────┐
-#     │                prometheus                       │
-#     │                  :9090                          │
-#     │                                                 │
-#     │  scrapes: nats-prom-exp:7777/metrics            │
-#     │           etcd-server:2379/metrics              │
-#     │           dcgm-exporter:9400/metrics            │
-#     └──────────────────┬──────────────────────────────┘
-#                        │
-#                        │ :9090/query API
-#                        │
-#                        ▼
-#                ┌─────────────┐
-#                │   grafana   │
-#                │    :3001    │
-#                │             │
-#                └─────────────┘
-#
-# Networks:
-#   - monitoring: nats-prom-exp, etcd-server, dcgm-exporter, prometheus, grafana
-#   - default: nats-server (accessible via host network)
-#
 networks:
   server:
     driver: bridge
@@ -106,6 +61,8 @@ services:
     image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
     ports:
       - 9401:9400
+    cap_add:
+      - SYS_ADMIN
     deploy:
       resources:
         reservations:
diff --git a/deploy/metrics/grafana1.png b/deploy/metrics/grafana1.png
diff --git a/deploy/metrics/prometheus.yml b/deploy/metrics/prometheus.yml
@@ -31,10 +31,16 @@ scrape_configs:
   - job_name: 'dcgm-exporter'
     scrape_interval: 5s
     static_configs:
-      - targets: ['dcgm-exporter:9400']  # on the "monitoring" network
+      - targets: ['dcgm-exporter:9401']  # on the "monitoring" network
 
   # Uncomment to see its own Prometheus metrics
   # - job_name: 'prometheus'
   #   scrape_interval: 5s
   #   static_configs:
   #     - targets: ['prometheus:9090']  # on the "monitoring" network
+
+  # Uncomment to see the metrics-aggregation-service metrics
+  # - job_name: 'metrics-aggregation-service'
+  #   scrape_interval: 2s
+  #   static_configs:
+  #     - targets: ['host.docker.internal:9091']  # metrics aggregation service on host