
feat(FT): Enable Prometheus and Grafana in the metrics group, running… #1488


Open · wants to merge 1 commit into base: main
79 changes: 65 additions & 14 deletions deploy/metrics/README.md
@@ -7,27 +7,78 @@ This directory contains configuration for visualizing metrics from the metrics a
- **Prometheus**: Collects and stores metrics from the service
- **Grafana**: Provides visualization dashboards for the metrics

## Topology

Default Service Relationship Diagram:
```text
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ nats-server │ │ etcd-server │ │dcgm-exporter│
│ :4222 │ │ :2379 │ │ :9400 │
│ :6222 │ │ :2380 │ │ │
│ :8222 │ │ │ │ │
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
│ │ │
│ :8222/varz │ :2379/metrics │ :9400/metrics
│ │ │
▼ │ │
┌─────────────┐ │ │
│nats-prom-exp│ │ │
│ :7777 │ │ │
│ │ │ │
│ /metrics │ │ │
└──────┬──────┘ │ │
│ │ │
│ :7777/metrics │ │
│ │ │
▼ ▼ ▼
┌─────────────────────────────────────────────────┐
│ prometheus │
│ :9090 │
│ │
│ scrapes: nats-prom-exp:7777/metrics │
│ etcd-server:2379/metrics │
│ dcgm-exporter:9400/metrics │
└──────────────────┬──────────────────────────────┘
│ :9090/query API
┌─────────────┐
│ grafana │
│ :3001 │
│ │
└─────────────┘
```

Networks:
- monitoring: nats-prom-exp, etcd-server, dcgm-exporter, prometheus, grafana
- default: nats-server (accessible via host network)
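
For example, the `:9090/query API` edge in the diagram can be exercised directly over HTTP once the stack is running. A minimal sketch, assuming the default port mapping and using the metric selector noted as an example in the compose file:

```bash
# Query Prometheus's HTTP API for DCGM metrics scraped from dcgm-exporter.
curl -sG 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query={__name__=~"DCGM.*", job="dcgm-exporter"}'
```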

## Getting Started

1. Make sure Docker and Docker Compose are installed on your system

2. Start the `components/metrics` application to begin monitoring for metric events from dynamo workers
and aggregating them on a prometheus metrics endpoint: `http://localhost:9091/metrics`.
2. Start the visualization stack:

3. Start worker(s) that publish KV Cache metrics.
- For quick testing, `examples/rust/service_metrics/bin/server.rs` can populate dummy KV Cache metrics.
- For a real workflow with real data, see the KV Routing example in `examples/python_rs/llm/vllm`.
```bash
docker compose --profile metrics up -d
```

4. Start the visualization stack:
3. The following web servers are now running; the ones whose URLs end in /metrics serve Prometheus-format metrics (a quick smoke test of these endpoints is sketched after this list):
- Grafana: `http://localhost:3001` (default login: dynamo/dynamo)
- Prometheus Server: `http://localhost:9090`
- NATS Server: `http://localhost:8222` (monitoring endpoints: /varz, /healthz, etc.)
- NATS Prometheus Exporter: `http://localhost:7777/metrics`
- etcd Server: `http://localhost:2379/metrics`
- DCGM Exporter: `http://localhost:9401/metrics`

```bash
docker compose --profile metrics up -d
```
4. Optionally, if you want to experiment further, look through components/metrics/README.md for more details on launching a metrics server, mock_worker, and a real worker.

- Start the `components/metrics` application to begin monitoring for metric events from dynamo workers and aggregating them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`.
- Uncomment the appropriate lines in prometheus.yml to poll port 9091.
- Start worker(s) that publish KV Cache metrics: `examples/rust/service_metrics/bin/server.rs` can populate dummy KV Cache metrics.
- For a real workflow with real data, see the KV Routing example in `examples/python_rs/llm/vllm`.

5. Web servers started:
- Grafana: `http://localhost:3001` (default login: admin/admin) (started by docker compose)
- Prometheus Server: `http://localhost:9090` (started by docker compose)
- Prometheus Metrics Endpoint: `http://localhost:9091/metrics` (started by `components/metrics` application)
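
As a quick smoke test of the endpoints above (a sketch, assuming the default host ports and that the metrics profile is up):

```bash
# Each request should return HTTP 200 once the stack is healthy.
curl -sf -o /dev/null http://localhost:9090/-/healthy  && echo "prometheus ok"
curl -sf -o /dev/null http://localhost:3001/api/health && echo "grafana ok"
curl -sf -o /dev/null http://localhost:8222/healthz    && echo "nats ok"
curl -sf -o /dev/null http://localhost:7777/metrics    && echo "nats exporter ok"
curl -sf -o /dev/null http://localhost:2379/metrics    && echo "etcd ok"
```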

## Configuration

@@ -42,6 +93,7 @@ Note: You may need to adjust the target based on your host configuration and net
Grafana is pre-configured with:
- Prometheus datasource
- Sample dashboard for visualizing service metrics
![grafana image](./grafana1.png)

## Required Files

@@ -75,4 +127,3 @@ The prometheus metrics endpoint exposes the following metrics:
docker compose logs prometheus
docker compose logs grafana
```

103 changes: 76 additions & 27 deletions deploy/metrics/docker-compose.yml
@@ -13,28 +13,74 @@
# See the License for the specific language governing permissions and
# limitations under the License.

networks:
server:
driver: bridge
monitoring:
driver: bridge

# Note that the images are pinned to specific versions to avoid breaking changes.
services:
nats-server:
image: nats
command: [ "-js", "--trace" ]
image: nats:2.11.4
Contributor comment: will want to confirm with @nv-anants, @saturley-hall if ok to pin here

command: [ "-js", "--trace", "-m", "8222" ]
ports:
- 4222:4222
- 6222:6222
- 8222:8222
- 8222:8222 # the endpoints include /varz, /healthz, ...
Comment on lines +25 to +30
⚠️ Potential issue

Restrict NATS metrics port exposure
Binding port 8222 publicly (- 8222:8222) exposes internal metrics on all host interfaces. Switch to expose: ["8222"] or bind to localhost (127.0.0.1:8222:8222) to limit access.
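
A minimal sketch of either option the reviewer suggests (not part of this PR's diff):

```yaml
nats-server:
  ports:
    - "127.0.0.1:8222:8222"   # option 1: bind the monitoring endpoint to localhost only
  # option 2: drop the host mapping and keep the port internal to the compose networks
  # expose:
  #   - "8222"
```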


networks:
- server
- monitoring

etcd-server:
image: bitnami/etcd
image: bitnami/etcd:3.6.1
environment:
- ALLOW_NONE_AUTHENTICATION=yes
ports:
- 2379:2379
- 2379:2379 # this port exposes the /metrics endpoint
- 2380:2380
networks:
- server
- monitoring

# All the services below are part of the metrics profile and monitoring network.

# The exporter translates from /varz and other stats to Prometheus metrics
nats-prometheus-exporter:
image: natsio/prometheus-nats-exporter:0.17.3
command: ["-varz", "-connz", "-routez", "-subz", "-gatewayz", "-leafz", "-jsz=all", "http://nats-server:8222"]
ports:
- 7777:7777
networks:
- monitoring
Comment on lines +49 to +55
🛠️ Refactor suggestion

Limit NATS exporter to the monitoring network
Exposing 7777:7777 makes the exporter publicly accessible. Replace ports with expose: ["7777"] to keep it internal to the monitoring network.
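
Sketched, that suggestion for the exporter service (again, not in this PR's diff):

```yaml
nats-prometheus-exporter:
  expose:
    - "7777"   # reachable by prometheus over the monitoring network, not from the host
```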


profiles: [metrics]
depends_on:
- nats-server

dcgm-exporter:
image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
ports:
- 9401:9400 # Remap from 9400 to 9401 to avoid conflict with an existing dcgm-exporter (on dlcluster)
cap_add:
- SYS_ADMIN
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
- NVIDIA_VISIBLE_DEVICES=all # Make all GPUs visible to the container
runtime: nvidia # Specify the NVIDIA runtime
networks:
- monitoring

prometheus:
image: prom/prometheus:latest
image: prom/prometheus:v3.4.1
container_name: prometheus
volumes:
- ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
- ./prometheus.yml:/etc/prometheus/prometheus.yml
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
@@ -43,38 +89,41 @@ services:
- '--web.console.templates=/etc/prometheus/consoles'
- '--web.enable-lifecycle'
restart: unless-stopped
# TODO: Use more explicit networking setup when metrics is containerized
#ports:
# - "9090:9090"
#networks:
# - monitoring
network_mode: "host"
# Example to pull from the /query endpoint:
# {__name__=~"DCGM.*", job="dcgm-exporter"}
ports:
- "9090:9090"
networks:
- monitoring
profiles: [metrics]
depends_on:
- dcgm-exporter
- nats-prometheus-exporter
- etcd-server

# grafana connects to prometheus via the /query endpoint.
# Default credentials are dynamo/dynamo.
grafana:
image: grafana/grafana-enterprise:latest
image: grafana/grafana-enterprise:12.0.1
container_name: grafana
volumes:
- ./metrics/grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
- ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
- ./metrics/grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
- ./grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
- ./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
- ./grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
environment:
# Port 3000 is used by "dynamo serve", so use 3001
# Port 3000 is already used by "dynamo serve", so use 3001
- GF_SERVER_HTTP_PORT=3001
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_SECURITY_ADMIN_USER=dynamo
- GF_SECURITY_ADMIN_PASSWORD=dynamo
- GF_USERS_ALLOW_SIGN_UP=false
- GF_INSTALL_PLUGINS=grafana-piechart-panel
# Default min interval is 5s, but can be configured lower
- GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
restart: unless-stopped
# TODO: Use more explicit networking setup when metrics is containerized
#ports:
# - "3001:3001"
#networks:
# - monitoring
network_mode: "host"
ports:
- "3001:3001"
networks:
Comment on lines +107 to +125
🛠️ Refactor suggestion

Secure Grafana credentials and port configuration

Changing Grafana’s HTTP port to 3001 is fine, but verify it doesn’t conflict with other host services. Avoid hardcoding admin credentials in the Compose file; use Docker secrets or external environment variables to manage them securely.
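
One possible shape for that suggestion, with the host-side variable names being hypothetical (e.g. supplied via an .env file next to the compose file):

```yaml
grafana:
  environment:
    - GF_SERVER_HTTP_PORT=3001
    - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-dynamo}
    - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set this in .env}
```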


- monitoring
profiles: [metrics]
depends_on:
- prometheus

4 changes: 1 addition & 3 deletions deploy/metrics/grafana-datasources.yml
@@ -19,7 +19,5 @@ datasources:
- name: prometheus
type: prometheus
access: proxy
# TODO: Use proper docker networking
# url: http://prometheus:9090
url: http://localhost:9090
url: http://prometheus:9090
isDefault: true
Binary file added deploy/metrics/grafana1.png
33 changes: 27 additions & 6 deletions deploy/metrics/prometheus.yml
@@ -14,12 +14,33 @@
# limitations under the License.

global:
scrape_interval: 1s
evaluation_interval: 1s
scrape_interval: 10s
evaluation_interval: 10s

scrape_configs:
- job_name: 'count'
- job_name: 'nats-prometheus-exporter'
scrape_interval: 2s
static_configs:
# TODO: Use proper docker networking
# - targets: ['host.docker.internal:9091']
- targets: ['localhost:9091']
- targets: ['nats-prometheus-exporter:7777'] # on the "monitoring" network

- job_name: 'etcd-server'
scrape_interval: 2s
static_configs:
- targets: ['etcd-server:2379'] # etcd-server is on the "monitoring" network

- job_name: 'dcgm-exporter'
scrape_interval: 5s
static_configs:
- targets: ['dcgm-exporter:9401'] # on the "monitoring" network

# Uncomment to see its own Prometheus metrics
# - job_name: 'prometheus'
# scrape_interval: 5s
# static_configs:
# - targets: ['prometheus:9090'] # on the "monitoring" network

# Uncomment to see the metrics-aggregation-service metrics
# - job_name: 'metrics-aggregation-service'
# scrape_interval: 2s
# static_configs:
# - targets: ['host.docker.internal:9091'] # metrics aggregation service on host
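
Because the Prometheus container is started with `--web.enable-lifecycle` (see the compose file above), edits such as uncommenting one of the jobs here can be applied without restarting the container. A minimal sketch, assuming the default 9090 mapping:

```bash
# Ask Prometheus to reload prometheus.yml in place.
curl -X POST http://localhost:9090/-/reload
```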