Skip to content

Commit d2c0ed2

Browse files
authored
[observability] update dashboard not all ready alert (#19317)
1 parent 827a1a0 commit d2c0ed2

File tree

2 files changed

+11
-14
lines changed

2 files changed

+11
-14
lines changed

operations/observability/mixins/meta/rules/dashboard.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,14 @@ spec:
2222
summary: Dashboard has excessive CPU usage.
2323
description: Dashboard is consuming too much CPU. Please investigate.
2424
dashboard_url: https://grafana.gitpod.io/d/6581e46e4e5c7ba40a07646395ef7b23/kubernetes-compute-resources-pod?var-cluster={{ $labels.cluster }}&var-namespace=default
25+
- alert: DashboardPodsAreNotAllInReadyState
26+
expr: sum(kube_deployment_status_replicas_unavailable{deployment="dashboard"}) > 0
27+
for: 5m
28+
labels:
29+
severity: critical
30+
team: webapp
31+
dedicated: included
32+
annotations:
33+
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/DashboardStuckInPodInitState.md
34+
summary: Dashboard pods are not all in ready state in {{ $labels.cluster }}.
35+
description: Dashboard has had unavailable replicas for at least 5 minutes

operations/observability/mixins/meta/rules/server.yaml

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -97,20 +97,6 @@ spec:
9797
description: Server is consuming too much CPU. Please investigate.
9898
dashboard_url: https://grafana.gitpod.io/d/6581e46e4e5c7ba40a07646395ef7b23/kubernetes-compute-resources-pod?var-cluster={{ $labels.cluster }}&var-namespace=default
9999

100-
- alert: DashboardStuckInPodInitState
101-
# Reasoning: alert if dashboard is stuck in init for more than 5 minutes.
102-
expr: sum(kube_pod_container_status_waiting_reason{container="dashboard", reason="PodInitializing"}) by (container) > 0
103-
# Five minutes sounds high, but it's the only value that's higher than what we've seen in recent history
104-
for: 5m
105-
labels:
106-
severity: critical
107-
team: webapp
108-
dedicated: included
109-
annotations:
110-
runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/DashboardStuckInPodInitState.md
111-
summary: Dashboard stuck in PodInitializing state {{ $labels.cluster }}.
112-
description: Dashboard is stuck in PodInitializing for at least 5 minutes
113-
114100
- alert: WebAppServicesCrashlooping
115101
# Reasoning: alert if any pod has been crashlooping for more than 5 minutes.
116102
expr: sum(increase(kube_pod_container_status_restarts_total{container!="POD", pod=~"(content-service|dashboard|db|proxy|server|ws-manager-bridge|usage)-.*"}[5m])) by (cluster, pod) > 0

0 commit comments

Comments
 (0)