@@ -20,16 +20,30 @@ import (
20
20
// Metric names for the workspace controller. Each value is passed as the Name
// of a Prometheus collector's options.
const (
	// Startup.
	workspaceStartupSeconds     string = "workspace_startup_seconds"
	workspaceStartFailuresTotal string = "workspace_starts_failure_total" // workspaces that failed to start

	// Failures and stops.
	workspaceFailuresTotal string = "workspace_failure_total" // workspaces that had a failed condition
	workspaceStopsTotal    string = "workspace_stops_total"

	// Backups.
	workspaceBackupsTotal        string = "workspace_backups_total"
	workspaceBackupFailuresTotal string = "workspace_backups_failure_total"

	// Restores.
	workspaceRestoresTotal        string = "workspace_restores_total"
	workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
)
29
30
31
// StopReason names the value recorded in the reason label of the workspace
// stops counter.
type StopReason string

// Reason label values chosen by countWorkspaceStop.
//
// These are untyped string constants rather than StopReason values, so they
// can be assigned directly to plain string variables.
const (
	// StopReasonFailed: the workspace has a Failed condition.
	StopReasonFailed = "failed"
	// StopReasonStartFailure: the workspace has a Failed condition and its
	// EverReady condition is not present and true.
	StopReasonStartFailure = "start-failure"
	// StopReasonAborted: the Aborted condition is present and true.
	StopReasonAborted = "aborted"
	// StopReasonOutOfSpace: the Failed condition message reports that pod
	// ephemeral local storage usage exceeded the containers' total limit.
	StopReasonOutOfSpace = "out-of-space"
	// StopReasonTimeout: the Timeout condition is present and true.
	StopReasonTimeout = "timeout"
	// StopReasonTabClosed: the Closed condition is present and true.
	StopReasonTabClosed = "tab-closed"
	// StopReasonRegular: none of the other reasons applied.
	StopReasonRegular = "regular-stop"
)
42
+
30
43
type controllerMetrics struct {
31
44
startupTimeHistVec * prometheus.HistogramVec
32
45
totalStartsFailureCounterVec * prometheus.CounterVec
46
+ totalFailuresCounterVec * prometheus.CounterVec
33
47
totalStopsCounterVec * prometheus.CounterVec
34
48
35
49
totalBackupCounterVec * prometheus.CounterVec
@@ -64,6 +78,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
64
78
Name : workspaceStartFailuresTotal ,
65
79
Help : "total number of workspaces that failed to start" ,
66
80
}, []string {"type" , "class" }),
81
+ totalFailuresCounterVec : prometheus .NewCounterVec (prometheus.CounterOpts {
82
+ Namespace : metricsNamespace ,
83
+ Subsystem : metricsWorkspaceSubsystem ,
84
+ Name : workspaceFailuresTotal ,
85
+ Help : "total number of workspaces that had a failed condition" ,
86
+ }, []string {"type" , "class" }),
67
87
totalStopsCounterVec : prometheus .NewCounterVec (prometheus.CounterOpts {
68
88
Namespace : metricsNamespace ,
69
89
Subsystem : metricsWorkspaceSubsystem ,
@@ -126,11 +146,42 @@ func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *wo
126
146
counter .Inc ()
127
147
}
128
148
149
+ func (m * controllerMetrics ) countWorkspaceFailure (log * logr.Logger , ws * workspacev1.Workspace ) {
150
+ class := ws .Spec .Class
151
+ tpe := string (ws .Spec .Type )
152
+
153
+ counter , err := m .totalFailuresCounterVec .GetMetricWithLabelValues (tpe , class )
154
+ if err != nil {
155
+ log .Error (err , "could not count workspace failure" , "type" , tpe , "class" , class )
156
+ }
157
+
158
+ counter .Inc ()
159
+ }
160
+
129
161
func (m * controllerMetrics ) countWorkspaceStop (log * logr.Logger , ws * workspacev1.Workspace ) {
162
+ var reason string
163
+ if c := wsk8s .GetCondition (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionFailed )); c != nil {
164
+ reason = StopReasonFailed
165
+ if ! wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionEverReady )) {
166
+ // Don't record 'failed' if there was a start failure.
167
+ reason = StopReasonStartFailure
168
+ } else if strings .Contains (c .Message , "Pod ephemeral local storage usage exceeds the total limit of containers" ) {
169
+ reason = StopReasonOutOfSpace
170
+ }
171
+ } else if wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionAborted )) {
172
+ reason = StopReasonAborted
173
+ } else if wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionTimeout )) {
174
+ reason = StopReasonTimeout
175
+ } else if wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionClosed )) {
176
+ reason = StopReasonTabClosed
177
+ } else {
178
+ reason = StopReasonRegular
179
+ }
180
+
130
181
class := ws .Spec .Class
131
182
tpe := string (ws .Spec .Type )
132
183
133
- counter , err := m .totalStopsCounterVec .GetMetricWithLabelValues ("unknown" , tpe , class )
184
+ counter , err := m .totalStopsCounterVec .GetMetricWithLabelValues (reason , tpe , class )
134
185
if err != nil {
135
186
log .Error (err , "could not count workspace stop" , "reason" , "unknown" , "type" , tpe , "class" , class )
136
187
}
@@ -210,6 +261,7 @@ type metricState struct {
210
261
recordedStartTime bool
211
262
recordedInitFailure bool
212
263
recordedStartFailure bool
264
+ recordedFailure bool
213
265
recordedContentReady bool
214
266
recordedBackupFailed bool
215
267
recordedBackupCompleted bool
@@ -223,7 +275,8 @@ func newMetricState(ws *workspacev1.Workspace) metricState {
223
275
// each workspace.
224
276
recordedStartTime : ws .Status .Phase == workspacev1 .WorkspacePhaseRunning ,
225
277
recordedInitFailure : wsk8s .ConditionWithStatusAndReason (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionContentReady ), false , workspacev1 .ReasonInitializationFailure ),
226
- recordedStartFailure : wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionFailed )),
278
+ recordedStartFailure : ws .Status .Phase == workspacev1 .WorkspacePhaseStopped && ! wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionEverReady )),
279
+ recordedFailure : wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionFailed )),
227
280
recordedContentReady : wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionContentReady )),
228
281
recordedBackupFailed : wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionBackupFailure )),
229
282
recordedBackupCompleted : wsk8s .ConditionPresentAndTrue (ws .Status .Conditions , string (workspacev1 .WorkspaceConditionBackupComplete )),
@@ -245,6 +298,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
245
298
m .startupTimeHistVec .Describe (ch )
246
299
m .totalStopsCounterVec .Describe (ch )
247
300
m .totalStartsFailureCounterVec .Describe (ch )
301
+ m .totalFailuresCounterVec .Describe (ch )
248
302
249
303
m .totalBackupCounterVec .Describe (ch )
250
304
m .totalBackupFailureCounterVec .Describe (ch )
@@ -260,6 +314,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
260
314
m .startupTimeHistVec .Collect (ch )
261
315
m .totalStopsCounterVec .Collect (ch )
262
316
m .totalStartsFailureCounterVec .Collect (ch )
317
+ m .totalFailuresCounterVec .Collect (ch )
263
318
264
319
m .totalBackupCounterVec .Collect (ch )
265
320
m .totalBackupFailureCounterVec .Collect (ch )
0 commit comments