
Commit 06ec36b

[ws-manager-mk2] Refactor metrics with EverReady condition (#17114)
* [ws-manager-mk2] Refactor metrics with EverReady condition
* Fix test, default failure message
* Add stop reason metric
1 parent 401cd01 · commit 06ec36b

File tree: 5 files changed, +157 -98 lines changed

components/ws-manager-api/go/crd/v1/workspace_types.go

Lines changed: 13 additions & 1 deletion
@@ -155,7 +155,7 @@ func (s *WorkspaceStatus) SetCondition(cond metav1.Condition) {
 	s.Conditions = wsk8s.AddUniqueCondition(s.Conditions, cond)
 }
 
-// +kubebuilder:validation:Enum=Deployed;Failed;Timeout;FirstUserActivity;Closed;HeadlessTaskFailed;StoppedByRequest;Aborted;ContentReady;BackupComplete;BackupFailure
+// +kubebuilder:validation:Enum=Deployed;Failed;Timeout;FirstUserActivity;Closed;HeadlessTaskFailed;StoppedByRequest;Aborted;ContentReady;EverReady;BackupComplete;BackupFailure
 type WorkspaceCondition string
 
 const (
@@ -188,6 +188,10 @@ const (
 	// ContentReady is true once the content initialisation is complete
 	WorkspaceConditionContentReady WorkspaceCondition = "ContentReady"
 
+	// EverReady is true if the workspace has ever been ready (content init
+	// succeeded and container is ready)
+	WorkspaceConditionEverReady WorkspaceCondition = "EverReady"
+
 	// BackupComplete is true once the backup has happened
 	WorkspaceConditionBackupComplete WorkspaceCondition = "BackupComplete"
 
@@ -272,6 +276,14 @@ func NewWorkspaceConditionContentReady(status metav1.ConditionStatus, reason, me
 	}
 }
 
+func NewWorkspaceConditionEverReady() metav1.Condition {
+	return metav1.Condition{
+		Type:               string(WorkspaceConditionEverReady),
+		LastTransitionTime: metav1.Now(),
+		Status:             metav1.ConditionTrue,
+	}
+}
+
 func NewWorkspaceConditionBackupComplete() metav1.Condition {
 	return metav1.Condition{
 		Type: string(WorkspaceConditionBackupComplete),
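
The new condition follows the constructor pattern of the existing ones. A minimal usage sketch, mirroring the status.go change further down (ws is assumed to be a *workspacev1.Workspace):

	// Set EverReady at most once; the condition is never cleared, so "has
	// ever been ready" survives later readiness flaps of the container.
	if !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
		ws.Status.SetCondition(workspacev1.NewWorkspaceConditionEverReady())
	}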

components/ws-manager-mk2/controllers/metrics.go

Lines changed: 57 additions & 2 deletions
@@ -20,16 +20,30 @@ import (
 const (
 	workspaceStartupSeconds       string = "workspace_startup_seconds"
 	workspaceStartFailuresTotal   string = "workspace_starts_failure_total"
+	workspaceFailuresTotal        string = "workspace_failure_total"
 	workspaceStopsTotal           string = "workspace_stops_total"
 	workspaceBackupsTotal         string = "workspace_backups_total"
 	workspaceBackupFailuresTotal  string = "workspace_backups_failure_total"
 	workspaceRestoresTotal        string = "workspace_restores_total"
 	workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
 )
 
+type StopReason string
+
+const (
+	StopReasonFailed       = "failed"
+	StopReasonStartFailure = "start-failure"
+	StopReasonAborted      = "aborted"
+	StopReasonOutOfSpace   = "out-of-space"
+	StopReasonTimeout      = "timeout"
+	StopReasonTabClosed    = "tab-closed"
+	StopReasonRegular      = "regular-stop"
+)
+
 type controllerMetrics struct {
 	startupTimeHistVec           *prometheus.HistogramVec
 	totalStartsFailureCounterVec *prometheus.CounterVec
+	totalFailuresCounterVec      *prometheus.CounterVec
 	totalStopsCounterVec         *prometheus.CounterVec
 
 	totalBackupCounterVec *prometheus.CounterVec
@@ -64,6 +78,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
 			Name:      workspaceStartFailuresTotal,
 			Help:      "total number of workspaces that failed to start",
 		}, []string{"type", "class"}),
+		totalFailuresCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: metricsNamespace,
+			Subsystem: metricsWorkspaceSubsystem,
+			Name:      workspaceFailuresTotal,
+			Help:      "total number of workspaces that had a failed condition",
+		}, []string{"type", "class"}),
 		totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: metricsNamespace,
 			Subsystem: metricsWorkspaceSubsystem,
@@ -126,11 +146,42 @@ func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *wo
 	counter.Inc()
 }
 
+func (m *controllerMetrics) countWorkspaceFailure(log *logr.Logger, ws *workspacev1.Workspace) {
+	class := ws.Spec.Class
+	tpe := string(ws.Spec.Type)
+
+	counter, err := m.totalFailuresCounterVec.GetMetricWithLabelValues(tpe, class)
+	if err != nil {
+		log.Error(err, "could not count workspace failure", "type", tpe, "class", class)
+	}
+
+	counter.Inc()
+}
+
 func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) {
+	var reason string
+	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)); c != nil {
+		reason = StopReasonFailed
+		if !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
+			// Don't record 'failed' if there was a start failure.
+			reason = StopReasonStartFailure
+		} else if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {
+			reason = StopReasonOutOfSpace
+		}
+	} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionAborted)) {
+		reason = StopReasonAborted
+	} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
+		reason = StopReasonTimeout
+	} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed)) {
+		reason = StopReasonTabClosed
+	} else {
+		reason = StopReasonRegular
+	}
+
 	class := ws.Spec.Class
 	tpe := string(ws.Spec.Type)
 
-	counter, err := m.totalStopsCounterVec.GetMetricWithLabelValues("unknown", tpe, class)
+	counter, err := m.totalStopsCounterVec.GetMetricWithLabelValues(reason, tpe, class)
 	if err != nil {
 		log.Error(err, "could not count workspace stop", "reason", "unknown", "type", tpe, "class", class)
 	}
@@ -210,6 +261,7 @@ type metricState struct {
 	recordedStartTime       bool
 	recordedInitFailure     bool
 	recordedStartFailure    bool
+	recordedFailure         bool
 	recordedContentReady    bool
 	recordedBackupFailed    bool
 	recordedBackupCompleted bool
@@ -223,7 +275,8 @@ func newMetricState(ws *workspacev1.Workspace) metricState {
 		// each workspace.
 		recordedStartTime:   ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
 		recordedInitFailure: wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure),
-		recordedStartFailure: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
+		recordedStartFailure: ws.Status.Phase == workspacev1.WorkspacePhaseStopped && !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)),
+		recordedFailure:      wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
 		recordedContentReady:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)),
 		recordedBackupFailed:    wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)),
 		recordedBackupCompleted: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)),
@@ -245,6 +298,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
 	m.startupTimeHistVec.Describe(ch)
 	m.totalStopsCounterVec.Describe(ch)
 	m.totalStartsFailureCounterVec.Describe(ch)
+	m.totalFailuresCounterVec.Describe(ch)
 
 	m.totalBackupCounterVec.Describe(ch)
 	m.totalBackupFailureCounterVec.Describe(ch)
@@ -260,6 +314,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
 	m.startupTimeHistVec.Collect(ch)
 	m.totalStopsCounterVec.Collect(ch)
 	m.totalStartsFailureCounterVec.Collect(ch)
+	m.totalFailuresCounterVec.Collect(ch)
 
 	m.totalBackupCounterVec.Collect(ch)
 	m.totalBackupFailureCounterVec.Collect(ch)
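
With the new reason label, countWorkspaceStop derives the stop reason from the workspace's conditions. A sketch of how a timed-out workspace would be counted; m (a controllerMetrics) and log (a logr.Logger) are assumed to exist and are hypothetical names here:

	ws := &workspacev1.Workspace{}
	ws.Spec.Class = "default"
	ws.Status.SetCondition(metav1.Condition{
		Type:               string(workspacev1.WorkspaceConditionTimeout),
		Status:             metav1.ConditionTrue,
		LastTransitionTime: metav1.Now(),
	})

	// With no Failed condition and Timeout present and true, the
	// classification above yields StopReasonTimeout, incrementing
	// workspace_stops_total with reason="timeout".
	m.countWorkspaceStop(&log, ws)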

components/ws-manager-mk2/controllers/status.go

Lines changed: 33 additions & 20 deletions
@@ -99,13 +99,7 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
 
 	if failure != "" && !wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
 		// workspaces can fail only once - once there is a failed condition set, stick with it
-		workspace.Status.Conditions = wsk8s.AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
-			Type:               string(workspacev1.WorkspaceConditionFailed),
-			Status:             metav1.ConditionTrue,
-			LastTransitionTime: metav1.Now(),
-			Message:            failure,
-		})
-
+		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionFailed(failure))
 		r.Recorder.Event(workspace, corev1.EventTypeWarning, "Failed", failure)
 	}
 
@@ -147,19 +141,30 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
 		}
 
 	case pod.Status.Phase == corev1.PodRunning:
-		var ready bool
-		for _, cs := range pod.Status.ContainerStatuses {
-			if cs.Ready {
-				ready = true
-				break
-			}
-		}
-		if ready {
-			// workspace is ready - hence content init is done
+		everReady := wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady))
+		if everReady {
+			// If the workspace has been ready before, stay in a Running state, even
+			// if the workspace container is not ready anymore. This is to avoid the workspace
+			// moving back to Initializing and becoming unusable.
 			workspace.Status.Phase = workspacev1.WorkspacePhaseRunning
 		} else {
-			// workspace has not become ready yet - it must be initializing then.
-			workspace.Status.Phase = workspacev1.WorkspacePhaseInitializing
+			var ready bool
+			for _, cs := range pod.Status.ContainerStatuses {
+				if cs.Ready {
+					ready = true
+					break
+				}
+			}
+			if ready {
+				// workspace is ready - hence content init is done
+				workspace.Status.Phase = workspacev1.WorkspacePhaseRunning
+				if !wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
+					workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionEverReady())
+				}
+			} else {
+				// workspace has not become ready yet - it must be initializing then.
+				workspace.Status.Phase = workspacev1.WorkspacePhaseInitializing
+			}
 		}
 
 	case workspace.IsHeadless() && (pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed):
@@ -198,13 +203,21 @@ func (r *WorkspaceReconciler) extractFailure(ctx context.Context, ws *workspacev
 	// Check for content init failure.
 	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c != nil {
 		if c.Status == metav1.ConditionFalse && c.Reason == workspacev1.ReasonInitializationFailure {
-			return c.Message, nil
+			msg := c.Message
+			if msg == "" {
+				msg = "Content initialization failed for an unknown reason"
+			}
+			return msg, nil
 		}
 	}
 
 	// Check for backup failure.
 	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)); c != nil {
-		return c.Message, nil
+		msg := c.Message
+		if msg == "" {
+			msg = "Backup failed for an unknown reason"
+		}
+		return msg, nil
 	}
 
 	status := pod.Status
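
Condensed, the phase decision for a running pod now reads roughly as follows. This is a sketch of the logic above, not code from the commit, and it omits setting the EverReady condition on the first ready transition:

	func phaseForRunningPod(ws *workspacev1.Workspace, pod *corev1.Pod) workspacev1.WorkspacePhase {
		// Once ready, always Running: a readiness flap of the container must
		// not demote the workspace back to Initializing.
		if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
			return workspacev1.WorkspacePhaseRunning
		}
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.Ready {
				return workspacev1.WorkspacePhaseRunning
			}
		}
		return workspacev1.WorkspacePhaseInitializing
	}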

components/ws-manager-mk2/controllers/workspace_controller.go

Lines changed: 10 additions & 10 deletions
@@ -310,18 +310,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
 	if !lastState.recordedInitFailure && wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure) {
 		r.metrics.countTotalRestoreFailures(&log, workspace)
 		lastState.recordedInitFailure = true
-
-		if !lastState.recordedStartFailure {
-			r.metrics.countWorkspaceStartFailures(&log, workspace)
-			lastState.recordedStartFailure = true
-		}
 	}
 
-	if !lastState.recordedStartFailure && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
-		// Only record if there was no other start failure recorded yet, to ensure max one
-		// start failure gets recorded per workspace.
-		r.metrics.countWorkspaceStartFailures(&log, workspace)
-		lastState.recordedStartFailure = true
+	if !lastState.recordedFailure && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
+		r.metrics.countWorkspaceFailure(&log, workspace)
+		lastState.recordedFailure = true
 	}
 
 	if !lastState.recordedContentReady && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
@@ -348,6 +341,13 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
 	if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
 		r.metrics.countWorkspaceStop(&log, workspace)
 
+		everReady := wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady))
+		if !lastState.recordedStartFailure && !everReady {
+			// Workspace never became ready, count as a startup failure.
+			r.metrics.countWorkspaceStartFailures(&log, workspace)
+			// No need to record in metricState, as we're forgetting the workspace state next anyway.
+		}
+
 		// Forget about this workspace, no more state updates will be recorded after this.
 		r.metrics.forgetWorkspace(workspace)
 		return
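
Net effect: workspace_failure_total now counts every Failed condition, while workspace_starts_failure_total is derived at stop time from the EverReady condition. A hypothetical condensation of the rule above:

	// A stopped workspace that never had the EverReady condition counts as a
	// start failure, at most once per workspace.
	stopped := workspace.Status.Phase == workspacev1.WorkspacePhaseStopped
	everReady := wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady))
	if stopped && !everReady && !lastState.recordedStartFailure {
		r.metrics.countWorkspaceStartFailures(&log, workspace)
	}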
