
[ws-manager-mk2] Refactor metrics with EverReady condition #17114


Merged · 3 commits · Apr 8, 2023
14 changes: 13 additions & 1 deletion components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -152,7 +152,7 @@ func (s *WorkspaceStatus) SetCondition(cond metav1.Condition) {
s.Conditions = wsk8s.AddUniqueCondition(s.Conditions, cond)
}

// +kubebuilder:validation:Enum=Deployed;Failed;Timeout;FirstUserActivity;Closed;HeadlessTaskFailed;StoppedByRequest;Aborted;ContentReady;BackupComplete;BackupFailure
// +kubebuilder:validation:Enum=Deployed;Failed;Timeout;FirstUserActivity;Closed;HeadlessTaskFailed;StoppedByRequest;Aborted;ContentReady;EverReady;BackupComplete;BackupFailure
type WorkspaceCondition string

const (
@@ -185,6 +185,10 @@ const (
// ContentReady is true once the content initialisation is complete
WorkspaceConditionContentReady WorkspaceCondition = "ContentReady"

// EverReady is true if the workspace has ever been ready (content init
// succeeded and container is ready)
WorkspaceConditionEverReady WorkspaceCondition = "EverReady"

// BackupComplete is true once the backup has happened
WorkspaceConditionBackupComplete WorkspaceCondition = "BackupComplete"

@@ -269,6 +273,14 @@ func NewWorkspaceConditionContentReady(status metav1.ConditionStatus, reason, me
}
}

func NewWorkspaceConditionEverReady() metav1.Condition {
return metav1.Condition{
Type: string(WorkspaceConditionEverReady),
LastTransitionTime: metav1.Now(),
Status: metav1.ConditionTrue,
}
}

func NewWorkspaceConditionBackupComplete() metav1.Condition {
return metav1.Condition{
Type: string(WorkspaceConditionBackupComplete),
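
To make the new condition concrete: a minimal standalone sketch of how EverReady behaves once set. The helpers below are simplified stand-ins for NewWorkspaceConditionEverReady and wsk8s.ConditionPresentAndTrue, not the real gitpod packages; only the k8s.io/apimachinery types are assumed.

package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newEverReadyCondition mirrors NewWorkspaceConditionEverReady from the diff,
// using a plain string type so the sketch compiles on its own.
func newEverReadyCondition() metav1.Condition {
    return metav1.Condition{
        Type:               "EverReady",
        Status:             metav1.ConditionTrue,
        LastTransitionTime: metav1.Now(),
    }
}

// conditionPresentAndTrue is a stand-in for wsk8s.ConditionPresentAndTrue.
func conditionPresentAndTrue(conds []metav1.Condition, typ string) bool {
    for _, c := range conds {
        if c.Type == typ {
            return c.Status == metav1.ConditionTrue
        }
    }
    return false
}

func main() {
    var conds []metav1.Condition
    fmt.Println(conditionPresentAndTrue(conds, "EverReady")) // false: workspace was never ready

    // Set once, when content init succeeded and the container first reported ready.
    conds = append(conds, newEverReadyCondition())
    fmt.Println(conditionPresentAndTrue(conds, "EverReady")) // true, and it stays true
}

Unlike Failed or Timeout, EverReady is never set back to false, which is what lets the stop metrics below distinguish start failures from failures that happened after the workspace was usable.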
59 changes: 57 additions & 2 deletions components/ws-manager-mk2/controllers/metrics.go
@@ -20,16 +20,30 @@ import (
const (
workspaceStartupSeconds string = "workspace_startup_seconds"
workspaceStartFailuresTotal string = "workspace_starts_failure_total"
workspaceFailuresTotal string = "workspace_failure_total"
workspaceStopsTotal string = "workspace_stops_total"
workspaceBackupsTotal string = "workspace_backups_total"
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
workspaceRestoresTotal string = "workspace_restores_total"
workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
)

type StopReason string

const (
StopReasonFailed = "failed"
StopReasonStartFailure = "start-failure"
StopReasonAborted = "aborted"
StopReasonOutOfSpace = "out-of-space"
StopReasonTimeout = "timeout"
StopReasonTabClosed = "tab-closed"
StopReasonRegular = "regular-stop"
)

type controllerMetrics struct {
startupTimeHistVec *prometheus.HistogramVec
totalStartsFailureCounterVec *prometheus.CounterVec
totalFailuresCounterVec *prometheus.CounterVec
totalStopsCounterVec *prometheus.CounterVec

totalBackupCounterVec *prometheus.CounterVec
@@ -64,6 +78,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
Name: workspaceStartFailuresTotal,
Help: "total number of workspaces that failed to start",
}, []string{"type", "class"}),
totalFailuresCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Name: workspaceFailuresTotal,
Help: "total number of workspaces that had a failed condition",
}, []string{"type", "class"}),
totalStopsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
@@ -126,11 +146,42 @@ func (m *controllerMetrics) countWorkspaceStartFailures(log *logr.Logger, ws *wo
counter.Inc()
}

func (m *controllerMetrics) countWorkspaceFailure(log *logr.Logger, ws *workspacev1.Workspace) {
class := ws.Spec.Class
tpe := string(ws.Spec.Type)

counter, err := m.totalFailuresCounterVec.GetMetricWithLabelValues(tpe, class)
if err != nil {
log.Error(err, "could not count workspace failure", "type", tpe, "class", class)
}

counter.Inc()
}

func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1.Workspace) {
var reason string
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)); c != nil {
reason = StopReasonFailed
if !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
// Don't record 'failed' if there was a start failure.
reason = StopReasonStartFailure
Member Author: different from mk1: instead of not incrementing the stop metric when there was a start failure, we now increment it with its own reason, "start-failure". (A sketch of the full reason selection follows this file's diff.)

} else if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {
reason = StopReasonOutOfSpace
}
} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionAborted)) {
reason = StopReasonAborted
} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionTimeout)) {
reason = StopReasonTimeout
} else if wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionClosed)) {
reason = StopReasonTabClosed
} else {
reason = StopReasonRegular
}
Member Author (on lines +163 to +179): re-ordered these checks compared to mk1, e.g. to check for the Failed condition first. If a workspace has both a Failed and a Closed condition, we want the failure reason to be reported rather than "tab-closed". (See the sketch after this file's diff.)


class := ws.Spec.Class
tpe := string(ws.Spec.Type)

counter, err := m.totalStopsCounterVec.GetMetricWithLabelValues("unknown", tpe, class)
counter, err := m.totalStopsCounterVec.GetMetricWithLabelValues(reason, tpe, class)
if err != nil {
log.Error(err, "could not count workspace stop", "reason", "unknown", "type", tpe, "class", class)
}
@@ -210,6 +261,7 @@ type metricState struct {
recordedStartTime bool
recordedInitFailure bool
recordedStartFailure bool
recordedFailure bool
recordedContentReady bool
recordedBackupFailed bool
recordedBackupCompleted bool
@@ -223,7 +275,8 @@ func newMetricState(ws *workspacev1.Workspace) metricState {
// each workspace.
recordedStartTime: ws.Status.Phase == workspacev1.WorkspacePhaseRunning,
recordedInitFailure: wsk8s.ConditionWithStatusAndReason(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure),
recordedStartFailure: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
recordedStartFailure: ws.Status.Phase == workspacev1.WorkspacePhaseStopped && !wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)),
recordedFailure: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)),
recordedContentReady: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)),
recordedBackupFailed: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)),
recordedBackupCompleted: wsk8s.ConditionPresentAndTrue(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupComplete)),
@@ -245,6 +298,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
m.startupTimeHistVec.Describe(ch)
m.totalStopsCounterVec.Describe(ch)
m.totalStartsFailureCounterVec.Describe(ch)
m.totalFailuresCounterVec.Describe(ch)

m.totalBackupCounterVec.Describe(ch)
m.totalBackupFailureCounterVec.Describe(ch)
@@ -260,6 +314,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
m.startupTimeHistVec.Collect(ch)
m.totalStopsCounterVec.Collect(ch)
m.totalStartsFailureCounterVec.Collect(ch)
m.totalFailuresCounterVec.Collect(ch)

m.totalBackupCounterVec.Collect(ch)
m.totalBackupFailureCounterVec.Collect(ch)
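
To make the two inline comments above concrete, here is a hedged, self-contained sketch of the reason selection in countWorkspaceStop, using plain structs instead of the real workspacev1/wsk8s types. The point is the precedence: a Failed condition (split into start-failure, out-of-space, or failed) wins over Aborted, Timeout, and Closed, and everything else counts as a regular stop.

package main

import (
    "fmt"
    "strings"
)

// cond is a simplified stand-in for metav1.Condition.
type cond struct {
    Type    string
    True    bool
    Message string
}

func present(conds []cond, typ string) *cond {
    for i := range conds {
        if conds[i].Type == typ {
            return &conds[i]
        }
    }
    return nil
}

func presentAndTrue(conds []cond, typ string) bool {
    c := present(conds, typ)
    return c != nil && c.True
}

// stopReason mirrors the ordering in countWorkspaceStop: Failed is checked
// first, then Aborted, Timeout, Closed, and finally the regular stop.
func stopReason(conds []cond) string {
    if c := present(conds, "Failed"); c != nil {
        if !presentAndTrue(conds, "EverReady") {
            return "start-failure" // failed before ever becoming ready
        }
        if strings.Contains(c.Message, "Pod ephemeral local storage usage exceeds the total limit of containers") {
            return "out-of-space"
        }
        return "failed"
    }
    if presentAndTrue(conds, "Aborted") {
        return "aborted"
    }
    if presentAndTrue(conds, "Timeout") {
        return "timeout"
    }
    if presentAndTrue(conds, "Closed") {
        return "tab-closed"
    }
    return "regular-stop"
}

func main() {
    // Failed + Closed: the failure wins, as described in the review comment.
    fmt.Println(stopReason([]cond{
        {Type: "EverReady", True: true},
        {Type: "Failed", True: true, Message: "container crashed"},
        {Type: "Closed", True: true},
    })) // failed

    // Failed before the workspace was ever ready: counted as a start failure.
    fmt.Println(stopReason([]cond{
        {Type: "Failed", True: true, Message: "content init failed"},
    })) // start-failure
}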
53 changes: 33 additions & 20 deletions components/ws-manager-mk2/controllers/status.go
@@ -98,13 +98,7 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa

if failure != "" && !wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
// workspaces can fail only once - once there is a failed condition set, stick with it
workspace.Status.Conditions = wsk8s.AddUniqueCondition(workspace.Status.Conditions, metav1.Condition{
Type: string(workspacev1.WorkspaceConditionFailed),
Status: metav1.ConditionTrue,
LastTransitionTime: metav1.Now(),
Message: failure,
})

workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionFailed(failure))
r.Recorder.Event(workspace, corev1.EventTypeWarning, "Failed", failure)
}

@@ -146,19 +140,30 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
}

case pod.Status.Phase == corev1.PodRunning:
var ready bool
for _, cs := range pod.Status.ContainerStatuses {
if cs.Ready {
ready = true
break
}
}
if ready {
// workspace is ready - hence content init is done
everReady := wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady))
if everReady {
// If the workspace has been ready before, stay in a Running state, even
// if the workspace container is not ready anymore. This is to avoid the workspace
// moving back to Initializing and becoming unusable.
workspace.Status.Phase = workspacev1.WorkspacePhaseRunning
} else {
// workspace has not become ready yet - it must be initializing then.
workspace.Status.Phase = workspacev1.WorkspacePhaseInitializing
var ready bool
for _, cs := range pod.Status.ContainerStatuses {
if cs.Ready {
ready = true
break
}
}
if ready {
// workspace is ready - hence content init is done
workspace.Status.Phase = workspacev1.WorkspacePhaseRunning
if !wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady)) {
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionEverReady())
}
} else {
// workspace has not become ready yet - it must be initializing then.
workspace.Status.Phase = workspacev1.WorkspacePhaseInitializing
}
}

case workspace.IsHeadless() && (pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed):
@@ -197,13 +202,21 @@ func extractFailure(ws *workspacev1.Workspace, pod *corev1.Pod) (string, *worksp
// Check for content init failure.
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c != nil {
if c.Status == metav1.ConditionFalse && c.Reason == workspacev1.ReasonInitializationFailure {
return c.Message, nil
msg := c.Message
if msg == "" {
msg = "Content initialization failed for an unknown reason"
}
return msg, nil
}
}

// Check for backup failure.
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionBackupFailure)); c != nil {
return c.Message, nil
msg := c.Message
if msg == "" {
msg = "Backup failed for an unknown reason"
}
return msg, nil
}

status := pod.Status
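
The PodRunning branch above boils down to a small decision: once EverReady is set, the workspace stays in Running even while its container reports not-ready; before that, container readiness gates the move from Initializing to Running. A hedged sketch with simplified types:

package main

import "fmt"

type phase string

const (
    phaseInitializing phase = "Initializing"
    phaseRunning      phase = "Running"
)

// runningPodPhase is a simplified version of the PodRunning branch in
// updateWorkspaceStatus: everReady is whether the EverReady condition is set,
// containerReady is whether any container currently reports Ready.
func runningPodPhase(everReady, containerReady bool) phase {
    if everReady {
        // Stay Running even if the container is momentarily not ready,
        // so the workspace never falls back to Initializing.
        return phaseRunning
    }
    if containerReady {
        // First time ready: this is also where the controller sets EverReady.
        return phaseRunning
    }
    return phaseInitializing
}

func main() {
    fmt.Println(runningPodPhase(false, false)) // Initializing
    fmt.Println(runningPodPhase(false, true))  // Running (EverReady gets set here)
    fmt.Println(runningPodPhase(true, false))  // Running (sticky, no fallback)
}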
20 changes: 10 additions & 10 deletions components/ws-manager-mk2/controllers/workspace_controller.go
@@ -306,18 +306,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
if !lastState.recordedInitFailure && wsk8s.ConditionWithStatusAndReason(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady), false, workspacev1.ReasonInitializationFailure) {
r.metrics.countTotalRestoreFailures(&log, workspace)
lastState.recordedInitFailure = true

if !lastState.recordedStartFailure {
r.metrics.countWorkspaceStartFailures(&log, workspace)
lastState.recordedStartFailure = true
}
}

if !lastState.recordedStartFailure && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
// Only record if there was no other start failure recorded yet, to ensure max one
// start failure gets recorded per workspace.
r.metrics.countWorkspaceStartFailures(&log, workspace)
lastState.recordedStartFailure = true
if !lastState.recordedFailure && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionFailed)) {
r.metrics.countWorkspaceFailure(&log, workspace)
lastState.recordedFailure = true
}

if !lastState.recordedContentReady && wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)) {
@@ -344,6 +337,13 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
r.metrics.countWorkspaceStop(&log, workspace)

everReady := wsk8s.ConditionPresentAndTrue(workspace.Status.Conditions, string(workspacev1.WorkspaceConditionEverReady))
if !lastState.recordedStartFailure && !everReady {
// Workspace never became ready, count as a startup failure.
r.metrics.countWorkspaceStartFailures(&log, workspace)
// No need to record in metricState, as we're forgetting the workspace state next anyway.
}

// Forget about this workspace, no more state updates will be recorded after this.
r.metrics.forgetWorkspace(workspace)
return
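
Putting the Stopped branch together: each stopped workspace contributes exactly one workspace_stops_total sample with its reason, plus one workspace_starts_failure_total sample if it never became ready. A hedged sketch of that accounting with plain counters in place of the Prometheus vectors:

package main

import "fmt"

// counters is a stand-in for the counter vectors in controllerMetrics.
type counters struct {
    stops         map[string]int // keyed by stop reason
    startFailures int
}

// onStopped mirrors the Stopped branch of updateMetrics: always count the stop
// with its reason, and count a start failure once if the workspace was never ready.
func (c *counters) onStopped(reason string, everReady, startFailureRecorded bool) {
    c.stops[reason]++
    if !everReady && !startFailureRecorded {
        c.startFailures++
    }
}

func main() {
    c := &counters{stops: map[string]int{}}

    // Workspace that was ready and stopped normally.
    c.onStopped("regular-stop", true, false)

    // Workspace that failed before ever becoming ready.
    c.onStopped("start-failure", false, false)

    fmt.Println(c.stops, c.startFailures) // map[regular-stop:1 start-failure:1] 1
}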