Skip to content

Commit dc68610

Browse files
authored
[ws-manager-mk2] Ignore container killed failures (#17118)
* [ws-manager-mk2] Ignore container killed failures
* ClusterRole + Binding
1 parent 24c401a commit dc68610

File tree

3 files changed

+56
-2
lines changed

3 files changed

+56
-2
lines changed

components/ws-manager-mk2/controllers/status.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"golang.org/x/xerrors"
1717
corev1 "k8s.io/api/core/v1"
1818
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19+
"k8s.io/apimachinery/pkg/types"
1920
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
2021
"sigs.k8s.io/controller-runtime/pkg/log"
2122
)
@@ -91,7 +92,7 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
9192
workspace.Status.OwnerToken = ownerToken
9293
}
9394

94-
failure, phase := extractFailure(workspace, pod)
95+
failure, phase := r.extractFailure(ctx, workspace, pod)
9596
if phase != nil {
9697
workspace.Status.Phase = *phase
9798
}
@@ -193,7 +194,7 @@ func isDisposalFinished(ws *workspacev1.Workspace) bool {
193194
// extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
194195
// the caller should determine the phase itself. If the pod has not failed, this function returns "", nil.
195196
// This failure is then stored in the Failed condition on the workspace.
196-
func extractFailure(ws *workspacev1.Workspace, pod *corev1.Pod) (string, *workspacev1.WorkspacePhase) {
197+
func (r *WorkspaceReconciler) extractFailure(ctx context.Context, ws *workspacev1.Workspace, pod *corev1.Pod) (string, *workspacev1.WorkspacePhase) {
197198
// Check for content init failure.
198199
if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c != nil {
199200
if c.Status == metav1.ConditionFalse && c.Reason == workspacev1.ReasonInitializationFailure {
@@ -250,6 +251,18 @@ func extractFailure(ws *workspacev1.Workspace, pod *corev1.Pod) (string, *worksp
250251
phase = workspacev1.WorkspacePhaseRunning
251252
}
252253

254+
if terminationState.ExitCode == containerKilledExitCode && terminationState.Reason == "ContainerStatusUnknown" {
255+
// For some reason, the pod is killed with unknown container status and no taints on the underlying node.
256+
// Therefore, we skip extracting the failure from the termination message.
257+
// ref: https://github.com/gitpod-io/gitpod/issues/12021
258+
var node corev1.Node
259+
if ws.Status.Runtime != nil && ws.Status.Runtime.NodeName != "" {
260+
if err := r.Get(ctx, types.NamespacedName{Namespace: "", Name: ws.Status.Runtime.NodeName}, &node); err == nil && len(node.Spec.Taints) == 0 {
261+
return "", nil
262+
}
263+
}
264+
}
265+
253266
// the container itself told us why it was terminated - use that as failure reason
254267
return extractFailureFromLogs([]byte(terminationState.Message)), &phase
255268
} else if terminationState.Reason == "Error" {

install/installer/pkg/components/ws-manager-mk2/role.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,18 @@ var controllerRules = []rbacv1.PolicyRule{
102102
},
103103
}
104104

105+
var controllerClusterRules = []rbacv1.PolicyRule{
106+
{
107+
APIGroups: []string{""},
108+
Resources: []string{"nodes"},
109+
Verbs: []string{
110+
"get",
111+
"list",
112+
"watch",
113+
},
114+
},
115+
}
116+
105117
// ConfigMap, Leases, and Events access is required for leader-election.
106118
var leaderElectionRules = []rbacv1.PolicyRule{
107119
{
@@ -150,5 +162,14 @@ func role(ctx *common.RenderContext) ([]runtime.Object, error) {
150162
},
151163
Rules: controllerRules,
152164
},
165+
166+
&rbacv1.ClusterRole{
167+
TypeMeta: common.TypeMetaClusterRole,
168+
ObjectMeta: metav1.ObjectMeta{
169+
Name: Component,
170+
Labels: labels,
171+
},
172+
Rules: controllerClusterRules,
173+
},
153174
}, nil
154175
}

install/installer/pkg/components/ws-manager-mk2/rolebinding.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,5 +78,25 @@ func rolebinding(ctx *common.RenderContext) ([]runtime.Object, error) {
7878
},
7979
},
8080
},
81+
82+
&rbacv1.ClusterRoleBinding{
83+
TypeMeta: common.TypeMetaClusterRoleBinding,
84+
ObjectMeta: metav1.ObjectMeta{
85+
Name: Component,
86+
Labels: labels,
87+
},
88+
RoleRef: rbacv1.RoleRef{
89+
Kind: "ClusterRole",
90+
Name: Component,
91+
APIGroup: "rbac.authorization.k8s.io",
92+
},
93+
Subjects: []rbacv1.Subject{
94+
{
95+
Kind: "ServiceAccount",
96+
Name: Component,
97+
Namespace: ctx.Namespace,
98+
},
99+
},
100+
},
81101
}, nil
82102
}

0 commit comments

Comments
 (0)