Skip to content

Commit 8835e0e

Browse files
committed
[node-labeler] Refactor node labeling to use taints instead of labels
1 parent 61ecd26 commit 8835e0e

File tree

1 file changed

+139
-29
lines changed
  • components/node-labeler/cmd

1 file changed

+139
-29
lines changed

components/node-labeler/cmd/run.go

Lines changed: 139 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ const (
5050

5151
registryFacade = "registry-facade"
5252
wsDaemon = "ws-daemon"
53+
54+
// Taint keys for different components
55+
registryFacadeTaintKey = "gitpod.io/registry-facade-not-ready"
56+
wsDaemonTaintKey = "gitpod.io/ws-daemon-not-ready"
57+
58+
workspacesRegularLabel = "gitpod.io/workload_workspace_regular"
59+
workspacesHeadlessLabel = "gitpod.io/workload_workspace_headless"
5360
)
5461

5562
var defaultRequeueTime = time.Second * 10
@@ -61,6 +68,15 @@ var runCmd = &cobra.Command{
6168
Run: func(cmd *cobra.Command, args []string) {
6269
ctrl.SetLogger(logrusr.New(log.Log))
6370

71+
kClient, err := client.New(ctrl.GetConfigOrDie(), client.Options{})
72+
if err != nil {
73+
log.WithError(err).Fatal("unable to create client")
74+
}
75+
76+
if err := initializeLabels(context.Background(), kClient); err != nil {
77+
log.WithError(err).Fatal("failed to initialize labels")
78+
}
79+
6480
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
6581
Scheme: scheme,
6682
HealthProbeBindAddress: ":8086",
@@ -84,11 +100,6 @@ var runCmd = &cobra.Command{
84100
log.WithError(err).Fatal("unable to start node-labeler")
85101
}
86102

87-
kClient, err := client.New(ctrl.GetConfigOrDie(), client.Options{})
88-
if err != nil {
89-
log.WithError(err).Fatal("unable to create client")
90-
}
91-
92103
r := &PodReconciler{
93104
kClient,
94105
}
@@ -198,21 +209,18 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
198209
}
199210

200211
var (
201-
ipAddress string
202-
port string
203-
component string
204-
labelToUpdate string
212+
ipAddress string
213+
port string
214+
taintKey string
205215
)
206216

207217
switch {
208218
case strings.HasPrefix(pod.Name, registryFacade):
209-
component = registryFacade
210-
labelToUpdate = fmt.Sprintf(registryFacadeLabel, namespace)
219+
taintKey = registryFacadeTaintKey
211220
ipAddress = pod.Status.HostIP
212221
port = strconv.Itoa(registryFacadePort)
213222
case strings.HasPrefix(pod.Name, wsDaemon):
214-
component = wsDaemon
215-
labelToUpdate = fmt.Sprintf(wsdaemonLabel, namespace)
223+
taintKey = wsDaemonTaintKey
216224
ipAddress = pod.Status.PodIP
217225
port = strconv.Itoa(wsdaemonPort)
218226
default:
@@ -222,17 +230,17 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
222230

223231
if !pod.ObjectMeta.DeletionTimestamp.IsZero() {
224232
// the pod is being removed.
225-
// remove the component label from the node
233+
// add the taint to the node
226234
time.Sleep(1 * time.Second)
227-
err := updateLabel(labelToUpdate, false, nodeName, r)
235+
err := updateNodeTaint(taintKey, true, nodeName, r)
228236
if err != nil {
229237
// this is an edge case when cluster-autoscaler removes a node
230238
// (all the running pods will be removed after that)
231239
if errors.IsNotFound(err) {
232240
return reconcile.Result{}, nil
233241
}
234242

235-
log.WithError(err).Error("removing node label")
243+
log.WithError(err).Error("adding node taint")
236244
return reconcile.Result{RequeueAfter: defaultRequeueTime}, err
237245
}
238246

@@ -250,8 +258,17 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
250258
return reconcile.Result{}, fmt.Errorf("obtaining node %s: %w", nodeName, err)
251259
}
252260

253-
if labelValue, exists := node.Labels[labelToUpdate]; exists && labelValue == "true" {
254-
// nothing to do, the label already exists.
261+
// Check if taint exists
262+
taintExists := false
263+
for _, taint := range node.Spec.Taints {
264+
if taint.Key == taintKey {
265+
taintExists = true
266+
break
267+
}
268+
}
269+
270+
if !taintExists {
271+
// nothing to do, the taint doesn't exist.
255272
return reconcile.Result{}, nil
256273
}
257274

@@ -261,7 +278,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
261278
return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
262279
}
263280

264-
if component == registryFacade {
281+
if strings.HasPrefix(pod.Name, registryFacade) {
265282
err = checkRegistryFacade(ipAddress, port)
266283
if err != nil {
267284
log.WithError(err).Error("checking registry-facade")
@@ -271,15 +288,15 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
271288
time.Sleep(1 * time.Second)
272289
}
273290

274-
err = updateLabel(labelToUpdate, true, nodeName, r)
291+
err = updateNodeTaint(taintKey, false, nodeName, r)
275292
if err != nil {
276-
log.WithError(err).Error("updating node label")
277-
return reconcile.Result{}, fmt.Errorf("trying to add the label: %v", err)
293+
log.WithError(err).Error("removing node taint")
294+
return reconcile.Result{}, fmt.Errorf("trying to remove the taint: %v", err)
278295
}
279296

280297
readyIn := time.Since(pod.Status.StartTime.Time)
281-
NodeLabelerTimeHistVec.WithLabelValues(component).Observe(readyIn.Seconds())
282-
NodeLabelerCounterVec.WithLabelValues(component).Inc()
298+
NodeLabelerTimeHistVec.WithLabelValues(strings.Split(pod.Name, "-")[0]).Observe(readyIn.Seconds())
299+
NodeLabelerCounterVec.WithLabelValues(strings.Split(pod.Name, "-")[0]).Inc()
283300

284301
return reconcile.Result{}, nil
285302
}
@@ -485,7 +502,7 @@ func (c *NodeScaledownAnnotationController) updateNodeAnnotation(ctx context.Con
485502
})
486503
}
487504

488-
func updateLabel(label string, add bool, nodeName string, client client.Client) error {
505+
func updateNodeTaint(taintKey string, add bool, nodeName string, client client.Client) error {
489506
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
490507
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
491508
defer cancel()
@@ -496,12 +513,36 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
496513
return err
497514
}
498515

516+
// Create or remove taint
499517
if add {
500-
node.Labels[label] = "true"
501-
log.WithField("label", label).WithField("node", nodeName).Info("adding label to node")
518+
// Add taint if it doesn't exist
519+
taintExists := false
520+
for _, taint := range node.Spec.Taints {
521+
if taint.Key == taintKey {
522+
taintExists = true
523+
break
524+
}
525+
}
526+
if !taintExists {
527+
node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{
528+
Key: taintKey,
529+
Value: "true",
530+
Effect: corev1.TaintEffectNoSchedule,
531+
})
532+
log.WithField("taint", taintKey).WithField("node", nodeName).Info("adding taint to node")
533+
}
502534
} else {
503-
delete(node.Labels, label)
504-
log.WithField("label", label).WithField("node", nodeName).Info("removing label from node")
535+
// Remove taint if it exists
536+
newTaints := make([]corev1.Taint, 0)
537+
for _, taint := range node.Spec.Taints {
538+
if taint.Key != taintKey {
539+
newTaints = append(newTaints, taint)
540+
}
541+
}
542+
if len(newTaints) != len(node.Spec.Taints) {
543+
node.Spec.Taints = newTaints
544+
log.WithField("taint", taintKey).WithField("node", nodeName).Info("removing taint from node")
545+
}
505546
}
506547

507548
err = client.Update(ctx, &node)
@@ -569,3 +610,72 @@ func newDefaultTransport() *http.Transport {
569610
DisableKeepAlives: true,
570611
}
571612
}
613+
614+
func initializeLabels(ctx context.Context, kClient client.Client) error {
615+
log.Info("initializing labels on nodes")
616+
617+
var nodes corev1.NodeList
618+
if err := kClient.List(ctx, &nodes); err != nil {
619+
return fmt.Errorf("failed to list nodes: %w", err)
620+
}
621+
622+
for _, node := range nodes.Items {
623+
if node.Labels == nil {
624+
continue
625+
}
626+
_, isRegularWorkspaceNode := node.Labels[workspacesRegularLabel]
627+
_, isHeadlessWorkspaceNode := node.Labels[workspacesHeadlessLabel]
628+
629+
if isRegularWorkspaceNode || isHeadlessWorkspaceNode {
630+
err := updateNodeLabel(node.Name, kClient)
631+
if err != nil {
632+
log.WithError(err).WithField("node", node.Name).Error("failed to initialize labels on node")
633+
}
634+
}
635+
}
636+
637+
log.Info("finished initializing labels on nodes")
638+
return nil
639+
}
640+
641+
func updateNodeLabel(nodeName string, client client.Client) error {
642+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
643+
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
644+
defer cancel()
645+
646+
var node corev1.Node
647+
err := client.Get(ctx, types.NamespacedName{Name: nodeName}, &node)
648+
if err != nil {
649+
return err
650+
}
651+
652+
registryFacadeLabelForNamespace := fmt.Sprintf(registryFacadeLabel, namespace)
653+
wsDaemonLabelForNamespace := fmt.Sprintf(wsdaemonLabel, namespace)
654+
655+
needUpdate := false
656+
657+
if node.Labels == nil {
658+
node.Labels = make(map[string]string)
659+
}
660+
661+
if v := node.Labels[registryFacadeLabelForNamespace]; v != "true" {
662+
needUpdate = true
663+
}
664+
if v := node.Labels[wsDaemonLabelForNamespace]; v != "true" {
665+
needUpdate = true
666+
}
667+
668+
if !needUpdate {
669+
return nil
670+
}
671+
node.Labels[registryFacadeLabelForNamespace] = "true"
672+
node.Labels[wsDaemonLabelForNamespace] = "true"
673+
674+
err = client.Update(ctx, &node)
675+
if err != nil {
676+
return err
677+
}
678+
679+
return nil
680+
})
681+
}

0 commit comments

Comments
 (0)