Skip to content

Commit 13de063

Browse files
committed
Replace watch with builder.ControllerManagedBy
Signed-off-by: Manuel de Brito Fontes <[email protected]>
1 parent 72a58dd commit 13de063

File tree

1 file changed

+49
-72
lines changed
  • components/node-labeler/cmd

1 file changed

+49
-72
lines changed

components/node-labeler/cmd/run.go

Lines changed: 49 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,22 @@ import (
1818
"github.com/spf13/cobra"
1919
corev1 "k8s.io/api/core/v1"
2020
"k8s.io/apimachinery/pkg/api/errors"
21+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2122
"k8s.io/apimachinery/pkg/runtime"
2223
"k8s.io/apimachinery/pkg/types"
2324
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2425
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
2526
_ "k8s.io/client-go/plugin/pkg/client/auth"
2627
"k8s.io/client-go/util/retry"
28+
"k8s.io/utils/pointer"
2729
ctrl "sigs.k8s.io/controller-runtime"
30+
"sigs.k8s.io/controller-runtime/pkg/builder"
2831
"sigs.k8s.io/controller-runtime/pkg/client"
2932
"sigs.k8s.io/controller-runtime/pkg/controller"
30-
"sigs.k8s.io/controller-runtime/pkg/event"
31-
"sigs.k8s.io/controller-runtime/pkg/handler"
3233
"sigs.k8s.io/controller-runtime/pkg/healthz"
3334
"sigs.k8s.io/controller-runtime/pkg/metrics"
3435
"sigs.k8s.io/controller-runtime/pkg/predicate"
3536
"sigs.k8s.io/controller-runtime/pkg/reconcile"
36-
"sigs.k8s.io/controller-runtime/pkg/source"
3737

3838
"github.com/gitpod-io/gitpod/common-go/log"
3939
)
@@ -46,6 +46,8 @@ const (
4646
wsDaemon = "ws-daemon"
4747
)
4848

49+
var defaultRequeueTime = time.Second * 10
50+
4951
// runCmd represents the run command
5052
var runCmd = &cobra.Command{
5153
Use: "run",
@@ -60,6 +62,10 @@ var runCmd = &cobra.Command{
6062
LeaderElection: true,
6163
LeaderElectionID: "node-labeler.gitpod.io",
6264
Namespace: namespace,
65+
// default sync period is 10h.
66+
// in case node-labeler is restarted and no change happens, we could waste (at least) 20m on a node
67+
// that never will run workspaces and the additional nodes cluster-autoscaler adds to compensate
68+
SyncPeriod: pointer.Duration(2 * time.Minute),
6369
})
6470
if err != nil {
6571
log.WithError(err).Fatal("unable to start node-labeber")
@@ -74,35 +80,38 @@ var runCmd = &cobra.Command{
7480
client,
7581
}
7682

77-
c, err := controller.New("pod-watcher", mgr, controller.Options{
78-
Reconciler: r,
79-
MaxConcurrentReconciles: 20,
83+
rfPredicate, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{
84+
MatchLabels: map[string]string{
85+
"app": "gitpod",
86+
"component": "registry-facade",
87+
},
8088
})
8189
if err != nil {
82-
log.WithError(err).Fatal("unable to bind controller watch event handler")
90+
log.WithError(err).Fatal("unable to create predicate")
8391
}
8492

85-
metrics.Registry.MustRegister(NodeLabelerCounterVec)
86-
metrics.Registry.MustRegister(NodeLabelerTimeHistVec)
87-
88-
err = c.Watch(&source.Kind{Type: &corev1.Pod{}}, &handler.EnqueueRequestForObject{}, predicate.Funcs{
89-
CreateFunc: func(ce event.CreateEvent) bool {
90-
return processPodEvent(ce.Object)
91-
},
92-
UpdateFunc: func(ue event.UpdateEvent) bool {
93-
return processPodEvent(ue.ObjectNew)
94-
},
95-
DeleteFunc: func(deleteEvent event.DeleteEvent) bool {
96-
return processPodEvent(deleteEvent.Object)
97-
},
98-
GenericFunc: func(genericEvent event.GenericEvent) bool {
99-
return false
93+
wsPredicate, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{
94+
MatchLabels: map[string]string{
95+
"app": "gitpod",
96+
"component": "ws-daemon",
10097
},
10198
})
10299
if err != nil {
103-
log.WithError(err).Fatal("unable to create controller")
100+
log.WithError(err).Fatal("unable to create predicate")
101+
}
102+
103+
err = ctrl.NewControllerManagedBy(mgr).
104+
Named("pod-watcher").
105+
For(&corev1.Pod{}, builder.WithPredicates(predicate.Or(rfPredicate, wsPredicate))).
106+
WithOptions(controller.Options{MaxConcurrentReconciles: 1}).
107+
Complete(r)
108+
if err != nil {
109+
log.WithError(err).Fatal("unable to bind controller watch event handler")
104110
}
105111

112+
metrics.Registry.MustRegister(NodeLabelerCounterVec)
113+
metrics.Registry.MustRegister(NodeLabelerTimeHistVec)
114+
106115
err = mgr.AddHealthzCheck("healthz", healthz.Ping)
107116
if err != nil {
108117
log.WithError(err).Fatal("unable to set up health check")
@@ -132,14 +141,6 @@ var (
132141
scheme = runtime.NewScheme()
133142
)
134143

135-
func processPodEvent(pod client.Object) bool {
136-
if strings.HasPrefix(pod.GetName(), registryFacade) || strings.HasPrefix(pod.GetName(), wsDaemon) {
137-
return true
138-
}
139-
140-
return false
141-
}
142-
143144
type PodReconciler struct {
144145
client.Client
145146
}
@@ -157,16 +158,14 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
157158

158159
nodeName := pod.Spec.NodeName
159160
if nodeName == "" {
160-
return reconcile.Result{RequeueAfter: time.Second * 10}, err
161+
return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
161162
}
162163

163164
var (
164165
ipAddress string
165166
port string
166167
component string
167168
labelToUpdate string
168-
169-
waitTimeout time.Duration = 5 * time.Second
170169
)
171170

172171
switch {
@@ -181,7 +180,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
181180
ipAddress = pod.Status.PodIP
182181
port = strconv.Itoa(wsdaemonPort)
183182
default:
184-
log.WithField("pod", pod.Name).Info("Invalid pod. Skipping...")
183+
// nothing to do
185184
return reconcile.Result{}, nil
186185
}
187186

@@ -198,7 +197,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
198197
}
199198

200199
log.WithError(err).Error("removing node label")
201-
return reconcile.Result{RequeueAfter: time.Second * 10}, err
200+
return reconcile.Result{RequeueAfter: defaultRequeueTime}, err
202201
}
203202

204203
return reconcile.Result{}, err
@@ -215,28 +214,28 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
215214
return reconcile.Result{}, fmt.Errorf("obtaining node %s: %w", nodeName, err)
216215
}
217216

218-
if node.Labels[labelToUpdate] == "true" {
219-
// Label already exists.
217+
if labelValue, exists := node.Labels[labelToUpdate]; exists && labelValue == "true" {
218+
// nothing to do, the label already exists.
220219
return reconcile.Result{}, nil
221220
}
222221

223-
err = waitForTCPPortToBeReachable(ipAddress, port, 30*time.Second)
222+
err = checkTCPPortIsReachable(ipAddress, port)
224223
if err != nil {
225-
return reconcile.Result{}, fmt.Errorf("waiting for TCP port: %v", err)
224+
log.WithField("host", ipAddress).WithField("port", port).WithField("pod", pod.Name).WithError(err).Error("checking if TCP port is open")
225+
return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
226226
}
227227

228228
if component == registryFacade {
229229
err = checkRegistryFacade(ipAddress, port)
230230
if err != nil {
231231
log.WithError(err).Error("checking registry-facade")
232-
return reconcile.Result{RequeueAfter: time.Second * 10}, nil
232+
return reconcile.Result{RequeueAfter: defaultRequeueTime}, nil
233233
}
234234
}
235235

236-
time.Sleep(waitTimeout)
237-
238236
err = updateLabel(labelToUpdate, true, nodeName, r)
239237
if err != nil {
238+
log.WithError(err).Error("updating node label")
240239
return reconcile.Result{}, fmt.Errorf("trying to add the label: %v", err)
241240
}
242241

@@ -258,11 +257,6 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
258257
return err
259258
}
260259

261-
_, hasLabel := node.Labels[label]
262-
if add == hasLabel {
263-
return nil
264-
}
265-
266260
if add {
267261
node.Labels[label] = "true"
268262
log.WithField("label", label).WithField("node", nodeName).Info("adding label to node")
@@ -280,31 +274,14 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
280274
})
281275
}
282276

283-
func waitForTCPPortToBeReachable(host string, port string, timeout time.Duration) error {
284-
ctx, cancel := context.WithTimeout(context.Background(), timeout)
285-
defer cancel()
286-
287-
ticker := time.NewTicker(1 * time.Second)
288-
defer ticker.Stop()
289-
290-
for {
291-
select {
292-
case <-ctx.Done():
293-
return fmt.Errorf("port %v on host %v never reachable", port, host)
294-
case <-ticker.C:
295-
conn, err := net.DialTimeout("tcp", net.JoinHostPort(host, port), 500*time.Millisecond)
296-
if err != nil {
297-
continue
298-
}
299-
300-
if conn != nil {
301-
conn.Close()
302-
return nil
303-
}
304-
305-
continue
306-
}
277+
func checkTCPPortIsReachable(host string, port string) error {
278+
conn, err := net.DialTimeout("tcp", net.JoinHostPort(host, port), 1*time.Second)
279+
if err != nil {
280+
return err
307281
}
282+
defer conn.Close()
283+
284+
return nil
308285
}
309286

310287
func checkRegistryFacade(host, port string) error {

0 commit comments

Comments
 (0)