@@ -18,22 +18,22 @@ import (
18
18
"github.com/spf13/cobra"
19
19
corev1 "k8s.io/api/core/v1"
20
20
"k8s.io/apimachinery/pkg/api/errors"
21
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21
22
"k8s.io/apimachinery/pkg/runtime"
22
23
"k8s.io/apimachinery/pkg/types"
23
24
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
24
25
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
25
26
_ "k8s.io/client-go/plugin/pkg/client/auth"
26
27
"k8s.io/client-go/util/retry"
28
+ "k8s.io/utils/pointer"
27
29
ctrl "sigs.k8s.io/controller-runtime"
30
+ "sigs.k8s.io/controller-runtime/pkg/builder"
28
31
"sigs.k8s.io/controller-runtime/pkg/client"
29
32
"sigs.k8s.io/controller-runtime/pkg/controller"
30
- "sigs.k8s.io/controller-runtime/pkg/event"
31
- "sigs.k8s.io/controller-runtime/pkg/handler"
32
33
"sigs.k8s.io/controller-runtime/pkg/healthz"
33
34
"sigs.k8s.io/controller-runtime/pkg/metrics"
34
35
"sigs.k8s.io/controller-runtime/pkg/predicate"
35
36
"sigs.k8s.io/controller-runtime/pkg/reconcile"
36
- "sigs.k8s.io/controller-runtime/pkg/source"
37
37
38
38
"github.com/gitpod-io/gitpod/common-go/log"
39
39
)
@@ -46,6 +46,8 @@ const (
46
46
wsDaemon = "ws-daemon"
47
47
)
48
48
49
+ var defaultRequeueTime = time .Second * 10
50
+
49
51
// runCmd represents the run command
50
52
var runCmd = & cobra.Command {
51
53
Use : "run" ,
@@ -60,6 +62,10 @@ var runCmd = &cobra.Command{
60
62
LeaderElection : true ,
61
63
LeaderElectionID : "node-labeler.gitpod.io" ,
62
64
Namespace : namespace ,
65
+ // default sync period is 10h.
66
+ // in case node-labeler is restarted and no change happens, we could waste (at least) 20m in a node
67
+ // that never will run workspaces and the additional nodes cluster-autoscaler adds to compensate
68
+ SyncPeriod : pointer .Duration (2 * time .Minute ),
63
69
})
64
70
if err != nil {
65
71
log .WithError (err ).Fatal ("unable to start node-labeber" )
@@ -74,35 +80,38 @@ var runCmd = &cobra.Command{
74
80
client ,
75
81
}
76
82
77
- c , err := controller .New ("pod-watcher" , mgr , controller.Options {
78
- Reconciler : r ,
79
- MaxConcurrentReconciles : 20 ,
83
+ rfPredicate , err := predicate .LabelSelectorPredicate (metav1.LabelSelector {
84
+ MatchLabels : map [string ]string {
85
+ "app" : "gitpod" ,
86
+ "component" : "registry-facade" ,
87
+ },
80
88
})
81
89
if err != nil {
82
- log .WithError (err ).Fatal ("unable to bind controller watch event handler " )
90
+ log .WithError (err ).Fatal ("unable to create predicate " )
83
91
}
84
92
85
- metrics .Registry .MustRegister (NodeLabelerCounterVec )
86
- metrics .Registry .MustRegister (NodeLabelerTimeHistVec )
87
-
88
- err = c .Watch (& source.Kind {Type : & corev1.Pod {}}, & handler.EnqueueRequestForObject {}, predicate.Funcs {
89
- CreateFunc : func (ce event.CreateEvent ) bool {
90
- return processPodEvent (ce .Object )
91
- },
92
- UpdateFunc : func (ue event.UpdateEvent ) bool {
93
- return processPodEvent (ue .ObjectNew )
94
- },
95
- DeleteFunc : func (deleteEvent event.DeleteEvent ) bool {
96
- return processPodEvent (deleteEvent .Object )
97
- },
98
- GenericFunc : func (genericEvent event.GenericEvent ) bool {
99
- return false
93
+ wsPredicate , err := predicate .LabelSelectorPredicate (metav1.LabelSelector {
94
+ MatchLabels : map [string ]string {
95
+ "app" : "gitpod" ,
96
+ "component" : "ws-daemon" ,
100
97
},
101
98
})
102
99
if err != nil {
103
- log .WithError (err ).Fatal ("unable to create controller" )
100
+ log .WithError (err ).Fatal ("unable to create predicate" )
101
+ }
102
+
103
+ err = ctrl .NewControllerManagedBy (mgr ).
104
+ Named ("pod-watcher" ).
105
+ For (& corev1.Pod {}, builder .WithPredicates (predicate .Or (rfPredicate , wsPredicate ))).
106
+ WithOptions (controller.Options {MaxConcurrentReconciles : 1 }).
107
+ Complete (r )
108
+ if err != nil {
109
+ log .WithError (err ).Fatal ("unable to bind controller watch event handler" )
104
110
}
105
111
112
+ metrics .Registry .MustRegister (NodeLabelerCounterVec )
113
+ metrics .Registry .MustRegister (NodeLabelerTimeHistVec )
114
+
106
115
err = mgr .AddHealthzCheck ("healthz" , healthz .Ping )
107
116
if err != nil {
108
117
log .WithError (err ).Fatal ("unable to set up health check" )
@@ -132,14 +141,6 @@ var (
132
141
scheme = runtime .NewScheme ()
133
142
)
134
143
135
- func processPodEvent (pod client.Object ) bool {
136
- if strings .HasPrefix (pod .GetName (), registryFacade ) || strings .HasPrefix (pod .GetName (), wsDaemon ) {
137
- return true
138
- }
139
-
140
- return false
141
- }
142
-
143
144
type PodReconciler struct {
144
145
client.Client
145
146
}
@@ -157,16 +158,14 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
157
158
158
159
nodeName := pod .Spec .NodeName
159
160
if nodeName == "" {
160
- return reconcile.Result {RequeueAfter : time . Second * 10 }, err
161
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
161
162
}
162
163
163
164
var (
164
165
ipAddress string
165
166
port string
166
167
component string
167
168
labelToUpdate string
168
-
169
- waitTimeout time.Duration = 5 * time .Second
170
169
)
171
170
172
171
switch {
@@ -181,7 +180,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
181
180
ipAddress = pod .Status .PodIP
182
181
port = strconv .Itoa (wsdaemonPort )
183
182
default :
184
- log . WithField ( "pod" , pod . Name ). Info ( "Invalid pod. Skipping..." )
183
+ // nothing to do
185
184
return reconcile.Result {}, nil
186
185
}
187
186
@@ -198,7 +197,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
198
197
}
199
198
200
199
log .WithError (err ).Error ("removing node label" )
201
- return reconcile.Result {RequeueAfter : time . Second * 10 }, err
200
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, err
202
201
}
203
202
204
203
return reconcile.Result {}, err
@@ -215,28 +214,28 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
215
214
return reconcile.Result {}, fmt .Errorf ("obtaining node %s: %w" , nodeName , err )
216
215
}
217
216
218
- if node .Labels [labelToUpdate ] == "true" {
219
- // Label already exists.
217
+ if labelValue , exists := node .Labels [labelToUpdate ]; exists && labelValue == "true" {
218
+ // nothing to do, the label already exists.
220
219
return reconcile.Result {}, nil
221
220
}
222
221
223
- err = waitForTCPPortToBeReachable (ipAddress , port , 30 * time . Second )
222
+ err = checkTCPPortIsReachable (ipAddress , port )
224
223
if err != nil {
225
- return reconcile.Result {}, fmt .Errorf ("waiting for TCP port: %v" , err )
224
+ log .WithField ("host" , ipAddress ).WithField ("port" , port ).WithField ("pod" , pod .Name ).WithError (err ).Error ("checking if TCP port is open" )
225
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
226
226
}
227
227
228
228
if component == registryFacade {
229
229
err = checkRegistryFacade (ipAddress , port )
230
230
if err != nil {
231
231
log .WithError (err ).Error ("checking registry-facade" )
232
- return reconcile.Result {RequeueAfter : time . Second * 10 }, nil
232
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
233
233
}
234
234
}
235
235
236
- time .Sleep (waitTimeout )
237
-
238
236
err = updateLabel (labelToUpdate , true , nodeName , r )
239
237
if err != nil {
238
+ log .WithError (err ).Error ("updating node label" )
240
239
return reconcile.Result {}, fmt .Errorf ("trying to add the label: %v" , err )
241
240
}
242
241
@@ -258,11 +257,6 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
258
257
return err
259
258
}
260
259
261
- _ , hasLabel := node .Labels [label ]
262
- if add == hasLabel {
263
- return nil
264
- }
265
-
266
260
if add {
267
261
node .Labels [label ] = "true"
268
262
log .WithField ("label" , label ).WithField ("node" , nodeName ).Info ("adding label to node" )
@@ -280,31 +274,14 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
280
274
})
281
275
}
282
276
283
- func waitForTCPPortToBeReachable (host string , port string , timeout time.Duration ) error {
284
- ctx , cancel := context .WithTimeout (context .Background (), timeout )
285
- defer cancel ()
286
-
287
- ticker := time .NewTicker (1 * time .Second )
288
- defer ticker .Stop ()
289
-
290
- for {
291
- select {
292
- case <- ctx .Done ():
293
- return fmt .Errorf ("port %v on host %v never reachable" , port , host )
294
- case <- ticker .C :
295
- conn , err := net .DialTimeout ("tcp" , net .JoinHostPort (host , port ), 500 * time .Millisecond )
296
- if err != nil {
297
- continue
298
- }
299
-
300
- if conn != nil {
301
- conn .Close ()
302
- return nil
303
- }
304
-
305
- continue
306
- }
277
+ func checkTCPPortIsReachable (host string , port string ) error {
278
+ conn , err := net .DialTimeout ("tcp" , net .JoinHostPort (host , port ), 1 * time .Second )
279
+ if err != nil {
280
+ return err
307
281
}
282
+ defer conn .Close ()
283
+
284
+ return nil
308
285
}
309
286
310
287
func checkRegistryFacade (host , port string ) error {
0 commit comments