@@ -18,22 +18,22 @@ import (
18
18
"github.com/spf13/cobra"
19
19
corev1 "k8s.io/api/core/v1"
20
20
"k8s.io/apimachinery/pkg/api/errors"
21
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21
22
"k8s.io/apimachinery/pkg/runtime"
22
23
"k8s.io/apimachinery/pkg/types"
23
24
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
24
25
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
25
26
_ "k8s.io/client-go/plugin/pkg/client/auth"
26
27
"k8s.io/client-go/util/retry"
28
+ "k8s.io/utils/pointer"
27
29
ctrl "sigs.k8s.io/controller-runtime"
30
+ "sigs.k8s.io/controller-runtime/pkg/builder"
28
31
"sigs.k8s.io/controller-runtime/pkg/client"
29
32
"sigs.k8s.io/controller-runtime/pkg/controller"
30
- "sigs.k8s.io/controller-runtime/pkg/event"
31
- "sigs.k8s.io/controller-runtime/pkg/handler"
32
33
"sigs.k8s.io/controller-runtime/pkg/healthz"
33
34
"sigs.k8s.io/controller-runtime/pkg/metrics"
34
35
"sigs.k8s.io/controller-runtime/pkg/predicate"
35
36
"sigs.k8s.io/controller-runtime/pkg/reconcile"
36
- "sigs.k8s.io/controller-runtime/pkg/source"
37
37
38
38
"github.com/gitpod-io/gitpod/common-go/log"
39
39
)
@@ -46,6 +46,8 @@ const (
46
46
wsDaemon = "ws-daemon"
47
47
)
48
48
49
+ var defaultRequeueTime = time .Second * 10
50
+
49
51
// runCmd represents the run command
50
52
var runCmd = & cobra.Command {
51
53
Use : "run" ,
@@ -60,6 +62,10 @@ var runCmd = &cobra.Command{
60
62
LeaderElection : true ,
61
63
LeaderElectionID : "node-labeler.gitpod.io" ,
62
64
Namespace : namespace ,
65
+ // default sync period is 10h.
66
+ // in case node-labeler is restarted and no change happens, we could waste (at least) 20m on a node
67
+ // that will never run workspaces, plus the additional nodes cluster-autoscaler adds to compensate
68
+ SyncPeriod : pointer .Duration (2 * time .Minute ),
63
69
})
64
70
if err != nil {
65
71
log .WithError (err ).Fatal ("unable to start node-labeber" )
@@ -74,35 +80,29 @@ var runCmd = &cobra.Command{
74
80
client ,
75
81
}
76
82
77
- c , err := controller .New ("pod-watcher" , mgr , controller.Options {
78
- Reconciler : r ,
79
- MaxConcurrentReconciles : 20 ,
83
+ componentPredicate , err := predicate .LabelSelectorPredicate (metav1.LabelSelector {
84
+ MatchExpressions : []metav1.LabelSelectorRequirement {{
85
+ Key : "component" ,
86
+ Operator : metav1 .LabelSelectorOpIn ,
87
+ Values : []string {"ws-daemon" , "registry-facade" },
88
+ }},
80
89
})
90
+ if err != nil {
91
+ log .WithError (err ).Fatal ("unable to create predicate" )
92
+ }
93
+
94
+ err = ctrl .NewControllerManagedBy (mgr ).
95
+ Named ("pod-watcher" ).
96
+ For (& corev1.Pod {}, builder .WithPredicates (predicate .Or (componentPredicate ))).
97
+ WithOptions (controller.Options {MaxConcurrentReconciles : 1 }).
98
+ Complete (r )
81
99
if err != nil {
82
100
log .WithError (err ).Fatal ("unable to bind controller watch event handler" )
83
101
}
84
102
85
103
metrics .Registry .MustRegister (NodeLabelerCounterVec )
86
104
metrics .Registry .MustRegister (NodeLabelerTimeHistVec )
87
105
88
- err = c .Watch (& source.Kind {Type : & corev1.Pod {}}, & handler.EnqueueRequestForObject {}, predicate.Funcs {
89
- CreateFunc : func (ce event.CreateEvent ) bool {
90
- return processPodEvent (ce .Object )
91
- },
92
- UpdateFunc : func (ue event.UpdateEvent ) bool {
93
- return processPodEvent (ue .ObjectNew )
94
- },
95
- DeleteFunc : func (deleteEvent event.DeleteEvent ) bool {
96
- return processPodEvent (deleteEvent .Object )
97
- },
98
- GenericFunc : func (genericEvent event.GenericEvent ) bool {
99
- return false
100
- },
101
- })
102
- if err != nil {
103
- log .WithError (err ).Fatal ("unable to create controller" )
104
- }
105
-
106
106
err = mgr .AddHealthzCheck ("healthz" , healthz .Ping )
107
107
if err != nil {
108
108
log .WithError (err ).Fatal ("unable to set up health check" )
@@ -132,14 +132,6 @@ var (
132
132
scheme = runtime .NewScheme ()
133
133
)
134
134
135
- func processPodEvent (pod client.Object ) bool {
136
- if strings .HasPrefix (pod .GetName (), registryFacade ) || strings .HasPrefix (pod .GetName (), wsDaemon ) {
137
- return true
138
- }
139
-
140
- return false
141
- }
142
-
143
135
type PodReconciler struct {
144
136
client.Client
145
137
}
@@ -157,16 +149,14 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
157
149
158
150
nodeName := pod .Spec .NodeName
159
151
if nodeName == "" {
160
- return reconcile.Result {RequeueAfter : time . Second * 10 }, err
152
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
161
153
}
162
154
163
155
var (
164
156
ipAddress string
165
157
port string
166
158
component string
167
159
labelToUpdate string
168
-
169
- waitTimeout time.Duration = 5 * time .Second
170
160
)
171
161
172
162
switch {
@@ -181,7 +171,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
181
171
ipAddress = pod .Status .PodIP
182
172
port = strconv .Itoa (wsdaemonPort )
183
173
default :
184
- log . WithField ( "pod" , pod . Name ). Info ( "Invalid pod. Skipping..." )
174
+ // nothing to do
185
175
return reconcile.Result {}, nil
186
176
}
187
177
@@ -198,7 +188,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
198
188
}
199
189
200
190
log .WithError (err ).Error ("removing node label" )
201
- return reconcile.Result {RequeueAfter : time . Second * 10 }, err
191
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, err
202
192
}
203
193
204
194
return reconcile.Result {}, err
@@ -215,28 +205,28 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
215
205
return reconcile.Result {}, fmt .Errorf ("obtaining node %s: %w" , nodeName , err )
216
206
}
217
207
218
- if node .Labels [labelToUpdate ] == "true" {
219
- // Label already exists.
208
+ if labelValue , exists := node .Labels [labelToUpdate ]; exists && labelValue == "true" {
209
+ // nothing to do, the label already exists.
220
210
return reconcile.Result {}, nil
221
211
}
222
212
223
- err = waitForTCPPortToBeReachable (ipAddress , port , 30 * time . Second )
213
+ err = checkTCPPortIsReachable (ipAddress , port )
224
214
if err != nil {
225
- return reconcile.Result {}, fmt .Errorf ("waiting for TCP port: %v" , err )
215
+ log .WithField ("host" , ipAddress ).WithField ("port" , port ).WithField ("pod" , pod .Name ).WithError (err ).Error ("checking if TCP port is open" )
216
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
226
217
}
227
218
228
219
if component == registryFacade {
229
220
err = checkRegistryFacade (ipAddress , port )
230
221
if err != nil {
231
222
log .WithError (err ).Error ("checking registry-facade" )
232
- return reconcile.Result {RequeueAfter : time . Second * 10 }, nil
223
+ return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
233
224
}
234
225
}
235
226
236
- time .Sleep (waitTimeout )
237
-
238
227
err = updateLabel (labelToUpdate , true , nodeName , r )
239
228
if err != nil {
229
+ log .WithError (err ).Error ("updating node label" )
240
230
return reconcile.Result {}, fmt .Errorf ("trying to add the label: %v" , err )
241
231
}
242
232
@@ -258,11 +248,6 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
258
248
return err
259
249
}
260
250
261
- _ , hasLabel := node .Labels [label ]
262
- if add == hasLabel {
263
- return nil
264
- }
265
-
266
251
if add {
267
252
node .Labels [label ] = "true"
268
253
log .WithField ("label" , label ).WithField ("node" , nodeName ).Info ("adding label to node" )
@@ -280,31 +265,14 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
280
265
})
281
266
}
282
267
283
- func waitForTCPPortToBeReachable (host string , port string , timeout time.Duration ) error {
284
- ctx , cancel := context .WithTimeout (context .Background (), timeout )
285
- defer cancel ()
286
-
287
- ticker := time .NewTicker (1 * time .Second )
288
- defer ticker .Stop ()
289
-
290
- for {
291
- select {
292
- case <- ctx .Done ():
293
- return fmt .Errorf ("port %v on host %v never reachable" , port , host )
294
- case <- ticker .C :
295
- conn , err := net .DialTimeout ("tcp" , net .JoinHostPort (host , port ), 500 * time .Millisecond )
296
- if err != nil {
297
- continue
298
- }
299
-
300
- if conn != nil {
301
- conn .Close ()
302
- return nil
303
- }
304
-
305
- continue
306
- }
268
+ func checkTCPPortIsReachable (host string , port string ) error {
269
+ conn , err := net .DialTimeout ("tcp" , net .JoinHostPort (host , port ), 1 * time .Second )
270
+ if err != nil {
271
+ return err
307
272
}
273
+ defer conn .Close ()
274
+
275
+ return nil
308
276
}
309
277
310
278
func checkRegistryFacade (host , port string ) error {
0 commit comments