5
5
"fmt"
6
6
"time"
7
7
8
+ "k8s.io/apimachinery/pkg/util/intstr"
9
+
8
10
"github.com/golang/glog"
9
11
mapiv1 "github.com/openshift/cluster-api/pkg/apis/machine/v1beta1"
10
12
healthcheckingv1alpha1 "github.com/openshift/machine-api-operator/pkg/apis/healthchecking/v1alpha1"
@@ -104,52 +106,120 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco
104
106
return reconcile.Result {}, err
105
107
}
106
108
109
+ // fetch all targets
107
110
glog .V (3 ).Infof ("Reconciling %s: finding targets" , request .String ())
108
111
targets , err := r .getTargetsFromMHC (* mhc )
109
112
if err != nil {
110
113
return reconcile.Result {}, err
111
114
}
115
+ totalTargets := len (targets )
116
+
117
+ // health check all targets and reconcile mhc status
118
+ currentHealthy , needRemediationTargets , nextCheckTimes , errList := healthCheckTargets (targets )
119
+ if err := r .reconcileStatus (mhc , totalTargets , currentHealthy ); err != nil {
120
+ glog .Errorf ("Reconciling %s: error patching status: %v" , request .String (), err )
121
+ return reconcile.Result {}, err
122
+ }
123
+
124
+ // check MHC current health against MaxUnhealthy
125
+ if ! isAllowedRemediation (mhc ) {
126
+ glog .Warningf ("Reconciling %s: total targets: %v, maxUnhealthy: %v, unhealthy: %v. Short-circuiting remediation" ,
127
+ request .String (),
128
+ totalTargets ,
129
+ mhc .Spec .MaxUnhealthy ,
130
+ totalTargets - currentHealthy ,
131
+ )
132
+ return reconcile.Result {Requeue : true }, nil
133
+ }
134
+ glog .V (3 ).Infof ("Reconciling %s: monitoring MHC: total targets: %v, maxUnhealthy: %v, unhealthy: %v. Remediations are allowed" ,
135
+ request .String (),
136
+ totalTargets ,
137
+ mhc .Spec .MaxUnhealthy ,
138
+ totalTargets - currentHealthy ,
139
+ )
140
+
141
+ // remediate
142
+ for _ , t := range needRemediationTargets {
143
+ glog .V (3 ).Infof ("Reconciling %s: meet unhealthy criteria, triggers remediation" , t .string ())
144
+ if err := t .remediate (r ); err != nil {
145
+ glog .Errorf ("Reconciling %s: error remediating: %v" , t .string (), err )
146
+ errList = append (errList , err )
147
+ }
148
+ }
149
+
150
+ // return values
151
+ if len (errList ) > 0 {
152
+ requeueError := apimachineryutilerrors .NewAggregate (errList )
153
+ glog .V (3 ).Infof ("Reconciling %s: there were errors, requeuing: %v" , request .String (), requeueError )
154
+ return reconcile.Result {}, requeueError
155
+ }
156
+
157
+ if minNextCheck := minDuration (nextCheckTimes ); minNextCheck > 0 {
158
+ glog .V (3 ).Infof ("Reconciling %s: some targets might go unhealthy. Ensuring a requeue happens in %v" , request .String (), minNextCheck )
159
+ return reconcile.Result {RequeueAfter : minNextCheck }, nil
160
+ }
161
+
162
+ glog .V (3 ).Infof ("Reconciling %s: no more targets meet unhealthy criteria" , request .String ())
163
+ return reconcile.Result {}, nil
164
+ }
165
+
166
+ func isAllowedRemediation (mhc * healthcheckingv1alpha1.MachineHealthCheck ) bool {
167
+ if mhc .Spec .MaxUnhealthy == nil {
168
+ return true
169
+ }
170
+ maxUnhealthy , err := intstr .GetValueFromIntOrPercent (mhc .Spec .MaxUnhealthy , mhc .Status .ExpectedMachines , false )
171
+ if err != nil {
172
+ glog .Errorf ("%s: error decoding maxUnavailable, remediation won't be allowed: %v" , namespacedName (mhc ), err )
173
+ return false
174
+ }
175
+
176
+ // if noHealthy are above MaxUnavailable we short circuit any farther remediation
177
+ noHealthy := mhc .Status .ExpectedMachines - mhc .Status .CurrentHealthy
178
+ return (maxUnhealthy - noHealthy ) >= 0
179
+ }
180
+
181
+ func (r * ReconcileMachineHealthCheck ) reconcileStatus (mhc * healthcheckingv1alpha1.MachineHealthCheck , targets , currentHealthy int ) error {
182
+ baseToPatch := client .MergeFrom (mhc .DeepCopy ())
183
+ mhc .Status .ExpectedMachines = targets
184
+ mhc .Status .CurrentHealthy = currentHealthy
185
+ if err := r .client .Status ().Patch (context .Background (), mhc , baseToPatch ); err != nil {
186
+ return err
187
+ }
188
+ return nil
189
+ }
112
190
113
- // TODO: short circuit logic goes here:
114
- // Count all unhealthy targets, compare with allowed API field and update status
191
+ // healthCheckTargets health checks a slice of targets
192
+ // and returns the data needed to measure their overall health
193
+ func healthCheckTargets (targets []target ) (int , []target , []time.Duration , []error ) {
115
194
var nextCheckTimes []time.Duration
116
195
var errList []error
196
+ var needRemediationTargets []target
197
+ var currentHealthy int
117
198
for _ , t := range targets {
118
199
glog .V (3 ).Infof ("Reconciling %s: health checking" , t .string ())
119
- unhealthy , nextCheck , err := t .isUnhealthy ()
200
+ needsRemediation , nextCheck , err := t .needsRemediation ()
120
201
if err != nil {
121
202
glog .Errorf ("Reconciling %s: error health checking: %v" , t .string (), err )
122
203
errList = append (errList , err )
123
204
continue
124
205
}
125
206
126
- if unhealthy {
127
- glog .V (3 ).Infof ("Reconciling %s: meet unhealthy criteria, triggers remediation" , t .string ())
128
- if err := r .remediate (t ); err != nil {
129
- glog .Errorf ("Reconciling %s: error remediating: %v" , t .string (), err )
130
- errList = append (errList , err )
131
- }
207
+ if needsRemediation {
208
+ needRemediationTargets = append (needRemediationTargets , t )
132
209
continue
133
210
}
211
+
134
212
if nextCheck > 0 {
135
213
glog .V (3 ).Infof ("Reconciling %s: is likely to go unhealthy in %v" , t .string (), nextCheck )
136
214
nextCheckTimes = append (nextCheckTimes , nextCheck )
215
+ continue
137
216
}
138
- }
139
-
140
- if len (errList ) > 0 {
141
- requeueError := apimachineryutilerrors .NewAggregate (errList )
142
- glog .V (3 ).Infof ("Reconciling %s: there were errors, requeuing: %v" , request .String (), requeueError )
143
- return reconcile.Result {}, requeueError
144
- }
145
217
146
- if minNextCheck := minDuration ( nextCheckTimes ); minNextCheck > 0 {
147
- glog . V ( 3 ). Infof ( "Reconciling %s: some targets might go unhealthy. Ensuring a requeue happens in %v" , request . String (), minNextCheck )
148
- return reconcile. Result { RequeueAfter : minNextCheck }, nil
218
+ if t . Machine . DeletionTimestamp == nil {
219
+ currentHealthy ++
220
+ }
149
221
}
150
-
151
- glog .V (3 ).Infof ("Reconciling %s: no targets meet unhealthy criteria" , request .String ())
152
- return reconcile.Result {}, nil
222
+ return currentHealthy , needRemediationTargets , nextCheckTimes , errList
153
223
}
154
224
155
225
func (r * ReconcileMachineHealthCheck ) getTargetsFromMHC (mhc healthcheckingv1alpha1.MachineHealthCheck ) ([]target , error ) {
@@ -283,7 +353,7 @@ func (r *ReconcileMachineHealthCheck) mhcRequestsFromMachine(o handler.MapObject
283
353
return requests
284
354
}
285
355
286
- func (r * ReconcileMachineHealthCheck ) remediate (t target ) error {
356
+ func (t * target ) remediate (r * ReconcileMachineHealthCheck ) error {
287
357
glog .Infof (" %s: start remediation logic" , t .string ())
288
358
if ! t .hasMachineSetOwner () {
289
359
glog .Infof ("%s: no machineSet controller owner, skipping remediation" , t .string ())
@@ -292,7 +362,7 @@ func (r *ReconcileMachineHealthCheck) remediate(t target) error {
292
362
293
363
remediationStrategy := t .MHC .Spec .RemediationStrategy
294
364
if remediationStrategy != nil && * remediationStrategy == remediationStrategyReboot {
295
- return r .remediationStrategyReboot (& t . Machine , t . Node )
365
+ return t .remediationStrategyReboot (r )
296
366
}
297
367
if t .isMaster () {
298
368
glog .Infof ("%s: master node, skipping remediation" , t .string ())
@@ -306,19 +376,19 @@ func (r *ReconcileMachineHealthCheck) remediate(t target) error {
306
376
return nil
307
377
}
308
378
309
- func (r * ReconcileMachineHealthCheck ) remediationStrategyReboot (machine * mapiv1. Machine , node * corev1. Node ) error {
379
+ func (t * target ) remediationStrategyReboot (r * ReconcileMachineHealthCheck ) error {
310
380
// we already have reboot annotation on the node, stop reconcile
311
- if _ , ok := node .Annotations [machineRebootAnnotationKey ]; ok {
381
+ if _ , ok := t . Node .Annotations [machineRebootAnnotationKey ]; ok {
312
382
return nil
313
383
}
314
384
315
- if node .Annotations == nil {
316
- node .Annotations = map [string ]string {}
385
+ if t . Node .Annotations == nil {
386
+ t . Node .Annotations = map [string ]string {}
317
387
}
318
388
319
- glog .Infof ("Machine %s has been unhealthy for too long, adding reboot annotation" , machine .Name )
320
- node .Annotations [machineRebootAnnotationKey ] = ""
321
- if err := r .client .Update (context .TODO (), node ); err != nil {
389
+ glog .Infof ("Machine %s has been unhealthy for too long, adding reboot annotation" , t . Machine .Name )
390
+ t . Node .Annotations [machineRebootAnnotationKey ] = ""
391
+ if err := r .client .Update (context .TODO (), t . Node ); err != nil {
322
392
return err
323
393
}
324
394
return nil
@@ -369,7 +439,7 @@ func (t *target) isMaster() bool {
369
439
return false
370
440
}
371
441
372
- func (t * target ) isUnhealthy () (bool , time.Duration , error ) {
442
+ func (t * target ) needsRemediation () (bool , time.Duration , error ) {
373
443
var nextCheckTimes []time.Duration
374
444
now := time .Now ()
375
445
@@ -448,6 +518,13 @@ func derefStringPointer(stringPointer *string) string {
448
518
return ""
449
519
}
450
520
521
// derefStringInt returns the value intPointer points to, or 0 when
// intPointer is nil.
func derefStringInt(intPointer *int) int {
	// Guard against nil before dereferencing; a nil pointer maps to the
	// zero value.
	if intPointer == nil {
		return 0
	}
	return *intPointer
}
527
+
451
528
func minDuration (durations []time.Duration ) time.Duration {
452
529
if len (durations ) == 0 {
453
530
return time .Duration (0 )
@@ -477,7 +554,7 @@ func hasMatchingLabels(machineHealthCheck *healthcheckingv1alpha1.MachineHealthC
477
554
}
478
555
// If a deployment with a nil or empty selector creeps in, it should match nothing, not everything.
479
556
if selector .Empty () {
480
- glog .V (2 ).Infof ("%q machineHealthCheck has empty selector" , machineHealthCheck .GetName ())
557
+ glog .V (3 ).Infof ("%q machineHealthCheck has empty selector" , machineHealthCheck .GetName ())
481
558
return false
482
559
}
483
560
if ! selector .Matches (labels .Set (machine .Labels )) {
0 commit comments