Skip to content

Commit e338576

Browse files
committed
Add short-circuiting support and drop MDB
1 parent 2426fa7 commit e338576

File tree

6 files changed

+703
-76
lines changed

6 files changed

+703
-76
lines changed

cmd/machine-healthcheck/main.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"flag"
55
"runtime"
66

7-
"github.com/openshift/machine-api-operator/pkg/controller/disruption"
87
"github.com/openshift/machine-api-operator/pkg/controller/machinehealthcheck"
98

109
"k8s.io/klog"
@@ -62,7 +61,7 @@ func main() {
6261
}
6362

6463
// Setup all Controllers
65-
if err := controller.AddToManager(mgr, opts, machinehealthcheck.Add, disruption.Add); err != nil {
64+
if err := controller.AddToManager(mgr, opts, machinehealthcheck.Add); err != nil {
6665
glog.Fatal(err)
6766
}
6867

install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ spec:
3737
spec:
3838
description: Specification of machine health check policy
3939
properties:
40+
maxUnhealthy:
41+
anyOf:
42+
- type: string
43+
- type: integer
44+
description: Any farther remediation is only allowed if at most "MaxUnhealthy"
45+
machines selected by "selector" are not healthy.
4046
remediationStrategy:
4147
description: RemediationStrategy to use in case of problem detection
4248
default is machine deletion
@@ -117,6 +123,20 @@ spec:
117123
type: object
118124
status:
119125
description: Most recently observed status of MachineHealthCheck resource
126+
properties:
127+
currentHealthy:
128+
description: total number of machines counted by this machine health
129+
check
130+
minimum: 0
131+
type: integer
132+
expectedMachines:
133+
description: total number of machines counted by this machine health
134+
check
135+
minimum: 0
136+
type: integer
137+
required:
138+
- currentHealthy
139+
- expectedMachines
120140
type: object
121141
type: object
122142
version: v1alpha1

pkg/apis/healthchecking/v1alpha1/machinehealthcheck_types.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package v1alpha1
33
import (
44
corev1 "k8s.io/api/core/v1"
55
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
6+
"k8s.io/apimachinery/pkg/util/intstr"
67
)
78

89
// ConfigMapNodeUnhealthyConditions contains the name of the unhealthy conditions config map
@@ -54,6 +55,11 @@ type MachineHealthCheckSpec struct {
5455
//
5556
// +kubebuilder:validation:MinItems=1
5657
UnhealthyConditions []UnhealthyCondition `json:"unhealthyConditions"`
58+
59+
// Any farther remediation is only allowed if at most "MaxUnhealthy" machines selected by
60+
// "selector" are not healthy.
61+
// +optional
62+
MaxUnhealthy *intstr.IntOrString `json:"maxUnhealthy,omitempty"`
5763
}
5864

5965
// UnhealthyCondition represents a Node condition type and value with a timeout
@@ -81,5 +87,11 @@ type UnhealthyCondition struct {
8187

8288
// MachineHealthCheckStatus defines the observed state of MachineHealthCheck
8389
type MachineHealthCheckStatus struct {
84-
// TODO(alberto)
90+
// total number of machines counted by this machine health check
91+
// +kubebuilder:validation:Minimum=0
92+
ExpectedMachines int `json:"expectedMachines"`
93+
94+
// total number of machines counted by this machine health check
95+
// +kubebuilder:validation:Minimum=0
96+
CurrentHealthy int `json:"currentHealthy" protobuf:"varint,4,opt,name=currentHealthy"`
8597
}

pkg/apis/healthchecking/v1alpha1/zz_generated.deepcopy.go

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/controller/machinehealthcheck/machinehealthcheck_controller.go

Lines changed: 110 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"fmt"
66
"time"
77

8+
"k8s.io/apimachinery/pkg/util/intstr"
9+
810
"github.com/golang/glog"
911
mapiv1 "github.com/openshift/cluster-api/pkg/apis/machine/v1beta1"
1012
healthcheckingv1alpha1 "github.com/openshift/machine-api-operator/pkg/apis/healthchecking/v1alpha1"
@@ -104,52 +106,120 @@ func (r *ReconcileMachineHealthCheck) Reconcile(request reconcile.Request) (reco
104106
return reconcile.Result{}, err
105107
}
106108

109+
// fetch all targets
107110
glog.V(3).Infof("Reconciling %s: finding targets", request.String())
108111
targets, err := r.getTargetsFromMHC(*mhc)
109112
if err != nil {
110113
return reconcile.Result{}, err
111114
}
115+
totalTargets := len(targets)
116+
117+
// health check all targets and reconcile mhc status
118+
currentHealthy, needRemediationTargets, nextCheckTimes, errList := healthCheckTargets(targets)
119+
if err := r.reconcileStatus(mhc, totalTargets, currentHealthy); err != nil {
120+
glog.Errorf("Reconciling %s: error patching status: %v", request.String(), err)
121+
return reconcile.Result{}, err
122+
}
123+
124+
// check MHC current health against MaxUnhealthy
125+
if !isAllowedRemediation(mhc) {
126+
glog.Warningf("Reconciling %s: total targets: %v, maxUnhealthy: %v, unhealthy: %v. Short-circuiting remediation",
127+
request.String(),
128+
totalTargets,
129+
mhc.Spec.MaxUnhealthy,
130+
totalTargets-currentHealthy,
131+
)
132+
return reconcile.Result{Requeue: true}, nil
133+
}
134+
glog.V(3).Infof("Reconciling %s: monitoring MHC: total targets: %v, maxUnhealthy: %v, unhealthy: %v. Remediations are allowed",
135+
request.String(),
136+
totalTargets,
137+
mhc.Spec.MaxUnhealthy,
138+
totalTargets-currentHealthy,
139+
)
140+
141+
// remediate
142+
for _, t := range needRemediationTargets {
143+
glog.V(3).Infof("Reconciling %s: meet unhealthy criteria, triggers remediation", t.string())
144+
if err := t.remediate(r); err != nil {
145+
glog.Errorf("Reconciling %s: error remediating: %v", t.string(), err)
146+
errList = append(errList, err)
147+
}
148+
}
149+
150+
// return values
151+
if len(errList) > 0 {
152+
requeueError := apimachineryutilerrors.NewAggregate(errList)
153+
glog.V(3).Infof("Reconciling %s: there were errors, requeuing: %v", request.String(), requeueError)
154+
return reconcile.Result{}, requeueError
155+
}
156+
157+
if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
158+
glog.V(3).Infof("Reconciling %s: some targets might go unhealthy. Ensuring a requeue happens in %v", request.String(), minNextCheck)
159+
return reconcile.Result{RequeueAfter: minNextCheck}, nil
160+
}
161+
162+
glog.V(3).Infof("Reconciling %s: no more targets meet unhealthy criteria", request.String())
163+
return reconcile.Result{}, nil
164+
}
165+
166+
func isAllowedRemediation(mhc *healthcheckingv1alpha1.MachineHealthCheck) bool {
167+
if mhc.Spec.MaxUnhealthy == nil {
168+
return true
169+
}
170+
maxUnhealthy, err := intstr.GetValueFromIntOrPercent(mhc.Spec.MaxUnhealthy, mhc.Status.ExpectedMachines, false)
171+
if err != nil {
172+
glog.Errorf("%s: error decoding maxUnavailable, remediation won't be allowed: %v", namespacedName(mhc), err)
173+
return false
174+
}
175+
176+
// if noHealthy are above MaxUnavailable we short circuit any farther remediation
177+
noHealthy := mhc.Status.ExpectedMachines - mhc.Status.CurrentHealthy
178+
return (maxUnhealthy - noHealthy) >= 0
179+
}
180+
181+
func (r *ReconcileMachineHealthCheck) reconcileStatus(mhc *healthcheckingv1alpha1.MachineHealthCheck, targets, currentHealthy int) error {
182+
baseToPatch := client.MergeFrom(mhc.DeepCopy())
183+
mhc.Status.ExpectedMachines = targets
184+
mhc.Status.CurrentHealthy = currentHealthy
185+
if err := r.client.Status().Patch(context.Background(), mhc, baseToPatch); err != nil {
186+
return err
187+
}
188+
return nil
189+
}
112190

113-
// TODO: short circuit logic goes here:
114-
// Count all unhealthy targets, compare with allowed API field and update status
191+
// healthCheckTargets health checks a slice of targets
192+
// and gives a data to measure the average health
193+
func healthCheckTargets(targets []target) (int, []target, []time.Duration, []error) {
115194
var nextCheckTimes []time.Duration
116195
var errList []error
196+
var needRemediationTargets []target
197+
var currentHealthy int
117198
for _, t := range targets {
118199
glog.V(3).Infof("Reconciling %s: health checking", t.string())
119-
unhealthy, nextCheck, err := t.isUnhealthy()
200+
needsRemediation, nextCheck, err := t.needsRemediation()
120201
if err != nil {
121202
glog.Errorf("Reconciling %s: error health checking: %v", t.string(), err)
122203
errList = append(errList, err)
123204
continue
124205
}
125206

126-
if unhealthy {
127-
glog.V(3).Infof("Reconciling %s: meet unhealthy criteria, triggers remediation", t.string())
128-
if err := r.remediate(t); err != nil {
129-
glog.Errorf("Reconciling %s: error remediating: %v", t.string(), err)
130-
errList = append(errList, err)
131-
}
207+
if needsRemediation {
208+
needRemediationTargets = append(needRemediationTargets, t)
132209
continue
133210
}
211+
134212
if nextCheck > 0 {
135213
glog.V(3).Infof("Reconciling %s: is likely to go unhealthy in %v", t.string(), nextCheck)
136214
nextCheckTimes = append(nextCheckTimes, nextCheck)
215+
continue
137216
}
138-
}
139-
140-
if len(errList) > 0 {
141-
requeueError := apimachineryutilerrors.NewAggregate(errList)
142-
glog.V(3).Infof("Reconciling %s: there were errors, requeuing: %v", request.String(), requeueError)
143-
return reconcile.Result{}, requeueError
144-
}
145217

146-
if minNextCheck := minDuration(nextCheckTimes); minNextCheck > 0 {
147-
glog.V(3).Infof("Reconciling %s: some targets might go unhealthy. Ensuring a requeue happens in %v", request.String(), minNextCheck)
148-
return reconcile.Result{RequeueAfter: minNextCheck}, nil
218+
if t.Machine.DeletionTimestamp == nil {
219+
currentHealthy++
220+
}
149221
}
150-
151-
glog.V(3).Infof("Reconciling %s: no targets meet unhealthy criteria", request.String())
152-
return reconcile.Result{}, nil
222+
return currentHealthy, needRemediationTargets, nextCheckTimes, errList
153223
}
154224

155225
func (r *ReconcileMachineHealthCheck) getTargetsFromMHC(mhc healthcheckingv1alpha1.MachineHealthCheck) ([]target, error) {
@@ -283,7 +353,7 @@ func (r *ReconcileMachineHealthCheck) mhcRequestsFromMachine(o handler.MapObject
283353
return requests
284354
}
285355

286-
func (r *ReconcileMachineHealthCheck) remediate(t target) error {
356+
func (t *target) remediate(r *ReconcileMachineHealthCheck) error {
287357
glog.Infof(" %s: start remediation logic", t.string())
288358
if !t.hasMachineSetOwner() {
289359
glog.Infof("%s: no machineSet controller owner, skipping remediation", t.string())
@@ -292,7 +362,7 @@ func (r *ReconcileMachineHealthCheck) remediate(t target) error {
292362

293363
remediationStrategy := t.MHC.Spec.RemediationStrategy
294364
if remediationStrategy != nil && *remediationStrategy == remediationStrategyReboot {
295-
return r.remediationStrategyReboot(&t.Machine, t.Node)
365+
return t.remediationStrategyReboot(r)
296366
}
297367
if t.isMaster() {
298368
glog.Infof("%s: master node, skipping remediation", t.string())
@@ -306,19 +376,19 @@ func (r *ReconcileMachineHealthCheck) remediate(t target) error {
306376
return nil
307377
}
308378

309-
func (r *ReconcileMachineHealthCheck) remediationStrategyReboot(machine *mapiv1.Machine, node *corev1.Node) error {
379+
func (t *target) remediationStrategyReboot(r *ReconcileMachineHealthCheck) error {
310380
// we already have reboot annotation on the node, stop reconcile
311-
if _, ok := node.Annotations[machineRebootAnnotationKey]; ok {
381+
if _, ok := t.Node.Annotations[machineRebootAnnotationKey]; ok {
312382
return nil
313383
}
314384

315-
if node.Annotations == nil {
316-
node.Annotations = map[string]string{}
385+
if t.Node.Annotations == nil {
386+
t.Node.Annotations = map[string]string{}
317387
}
318388

319-
glog.Infof("Machine %s has been unhealthy for too long, adding reboot annotation", machine.Name)
320-
node.Annotations[machineRebootAnnotationKey] = ""
321-
if err := r.client.Update(context.TODO(), node); err != nil {
389+
glog.Infof("Machine %s has been unhealthy for too long, adding reboot annotation", t.Machine.Name)
390+
t.Node.Annotations[machineRebootAnnotationKey] = ""
391+
if err := r.client.Update(context.TODO(), t.Node); err != nil {
322392
return err
323393
}
324394
return nil
@@ -369,7 +439,7 @@ func (t *target) isMaster() bool {
369439
return false
370440
}
371441

372-
func (t *target) isUnhealthy() (bool, time.Duration, error) {
442+
func (t *target) needsRemediation() (bool, time.Duration, error) {
373443
var nextCheckTimes []time.Duration
374444
now := time.Now()
375445

@@ -448,6 +518,13 @@ func derefStringPointer(stringPointer *string) string {
448518
return ""
449519
}
450520

521+
func derefStringInt(intPointer *int) int {
522+
if intPointer != nil {
523+
return 0
524+
}
525+
return *intPointer
526+
}
527+
451528
func minDuration(durations []time.Duration) time.Duration {
452529
if len(durations) == 0 {
453530
return time.Duration(0)
@@ -477,7 +554,7 @@ func hasMatchingLabels(machineHealthCheck *healthcheckingv1alpha1.MachineHealthC
477554
}
478555
// If a deployment with a nil or empty selector creeps in, it should match nothing, not everything.
479556
if selector.Empty() {
480-
glog.V(2).Infof("%q machineHealthCheck has empty selector", machineHealthCheck.GetName())
557+
glog.V(3).Infof("%q machineHealthCheck has empty selector", machineHealthCheck.GetName())
481558
return false
482559
}
483560
if !selector.Matches(labels.Set(machine.Labels)) {

0 commit comments

Comments
 (0)