Skip to content

Commit 2621fd6

Browse files
committed
Machine controller: drain node before machine deletion
The node draining code itself is imported from github.com/openshift/kubernetes-drain. It is currently impossible to use the controller-runtime client for node draining because it lacks a Patch operation (kubernetes-sigs/controller-runtime#235), so the machine controller also needs to initialize a kube client in order to implement the node draining logic. Once the Patch operation is implemented, the draining logic can be updated to replace the kube client with the controller-runtime client. Also initialize an event recorder to generate node draining events.
1 parent 0c3e884 commit 2621fd6

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

pkg/controller/machine/BUILD.bazel

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,15 @@ go_library(
1313
"//pkg/apis/machine/v1beta1:go_default_library",
1414
"//pkg/controller/error:go_default_library",
1515
"//pkg/util:go_default_library",
16+
"//vendor/github.com/go-log/log/info:go_default_library",
17+
"//vendor/github.com/openshift/kubernetes-drain:go_default_library",
1618
"//vendor/k8s.io/api/core/v1:go_default_library",
1719
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
1820
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
1921
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
22+
"//vendor/k8s.io/client-go/kubernetes:go_default_library",
23+
"//vendor/k8s.io/client-go/rest:go_default_library",
24+
"//vendor/k8s.io/client-go/tools/record:go_default_library",
2025
"//vendor/k8s.io/klog:go_default_library",
2126
"//vendor/sigs.k8s.io/controller-runtime/pkg/client:go_default_library",
2227
"//vendor/sigs.k8s.io/controller-runtime/pkg/controller:go_default_library",

pkg/controller/machine/controller.go

Lines changed: 67 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,22 @@ package machine
1919
import (
2020
"context"
2121
"errors"
22+
"fmt"
2223
"os"
24+
"time"
2325

26+
"github.com/go-log/log/info"
2427
machinev1 "github.com/openshift/cluster-api/pkg/apis/machine/v1beta1"
2528
controllerError "github.com/openshift/cluster-api/pkg/controller/error"
2629
"github.com/openshift/cluster-api/pkg/util"
30+
kubedrain "github.com/openshift/kubernetes-drain"
2731
corev1 "k8s.io/api/core/v1"
2832
apierrors "k8s.io/apimachinery/pkg/api/errors"
2933
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3034
"k8s.io/apimachinery/pkg/runtime"
35+
"k8s.io/client-go/kubernetes"
36+
"k8s.io/client-go/rest"
37+
"k8s.io/client-go/tools/record"
3138
"k8s.io/klog"
3239
"sigs.k8s.io/controller-runtime/pkg/client"
3340
"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -37,7 +44,12 @@ import (
3744
"sigs.k8s.io/controller-runtime/pkg/source"
3845
)
3946

40-
const NodeNameEnvVar = "NODE_NAME"
47+
const (
48+
NodeNameEnvVar = "NODE_NAME"
49+
50+
// ExcludeNodeDrainingAnnotation annotation explicitly skips node draining if set
51+
ExcludeNodeDrainingAnnotation = "machine.openshift.io/exclude-node-draining"
52+
)
4153

4254
var DefaultActuator Actuator
4355

@@ -48,10 +60,12 @@ func AddWithActuator(mgr manager.Manager, actuator Actuator) error {
4860
// newReconciler returns a new reconcile.Reconciler
4961
func newReconciler(mgr manager.Manager, actuator Actuator) reconcile.Reconciler {
5062
r := &ReconcileMachine{
51-
Client: mgr.GetClient(),
52-
scheme: mgr.GetScheme(),
53-
nodeName: os.Getenv(NodeNameEnvVar),
54-
actuator: actuator,
63+
Client: mgr.GetClient(),
64+
eventRecorder: mgr.GetRecorder("machine-controller"),
65+
config: mgr.GetConfig(),
66+
scheme: mgr.GetScheme(),
67+
nodeName: os.Getenv(NodeNameEnvVar),
68+
actuator: actuator,
5569
}
5670

5771
if r.nodeName == "" {
@@ -83,8 +97,11 @@ var _ reconcile.Reconciler = &ReconcileMachine{}
8397
// ReconcileMachine reconciles a Machine object
8498
type ReconcileMachine struct {
8599
client.Client
100+
config *rest.Config
86101
scheme *runtime.Scheme
87102

103+
eventRecorder record.EventRecorder
104+
88105
actuator Actuator
89106

90107
// nodeName is the name of the node on which the machine controller is running; if not present, it is loaded from NODE_NAME.
@@ -145,6 +162,51 @@ func (r *ReconcileMachine) Reconcile(request reconcile.Request) (reconcile.Resul
145162
return reconcile.Result{}, nil
146163
}
147164
klog.Infof("reconciling machine object %v triggers delete.", name)
165+
166+
// Drain node before deletion
167+
// If a machine is not linked to a node, just delete the machine: a node
168+
// can be unlinked from a machine when the node goes NotReady and is removed
169+
// by the cloud controller manager, and in that case some machines would
170+
// never get deleted without manual intervention.
171+
if _, exists := m.ObjectMeta.Annotations[ExcludeNodeDrainingAnnotation]; !exists && m.Status.NodeRef != nil {
172+
if err := func() error {
173+
kubeClient, err := kubernetes.NewForConfig(r.config)
174+
if err != nil {
175+
return fmt.Errorf("unable to build kube client: %v", err)
176+
}
177+
node, err := kubeClient.CoreV1().Nodes().Get(m.Status.NodeRef.Name, metav1.GetOptions{})
178+
if err != nil {
179+
return fmt.Errorf("unable to get node %q: %v", m.Status.NodeRef.Name, err)
180+
}
181+
182+
if err := kubedrain.Drain(
183+
kubeClient,
184+
[]*corev1.Node{node},
185+
&kubedrain.DrainOptions{
186+
Force: true,
187+
IgnoreDaemonsets: true,
188+
DeleteLocalData: true,
189+
GracePeriodSeconds: -1,
190+
Logger: info.New(klog.V(0)),
191+
// If a pod is not evicted within 20 seconds, retry the eviction next time the
192+
// machine gets reconciled again (to allow other machines to be reconciled)
193+
Timeout: 20 * time.Second,
194+
},
195+
); err != nil {
196+
// Drain failed: do not delete the machine yet; requeue so the drain is retried
197+
klog.Warningf("drain failed for machine %q: %v", m.Name, err)
198+
return &controllerError.RequeueAfterError{RequeueAfter: 20 * time.Second}
199+
}
200+
201+
klog.Infof("drain successful for machine %q", m.Name)
202+
r.eventRecorder.Eventf(m, corev1.EventTypeNormal, "Deleted", "Node %q drained", node.Name)
203+
204+
return nil
205+
}(); err != nil {
206+
return reconcile.Result{}, err
207+
}
208+
}
209+
148210
if err := r.actuator.Delete(ctx, cluster, m); err != nil {
149211
klog.Errorf("Error deleting machine object %v; %v", name, err)
150212
if requeueErr, ok := err.(*controllerError.RequeueAfterError); ok {

0 commit comments

Comments
 (0)