Skip to content

Commit 19cfd51

Browse files
committed
OCPBUGS-41981: (fix) registry pods do not come up again after node failure (#3366)
[PR 3201](operator-framework/operator-lifecycle-manager#3201) attempted to solve for the issue by deleting the pods stuck in `Terminating` due to unreachable node. However, the logic to do that was included in `EnsureRegistryServer`, which only gets executed if polling is requested by the user. This PR moves the logic of checking for dead pods out of `EnsureRegistryServer`, and puts it in `CheckRegistryServer` instead. This way, if there are any dead pods detected during `CheckRegistryServer`, the value of `healthy` is returned `false`, which in turn triggers `EnsureRegistryServer`. Upstream-repository: operator-lifecycle-manager Upstream-commit: f2431893193e7112f78298ad7682ff3e1b179d8c
1 parent de43551 commit 19cfd51

File tree

6 files changed

+187
-98
lines changed

6 files changed

+187
-98
lines changed

staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap.go

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,20 @@ package reconciler
33

44
import (
55
"context"
6+
"errors"
67
"fmt"
78

89
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
910
hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
10-
"github.com/pkg/errors"
11+
pkgerrors "github.com/pkg/errors"
1112
"github.com/sirupsen/logrus"
1213
corev1 "k8s.io/api/core/v1"
1314
rbacv1 "k8s.io/api/rbac/v1"
1415
apierrors "k8s.io/apimachinery/pkg/api/errors"
1516
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1617
"k8s.io/apimachinery/pkg/labels"
1718
"k8s.io/apimachinery/pkg/util/intstr"
19+
"k8s.io/utils/ptr"
1820

1921
"github.com/operator-framework/api/pkg/operators/v1alpha1"
2022
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -322,27 +324,27 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry,
322324

323325
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
324326
if err := c.ensureServiceAccount(source, overwrite); err != nil {
325-
return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
327+
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
326328
}
327329
if err := c.ensureRole(source, overwrite); err != nil {
328-
return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
330+
return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
329331
}
330332
if err := c.ensureRoleBinding(source, overwrite); err != nil {
331-
return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
333+
return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
332334
}
333335
pod, err := source.Pod(image)
334336
if err != nil {
335337
return err
336338
}
337339
if err := c.ensurePod(source, overwritePod); err != nil {
338-
return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
340+
return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
339341
}
340342
service, err := source.Service()
341343
if err != nil {
342344
return err
343345
}
344346
if err := c.ensureService(source, overwrite); err != nil {
345-
return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
347+
return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
346348
}
347349

348350
if overwritePod {
@@ -415,15 +417,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
415417
}
416418
for _, p := range currentPods {
417419
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
418-
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
420+
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
419421
}
420422
}
421423
}
422424
_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
423425
if err == nil {
424426
return nil
425427
}
426-
return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
428+
return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
427429
}
428430

429431
func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -502,6 +504,34 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(logger *logrus.Entry,
502504
return
503505
}
504506

505-
healthy = true
506-
return
507+
podsAreLive, e := detectAndDeleteDeadPods(logger, c.OpClient, pods, source.GetNamespace())
508+
if e != nil {
509+
return false, fmt.Errorf("error deleting dead pods: %v", e)
510+
}
511+
return podsAreLive, nil
512+
}
513+
514+
// detectAndDeleteDeadPods determines if there are registry client pods that are in the deleted state
515+
// but have not been removed by GC (eg the node goes down before GC can remove them), and attempts to
516+
// force delete the pods. If there are live registry pods remaining, it returns true, otherwise returns false.
517+
func detectAndDeleteDeadPods(logger *logrus.Entry, client operatorclient.ClientInterface, pods []*corev1.Pod, sourceNamespace string) (bool, error) {
518+
var forceDeletionErrs []error
519+
livePodFound := false
520+
for _, pod := range pods {
521+
if !isPodDead(pod) {
522+
livePodFound = true
523+
logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Debug("pod is alive")
524+
continue
525+
}
526+
logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Info("force deleting dead pod")
527+
if err := client.KubernetesInterface().CoreV1().Pods(sourceNamespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
528+
GracePeriodSeconds: ptr.To[int64](0),
529+
}); err != nil && !apierrors.IsNotFound(err) {
530+
forceDeletionErrs = append(forceDeletionErrs, err)
531+
}
532+
}
533+
if len(forceDeletionErrs) > 0 {
534+
return false, errors.Join(forceDeletionErrs...)
535+
}
536+
return livePodFound, nil
507537
}

staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/configmap_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,3 +515,55 @@ func TestConfigMapRegistryReconciler(t *testing.T) {
515515
})
516516
}
517517
}
518+
519+
func TestConfigMapRegistryChecker(t *testing.T) {
520+
validConfigMap := validConfigMap()
521+
validCatalogSource := validConfigMapCatalogSource(validConfigMap)
522+
type cluster struct {
523+
k8sObjs []runtime.Object
524+
}
525+
type in struct {
526+
cluster cluster
527+
catsrc *v1alpha1.CatalogSource
528+
}
529+
type out struct {
530+
healthy bool
531+
err error
532+
}
533+
tests := []struct {
534+
testName string
535+
in in
536+
out out
537+
}{
538+
{
539+
testName: "ConfigMap/ExistingRegistry/DeadPod",
540+
in: in{
541+
cluster: cluster{
542+
k8sObjs: append(withPodDeletedButNotRemoved(objectsForCatalogSource(t, validCatalogSource)), validConfigMap),
543+
},
544+
catsrc: validCatalogSource,
545+
},
546+
out: out{
547+
healthy: false,
548+
},
549+
},
550+
}
551+
for _, tt := range tests {
552+
t.Run(tt.testName, func(t *testing.T) {
553+
stopc := make(chan struct{})
554+
defer close(stopc)
555+
556+
factory, _ := fakeReconcilerFactory(t, stopc, withK8sObjs(tt.in.cluster.k8sObjs...))
557+
rec := factory.ReconcilerForSource(tt.in.catsrc)
558+
559+
healthy, err := rec.CheckRegistryServer(logrus.NewEntry(logrus.New()), tt.in.catsrc)
560+
561+
require.Equal(t, tt.out.err, err)
562+
if tt.out.err != nil {
563+
return
564+
}
565+
566+
require.Equal(t, tt.out.healthy, healthy)
567+
})
568+
}
569+
}

staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc.go

Lines changed: 13 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,25 @@ package reconciler
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
7-
86
"strings"
97
"time"
108

119
"github.com/google/go-cmp/cmp"
1210
"github.com/operator-framework/api/pkg/operators/v1alpha1"
11+
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
12+
controllerclient "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/controller-runtime/client"
13+
hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
14+
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
15+
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorlister"
16+
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/ownerutil"
1317
pkgerrors "github.com/pkg/errors"
1418
"github.com/sirupsen/logrus"
1519
corev1 "k8s.io/api/core/v1"
1620
apierrors "k8s.io/apimachinery/pkg/api/errors"
1721
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1822
"k8s.io/apimachinery/pkg/labels"
1923
"k8s.io/apimachinery/pkg/util/intstr"
20-
"k8s.io/utils/ptr"
21-
22-
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
23-
controllerclient "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/controller-runtime/client"
24-
hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
25-
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
26-
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorlister"
27-
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/ownerutil"
2824
)
2925

3026
const (
@@ -343,31 +339,6 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
343339
func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCatalogSourceDecorator, serviceAccount *corev1.ServiceAccount, overwrite bool) error {
344340
// currentPods refers to the current pod instances of the catalog source
345341
currentPods := c.currentPods(logger, source)
346-
347-
var forceDeleteErrs []error
348-
// Remove dead pods from the slice without allocating a new slice
349-
// See https://go.dev/wiki/SliceTricks#filtering-without-allocating
350-
tmpSlice := currentPods[:0]
351-
for _, pod := range currentPods {
352-
if !isPodDead(pod) {
353-
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Debug("pod is alive")
354-
tmpSlice = append(tmpSlice, pod)
355-
continue
356-
}
357-
358-
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
359-
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
360-
GracePeriodSeconds: ptr.To[int64](0),
361-
}); err != nil && !apierrors.IsNotFound(err) {
362-
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
363-
}
364-
}
365-
currentPods = tmpSlice
366-
367-
if len(forceDeleteErrs) > 0 {
368-
return errors.Join(forceDeleteErrs...)
369-
}
370-
371342
if len(currentPods) > 0 {
372343
if !overwrite {
373344
return nil
@@ -624,16 +595,19 @@ func (c *GrpcRegistryReconciler) CheckRegistryServer(logger *logrus.Entry, catal
624595
if err != nil {
625596
return false, err
626597
}
627-
current, err := c.currentPodsWithCorrectImageAndSpec(logger, source, serviceAccount)
598+
currentPods, err := c.currentPodsWithCorrectImageAndSpec(logger, source, serviceAccount)
628599
if err != nil {
629600
return false, err
630601
}
631-
if len(current) < 1 ||
602+
if len(currentPods) < 1 ||
632603
service == nil || c.currentServiceAccount(source) == nil {
633604
return false, nil
634605
}
635-
636-
return true, nil
606+
podsAreLive, e := detectAndDeleteDeadPods(logger, c.OpClient, currentPods, source.GetNamespace())
607+
if e != nil {
608+
return false, fmt.Errorf("error deleting dead pods: %v", e)
609+
}
610+
return podsAreLive, nil
637611
}
638612

639613
// promoteCatalog swaps the labels on the update pod so that the update pod is now reachable by the catalog service.

staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,23 @@ func grpcCatalogSourceWithName(name string) *v1alpha1.CatalogSource {
7272
return catsrc
7373
}
7474

75+
func withPodDeletedButNotRemoved(objs []runtime.Object) []runtime.Object {
76+
var out []runtime.Object
77+
for _, obj := range objs {
78+
o := obj.DeepCopyObject()
79+
if pod, ok := obj.(*corev1.Pod); ok {
80+
pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
81+
pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
82+
Type: corev1.DisruptionTarget,
83+
Reason: "DeletionByTaintManager",
84+
Status: corev1.ConditionTrue,
85+
})
86+
o = pod
87+
}
88+
out = append(out, o)
89+
}
90+
return out
91+
}
7592
func TestGrpcRegistryReconciler(t *testing.T) {
7693
now := func() metav1.Time { return metav1.Date(2018, time.January, 26, 20, 40, 0, 0, time.UTC) }
7794
blockOwnerDeletion := true
@@ -545,6 +562,18 @@ func TestGrpcRegistryChecker(t *testing.T) {
545562
healthy: false,
546563
},
547564
},
565+
{
566+
testName: "Grpc/ExistingRegistry/Image/DeadPod",
567+
in: in{
568+
cluster: cluster{
569+
k8sObjs: withPodDeletedButNotRemoved(objectsForCatalogSource(t, validGrpcCatalogSource("test-img", ""))),
570+
},
571+
catsrc: validGrpcCatalogSource("test-img", ""),
572+
},
573+
out: out{
574+
healthy: false,
575+
},
576+
},
548577
{
549578
testName: "Grpc/ExistingRegistry/Image/OldPod/NotHealthy",
550579
in: in{

0 commit comments

Comments
 (0)