Skip to content

Commit d8a76e9

Browse files
joelanfordPer Goncalves da Silva
authored andcommitted
OCPBUGS-35305: [release-4.15] catalog-operator: delete catalog pods stuck in Terminating state due to unreachable node
Signed-off-by: Joe Lanford <[email protected]> Upstream-repository: operator-lifecycle-manager Upstream-commit: 68c24cf94371f7a73619966f9e49612fd16fa76a Signed-off-by: Per Goncalves da Silva <[email protected]>
1 parent d863e7c commit d8a76e9

File tree

14 files changed

+315
-268
lines changed

14 files changed

+315
-268
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ require (
2828
k8s.io/client-go v0.27.7
2929
k8s.io/code-generator v0.27.7
3030
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
31-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
31+
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0
3232
sigs.k8s.io/controller-runtime v0.15.0
3333
sigs.k8s.io/controller-tools v0.8.0
3434
)

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,8 +1479,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3
14791479
k8s.io/kubectl v0.27.7 h1:HTEDa4s/oWjB3t5ysdW1yKlcNl9bzigcqWBq0LIIe3k=
14801480
k8s.io/kubectl v0.27.7/go.mod h1:Xb1Ubc8uN1i2RvSN1HCgSHTtzgX0woihMk/gW7XbjJU=
14811481
k8s.io/utils v0.0.0-20200324210504-a9aa75ae1b89/go.mod h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew=
1482-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 h1:kmDqav+P+/5e1i9tFfHq1qcF3sOrDp+YEkVDAHu7Jwk=
1483-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
1482+
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak=
1483+
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
14841484
oras.land/oras-go v1.2.4 h1:djpBY2/2Cs1PV87GSJlxv4voajVOMZxqqtq9AB8YNvY=
14851485
oras.land/oras-go v1.2.4/go.mod h1:DYcGfb3YF1nKjcezfX2SNlDAeQFKSXmf+qrFmrh4324=
14861486
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=

staging/operator-lifecycle-manager/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ FROM quay.io/fedora/fedora:37-x86_64 as builder
22
LABEL stage=builder
33
WORKDIR /build
44

5-
# install dependencies and go 1.16
5+
# install dependencies and go 1.21
66

77
# copy just enough of the git repo to parse HEAD, used to record version in OLM binaries
88
RUN dnf update -y && dnf install -y bash make git mercurial jq wget && dnf upgrade -y
9-
RUN curl -sSL https://go.dev/dl/go1.20.linux-amd64.tar.gz | tar -xzf - -C /usr/local
9+
RUN curl -sSL https://go.dev/dl/go1.21.9.linux-amd64.tar.gz | tar -xzf - -C /usr/local
1010
ENV PATH=/usr/local/go/bin:$PATH
1111
COPY .git/HEAD .git/HEAD
1212
COPY .git/refs/heads/. .git/refs/heads

staging/operator-lifecycle-manager/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ require (
5252
k8s.io/klog/v2 v2.90.1
5353
k8s.io/kube-aggregator v0.25.3
5454
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
55-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
55+
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0
5656
sigs.k8s.io/controller-runtime v0.15.0
5757
sigs.k8s.io/controller-tools v0.8.0
5858
sigs.k8s.io/kind v0.20.0

staging/operator-lifecycle-manager/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,8 +1345,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f h1:2kWPakN3i/k81b0gvD5C5F
13451345
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3ebc/QwanvYwhuMWF6yz2F8uwW8eg=
13461346
k8s.io/kubectl v0.27.7 h1:HTEDa4s/oWjB3t5ysdW1yKlcNl9bzigcqWBq0LIIe3k=
13471347
k8s.io/kubectl v0.27.7/go.mod h1:Xb1Ubc8uN1i2RvSN1HCgSHTtzgX0woihMk/gW7XbjJU=
1348-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 h1:kmDqav+P+/5e1i9tFfHq1qcF3sOrDp+YEkVDAHu7Jwk=
1349-
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
1348+
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak=
1349+
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
13501350
oras.land/oras-go v1.2.4 h1:djpBY2/2Cs1PV87GSJlxv4voajVOMZxqqtq9AB8YNvY=
13511351
oras.land/oras-go v1.2.4/go.mod h1:DYcGfb3YF1nKjcezfX2SNlDAeQFKSXmf+qrFmrh4324=
13521352
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=

staging/operator-lifecycle-manager/pkg/controller/registry/reconciler/grpc.go

Lines changed: 67 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,22 @@ package reconciler
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
7+
68
"strings"
79
"time"
810

911
"github.com/google/go-cmp/cmp"
1012
"github.com/operator-framework/api/pkg/operators/v1alpha1"
11-
"github.com/pkg/errors"
13+
pkgerrors "github.com/pkg/errors"
1214
"github.com/sirupsen/logrus"
1315
corev1 "k8s.io/api/core/v1"
1416
apierrors "k8s.io/apimachinery/pkg/api/errors"
1517
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1618
"k8s.io/apimachinery/pkg/labels"
1719
"k8s.io/apimachinery/pkg/util/intstr"
20+
"k8s.io/utils/ptr"
1821

1922
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
2023
controllerclient "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/controller-runtime/client"
@@ -262,7 +265,7 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
262265
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
263266
sa, err := c.ensureSA(source)
264267
if err != nil && !apierrors.IsAlreadyExists(err) {
265-
return errors.Wrapf(err, "error ensuring service account: %s", source.GetName())
268+
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.GetName())
266269
}
267270

268271
sa, err = c.OpClient.GetServiceAccount(sa.GetNamespace(), sa.GetName())
@@ -285,20 +288,20 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
285288
return err
286289
}
287290
if err := c.ensurePod(logger, source, sa, overwritePod); err != nil {
288-
return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
291+
return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
289292
}
290293
if err := c.ensureUpdatePod(logger, sa, source); err != nil {
291294
if _, ok := err.(UpdateNotReadyErr); ok {
292295
return err
293296
}
294-
return errors.Wrapf(err, "error ensuring updated catalog source pod: %s", pod.GetName())
297+
return pkgerrors.Wrapf(err, "error ensuring updated catalog source pod: %s", pod.GetName())
295298
}
296299
service, err := source.Service()
297300
if err != nil {
298301
return err
299302
}
300303
if err := c.ensureService(source, overwrite); err != nil {
301-
return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
304+
return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
302305
}
303306

304307
if overwritePod {
@@ -338,16 +341,41 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
338341
}
339342

340343
func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCatalogSourceDecorator, serviceAccount *corev1.ServiceAccount, overwrite bool) error {
341-
// currentLivePods refers to the currently live instances of the catalog source
342-
currentLivePods := c.currentPods(logger, source)
343-
if len(currentLivePods) > 0 {
344+
// currentPods refers to the current pod instances of the catalog source
345+
currentPods := c.currentPods(logger, source)
346+
347+
var forceDeleteErrs []error
348+
// Remove dead pods from the slice without allocating a new slice
349+
// See https://go.dev/wiki/SliceTricks#filtering-without-allocating
350+
tmpSlice := currentPods[:0]
351+
for _, pod := range currentPods {
352+
if !isPodDead(pod) {
353+
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Debug("pod is alive")
354+
tmpSlice = append(tmpSlice, pod)
355+
continue
356+
}
357+
358+
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
359+
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
360+
GracePeriodSeconds: ptr.To[int64](0),
361+
}); err != nil && !apierrors.IsNotFound(err) {
362+
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
363+
}
364+
}
365+
currentPods = tmpSlice
366+
367+
if len(forceDeleteErrs) > 0 {
368+
return errors.Join(forceDeleteErrs...)
369+
}
370+
371+
if len(currentPods) > 0 {
344372
if !overwrite {
345373
return nil
346374
}
347-
for _, p := range currentLivePods {
375+
for _, p := range currentPods {
348376
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": p.GetName()}).Info("deleting current pod")
349377
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
350-
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
378+
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
351379
}
352380
}
353381
}
@@ -358,7 +386,7 @@ func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCata
358386
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": desiredPod.Namespace}).Info("deleting current pod")
359387
_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Create(context.TODO(), desiredPod, metav1.CreateOptions{})
360388
if err != nil {
361-
return errors.Wrapf(err, "error creating new pod: %s", desiredPod.GetGenerateName())
389+
return pkgerrors.Wrapf(err, "error creating new pod: %s", desiredPod.GetGenerateName())
362390
}
363391

364392
return nil
@@ -378,7 +406,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
378406
logger.Infof("catalog update required at %s", time.Now().String())
379407
pod, err := c.createUpdatePod(source, serviceAccount)
380408
if err != nil {
381-
return errors.Wrapf(err, "creating update catalog source pod")
409+
return pkgerrors.Wrapf(err, "creating update catalog source pod")
382410
}
383411
source.SetLastUpdateTime()
384412
return UpdateNotReadyErr{catalogName: source.GetName(), podName: pod.GetName()}
@@ -410,7 +438,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
410438
for _, p := range currentLivePods {
411439
logger.WithFields(logrus.Fields{"live-pod.namespace": source.GetNamespace(), "live-pod.name": p.Name}).Info("deleting current live pods")
412440
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
413-
return errors.Wrapf(errors.Wrapf(err, "error deleting pod: %s", p.GetName()), "detected imageID change: error deleting old catalog source pod")
441+
return pkgerrors.Wrapf(pkgerrors.Wrapf(err, "error deleting pod: %s", p.GetName()), "detected imageID change: error deleting old catalog source pod")
414442
}
415443
}
416444
// done syncing
@@ -420,7 +448,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
420448
// delete update pod right away, since the digest match, to prevent long-lived duplicate catalog pods
421449
logger.WithFields(logrus.Fields{"update-pod.namespace": updatePod.Namespace, "update-pod.name": updatePod.Name}).Debug("catalog polling result: no update; removing duplicate update pod")
422450
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), updatePod.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
423-
return errors.Wrapf(errors.Wrapf(err, "error deleting pod: %s", updatePod.GetName()), "duplicate catalog polling pod")
451+
return pkgerrors.Wrapf(pkgerrors.Wrapf(err, "error deleting pod: %s", updatePod.GetName()), "duplicate catalog polling pod")
424452
}
425453
}
426454

@@ -523,6 +551,29 @@ func imageChanged(logger *logrus.Entry, updatePod *corev1.Pod, servingPods []*co
523551
return false
524552
}
525553

554+
func isPodDead(pod *corev1.Pod) bool {
555+
for _, check := range []func(*corev1.Pod) bool{
556+
isPodDeletedByTaintManager,
557+
} {
558+
if check(pod) {
559+
return true
560+
}
561+
}
562+
return false
563+
}
564+
565+
func isPodDeletedByTaintManager(pod *corev1.Pod) bool {
566+
if pod.DeletionTimestamp == nil {
567+
return false
568+
}
569+
for _, condition := range pod.Status.Conditions {
570+
if condition.Type == corev1.DisruptionTarget && condition.Reason == "DeletionByTaintManager" && condition.Status == corev1.ConditionTrue {
571+
return true
572+
}
573+
}
574+
return false
575+
}
576+
526577
// imageID returns the ImageID of the primary catalog source container or an empty string if the image ID isn't available yet.
527578
// Note: the pod must be running and the container in a ready status to return a valid ImageID.
528579
func imageID(pod *corev1.Pod) string {
@@ -545,7 +596,7 @@ func imageID(pod *corev1.Pod) string {
545596
func (c *GrpcRegistryReconciler) removePods(pods []*corev1.Pod, namespace string) error {
546597
for _, p := range pods {
547598
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(namespace).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
548-
return errors.Wrapf(err, "error deleting pod: %s", p.GetName())
599+
return pkgerrors.Wrapf(err, "error deleting pod: %s", p.GetName())
549600
}
550601
}
551602
return nil
@@ -623,7 +674,7 @@ func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
623674
logrus.WithField("UpdatePod", pod.GetName()).Infof("catalog polling result: update pod %s failed to start", pod.GetName())
624675
err := c.removePods([]*corev1.Pod{pod}, pod.GetNamespace())
625676
if err != nil {
626-
return true, errors.Wrapf(err, "error deleting failed catalog polling pod: %s", pod.GetName())
677+
return true, pkgerrors.Wrapf(err, "error deleting failed catalog polling pod: %s", pod.GetName())
627678
}
628679
return true, nil
629680
}

0 commit comments

Comments
 (0)