OCPBUGS-35305: [release-4.15] catalog-operator: delete catalog pods stuck in Terminating state due to unreachable node #779

Merged: 1 commit, Jun 20, 2024
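When a node becomes unreachable, the taint manager marks the catalog pod for deletion, but the pod can sit in Terminating indefinitely because the kubelet never confirms the delete, and the catalog-operator never recreates the registry server. The change below detects such pods and force-deletes them with a zero grace period so a replacement can be scheduled. The following is a condensed sketch of that logic outside the reconciler: the helper names, package name, and the kubernetes.Interface parameter are illustrative, while the condition check and the delete call follow the diff.

```go
// Condensed sketch of the fix; the real code lives in GrpcRegistryReconciler below.
package reconcilersketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/utils/ptr"
)

// deletedByTaintManager reports whether a terminating pod was marked for
// deletion by the taint manager (for example because its node went unreachable).
func deletedByTaintManager(pod *corev1.Pod) bool {
	if pod.DeletionTimestamp == nil {
		return false
	}
	for _, cond := range pod.Status.Conditions {
		if cond.Type == corev1.DisruptionTarget &&
			cond.Reason == "DeletionByTaintManager" &&
			cond.Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}

// forceDeleteDeadPods force-deletes any such pods with a zero grace period so
// the reconciler can create replacements on a healthy node.
func forceDeleteDeadPods(ctx context.Context, client kubernetes.Interface, pods []*corev1.Pod) error {
	for _, pod := range pods {
		if !deletedByTaintManager(pod) {
			continue
		}
		err := client.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{
			GracePeriodSeconds: ptr.To[int64](0),
		})
		if err != nil && !apierrors.IsNotFound(err) {
			return err
		}
	}
	return nil
}
```

In the actual change this check is folded into ensurePod as isPodDead and isPodDeletedByTaintManager, and delete failures are aggregated with errors.Join before being returned.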
go.mod (2 changes: 1 addition & 1 deletion)
@@ -28,7 +28,7 @@ require (
k8s.io/client-go v0.27.7
k8s.io/code-generator v0.27.7
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0
sigs.k8s.io/controller-runtime v0.15.0
sigs.k8s.io/controller-tools v0.8.0
)
go.sum (4 changes: 2 additions & 2 deletions)
@@ -1479,8 +1479,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3
k8s.io/kubectl v0.27.7 h1:HTEDa4s/oWjB3t5ysdW1yKlcNl9bzigcqWBq0LIIe3k=
k8s.io/kubectl v0.27.7/go.mod h1:Xb1Ubc8uN1i2RvSN1HCgSHTtzgX0woihMk/gW7XbjJU=
k8s.io/utils v0.0.0-20200324210504-a9aa75ae1b89/go.mod h1:sZAwmy6armz5eXlNoLmJcl4F1QuKu7sr+mFQ0byX7Ew=
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 h1:kmDqav+P+/5e1i9tFfHq1qcF3sOrDp+YEkVDAHu7Jwk=
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak=
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
oras.land/oras-go v1.2.4 h1:djpBY2/2Cs1PV87GSJlxv4voajVOMZxqqtq9AB8YNvY=
oras.land/oras-go v1.2.4/go.mod h1:DYcGfb3YF1nKjcezfX2SNlDAeQFKSXmf+qrFmrh4324=
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
staging/operator-lifecycle-manager/Dockerfile (4 changes: 2 additions & 2 deletions)
@@ -2,11 +2,11 @@ FROM quay.io/fedora/fedora:37-x86_64 as builder
LABEL stage=builder
WORKDIR /build

# install dependencies and go 1.16
# install dependencies and go 1.21

# copy just enough of the git repo to parse HEAD, used to record version in OLM binaries
RUN dnf update -y && dnf install -y bash make git mercurial jq wget && dnf upgrade -y
RUN curl -sSL https://go.dev/dl/go1.20.linux-amd64.tar.gz | tar -xzf - -C /usr/local
RUN curl -sSL https://go.dev/dl/go1.21.9.linux-amd64.tar.gz | tar -xzf - -C /usr/local
ENV PATH=/usr/local/go/bin:$PATH
COPY .git/HEAD .git/HEAD
COPY .git/refs/heads/. .git/refs/heads
staging/operator-lifecycle-manager/go.mod (2 changes: 1 addition & 1 deletion)
@@ -52,7 +52,7 @@ require (
k8s.io/klog/v2 v2.90.1
k8s.io/kube-aggregator v0.25.3
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0
sigs.k8s.io/controller-runtime v0.15.0
sigs.k8s.io/controller-tools v0.8.0
sigs.k8s.io/kind v0.20.0
staging/operator-lifecycle-manager/go.sum (4 changes: 2 additions & 2 deletions)
@@ -1345,8 +1345,8 @@ k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f h1:2kWPakN3i/k81b0gvD5C5F
k8s.io/kube-openapi v0.0.0-20230501164219-8b0f38b5fd1f/go.mod h1:byini6yhqGC14c3ebc/QwanvYwhuMWF6yz2F8uwW8eg=
k8s.io/kubectl v0.27.7 h1:HTEDa4s/oWjB3t5ysdW1yKlcNl9bzigcqWBq0LIIe3k=
k8s.io/kubectl v0.27.7/go.mod h1:Xb1Ubc8uN1i2RvSN1HCgSHTtzgX0woihMk/gW7XbjJU=
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 h1:kmDqav+P+/5e1i9tFfHq1qcF3sOrDp+YEkVDAHu7Jwk=
k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 h1:jgGTlFYnhF1PM1Ax/lAlxUPE+KfCIXHaathvJg1C3ak=
k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
oras.land/oras-go v1.2.4 h1:djpBY2/2Cs1PV87GSJlxv4voajVOMZxqqtq9AB8YNvY=
oras.land/oras-go v1.2.4/go.mod h1:DYcGfb3YF1nKjcezfX2SNlDAeQFKSXmf+qrFmrh4324=
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
@@ -2,19 +2,22 @@ package reconciler

import (
"context"
"errors"
"fmt"

"strings"
"time"

"github.com/google/go-cmp/cmp"
"github.com/operator-framework/api/pkg/operators/v1alpha1"
"github.com/pkg/errors"
pkgerrors "github.com/pkg/errors"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"

"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
controllerclient "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/controller-runtime/client"
@@ -262,7 +265,7 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
sa, err := c.ensureSA(source)
if err != nil && !apierrors.IsAlreadyExists(err) {
return errors.Wrapf(err, "error ensuring service account: %s", source.GetName())
return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.GetName())
}

sa, err = c.OpClient.GetServiceAccount(sa.GetNamespace(), sa.GetName())
@@ -285,20 +288,20 @@ func (c *GrpcRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry, cata
return err
}
if err := c.ensurePod(logger, source, sa, overwritePod); err != nil {
return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
}
if err := c.ensureUpdatePod(logger, sa, source); err != nil {
if _, ok := err.(UpdateNotReadyErr); ok {
return err
}
return errors.Wrapf(err, "error ensuring updated catalog source pod: %s", pod.GetName())
return pkgerrors.Wrapf(err, "error ensuring updated catalog source pod: %s", pod.GetName())
}
service, err := source.Service()
if err != nil {
return err
}
if err := c.ensureService(source, overwrite); err != nil {
return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
}

if overwritePod {
@@ -338,16 +341,41 @@ func isRegistryServiceStatusValid(source *grpcCatalogSourceDecorator) (bool, err
}

func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCatalogSourceDecorator, serviceAccount *corev1.ServiceAccount, overwrite bool) error {
// currentLivePods refers to the currently live instances of the catalog source
currentLivePods := c.currentPods(logger, source)
if len(currentLivePods) > 0 {
// currentPods refers to the current pod instances of the catalog source
currentPods := c.currentPods(logger, source)

var forceDeleteErrs []error
// Remove dead pods from the slice without allocating a new slice
// See https://go.dev/wiki/SliceTricks#filtering-without-allocating
tmpSlice := currentPods[:0]
for _, pod := range currentPods {
if !isPodDead(pod) {
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Debug("pod is alive")
tmpSlice = append(tmpSlice, pod)
continue
}

logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": pod.GetName()}).Info("force deleting dead pod")
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
GracePeriodSeconds: ptr.To[int64](0),
}); err != nil && !apierrors.IsNotFound(err) {
forceDeleteErrs = append(forceDeleteErrs, pkgerrors.Wrapf(err, "error deleting old pod: %s", pod.GetName()))
}
}
currentPods = tmpSlice

if len(forceDeleteErrs) > 0 {
return errors.Join(forceDeleteErrs...)
}

if len(currentPods) > 0 {
if !overwrite {
return nil
}
for _, p := range currentLivePods {
for _, p := range currentPods {
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": p.GetName()}).Info("deleting current pod")
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
}
}
}
@@ -358,7 +386,7 @@ func (c *GrpcRegistryReconciler) ensurePod(logger *logrus.Entry, source grpcCata
logger.WithFields(logrus.Fields{"pod.namespace": source.GetNamespace(), "pod.name": desiredPod.Namespace}).Info("deleting current pod")
_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Create(context.TODO(), desiredPod, metav1.CreateOptions{})
if err != nil {
return errors.Wrapf(err, "error creating new pod: %s", desiredPod.GetGenerateName())
return pkgerrors.Wrapf(err, "error creating new pod: %s", desiredPod.GetGenerateName())
}

return nil
@@ -378,7 +406,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
logger.Infof("catalog update required at %s", time.Now().String())
pod, err := c.createUpdatePod(source, serviceAccount)
if err != nil {
return errors.Wrapf(err, "creating update catalog source pod")
return pkgerrors.Wrapf(err, "creating update catalog source pod")
}
source.SetLastUpdateTime()
return UpdateNotReadyErr{catalogName: source.GetName(), podName: pod.GetName()}
@@ -410,7 +438,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
for _, p := range currentLivePods {
logger.WithFields(logrus.Fields{"live-pod.namespace": source.GetNamespace(), "live-pod.name": p.Name}).Info("deleting current live pods")
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(errors.Wrapf(err, "error deleting pod: %s", p.GetName()), "detected imageID change: error deleting old catalog source pod")
return pkgerrors.Wrapf(pkgerrors.Wrapf(err, "error deleting pod: %s", p.GetName()), "detected imageID change: error deleting old catalog source pod")
}
}
// done syncing
@@ -420,7 +448,7 @@ func (c *GrpcRegistryReconciler) ensureUpdatePod(logger *logrus.Entry, serviceAc
// delete update pod right away, since the digest match, to prevent long-lived duplicate catalog pods
logger.WithFields(logrus.Fields{"update-pod.namespace": updatePod.Namespace, "update-pod.name": updatePod.Name}).Debug("catalog polling result: no update; removing duplicate update pod")
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(source.GetNamespace()).Delete(context.TODO(), updatePod.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(errors.Wrapf(err, "error deleting pod: %s", updatePod.GetName()), "duplicate catalog polling pod")
return pkgerrors.Wrapf(pkgerrors.Wrapf(err, "error deleting pod: %s", updatePod.GetName()), "duplicate catalog polling pod")
}
}

@@ -523,6 +551,29 @@ func imageChanged(logger *logrus.Entry, updatePod *corev1.Pod, servingPods []*co
return false
}

func isPodDead(pod *corev1.Pod) bool {
for _, check := range []func(*corev1.Pod) bool{
isPodDeletedByTaintManager,
} {
if check(pod) {
return true
}
}
return false
}

func isPodDeletedByTaintManager(pod *corev1.Pod) bool {
if pod.DeletionTimestamp == nil {
return false
}
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.DisruptionTarget && condition.Reason == "DeletionByTaintManager" && condition.Status == corev1.ConditionTrue {
return true
}
}
return false
}

// imageID returns the ImageID of the primary catalog source container or an empty string if the image ID isn't available yet.
// Note: the pod must be running and the container in a ready status to return a valid ImageID.
func imageID(pod *corev1.Pod) string {
@@ -545,7 +596,7 @@ func imageID(pod *corev1.Pod) string {
func (c *GrpcRegistryReconciler) removePods(pods []*corev1.Pod, namespace string) error {
for _, p := range pods {
if err := c.OpClient.KubernetesInterface().CoreV1().Pods(namespace).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "error deleting pod: %s", p.GetName())
return pkgerrors.Wrapf(err, "error deleting pod: %s", p.GetName())
}
}
return nil
@@ -623,7 +674,7 @@ func (c *GrpcRegistryReconciler) podFailed(pod *corev1.Pod) (bool, error) {
logrus.WithField("UpdatePod", pod.GetName()).Infof("catalog polling result: update pod %s failed to start", pod.GetName())
err := c.removePods([]*corev1.Pod{pod}, pod.GetNamespace())
if err != nil {
return true, errors.Wrapf(err, "error deleting failed catalog polling pod: %s", pod.GetName())
return true, pkgerrors.Wrapf(err, "error deleting failed catalog polling pod: %s", pod.GetName())
}
return true, nil
}
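A side note on the dead-pod filter added in ensurePod: it uses the in-place filtering idiom from the Go wiki's SliceTricks page linked in the diff's comment, re-slicing to zero length so surviving entries are appended over the old ones in the same backing array instead of allocating a new slice. A minimal, generic illustration under that assumption (the function name is made up):

```go
// filterInPlace keeps only the elements for which keep returns true, reusing
// s's backing array so no new slice is allocated.
func filterInPlace[T any](s []T, keep func(T) bool) []T {
	out := s[:0] // shares s's backing array
	for _, v := range s {
		if keep(v) {
			out = append(out, v)
		}
	}
	return out
}
```

The trick preserves capacity, so the tail of the original backing array still references the filtered-out elements until it is overwritten; for a short-lived local slice like the one in ensurePod, that is harmless.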