@@ -3,18 +3,20 @@ package reconciler
 
 import (
 	"context"
+	"errors"
 	"fmt"
 
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
 	hashutil "github.com/operator-framework/operator-lifecycle-manager/pkg/lib/kubernetes/pkg/util/hash"
-	"github.com/pkg/errors"
+	pkgerrors "github.com/pkg/errors"
 	"github.com/sirupsen/logrus"
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
 
 	"github.com/operator-framework/api/pkg/operators/v1alpha1"
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
@@ -322,27 +324,27 @@ func (c *ConfigMapRegistryReconciler) EnsureRegistryServer(logger *logrus.Entry,
 
 	//TODO: if any of these error out, we should write a status back (possibly set RegistryServiceStatus to nil so they get recreated)
 	if err := c.ensureServiceAccount(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
+		return pkgerrors.Wrapf(err, "error ensuring service account: %s", source.serviceAccountName())
 	}
 	if err := c.ensureRole(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring role: %s", source.roleName())
+		return pkgerrors.Wrapf(err, "error ensuring role: %s", source.roleName())
 	}
 	if err := c.ensureRoleBinding(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
+		return pkgerrors.Wrapf(err, "error ensuring rolebinding: %s", source.RoleBinding().GetName())
 	}
 	pod, err := source.Pod(image)
 	if err != nil {
 		return err
 	}
 	if err := c.ensurePod(source, overwritePod); err != nil {
-		return errors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
+		return pkgerrors.Wrapf(err, "error ensuring pod: %s", pod.GetName())
 	}
 	service, err := source.Service()
 	if err != nil {
 		return err
 	}
 	if err := c.ensureService(source, overwrite); err != nil {
-		return errors.Wrapf(err, "error ensuring service: %s", service.GetName())
+		return pkgerrors.Wrapf(err, "error ensuring service: %s", service.GetName())
 	}
 
 	if overwritePod {
@@ -415,15 +417,15 @@ func (c *ConfigMapRegistryReconciler) ensurePod(source configMapCatalogSourceDec
 		}
 		for _, p := range currentPods {
 			if err := c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Delete(context.TODO(), p.GetName(), *metav1.NewDeleteOptions(1)); err != nil && !apierrors.IsNotFound(err) {
-				return errors.Wrapf(err, "error deleting old pod: %s", p.GetName())
+				return pkgerrors.Wrapf(err, "error deleting old pod: %s", p.GetName())
 			}
 		}
 	}
 	_, err = c.OpClient.KubernetesInterface().CoreV1().Pods(pod.GetNamespace()).Create(context.TODO(), pod, metav1.CreateOptions{})
 	if err == nil {
 		return nil
 	}
-	return errors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
+	return pkgerrors.Wrapf(err, "error creating new pod: %s", pod.GetGenerateName())
 }
 
 func (c *ConfigMapRegistryReconciler) ensureService(source configMapCatalogSourceDecorator, overwrite bool) error {
@@ -502,6 +504,34 @@ func (c *ConfigMapRegistryReconciler) CheckRegistryServer(logger *logrus.Entry,
 		return
 	}
 
-	healthy = true
-	return
+	podsAreLive, e := detectAndDeleteDeadPods(logger, c.OpClient, pods, source.GetNamespace())
+	if e != nil {
+		return false, fmt.Errorf("error deleting dead pods: %v", e)
+	}
+	return podsAreLive, nil
+}
+
+// detectAndDeleteDeadPods determines if there are registry client pods that are in the deleted state
+// but have not been removed by GC (eg the node goes down before GC can remove them), and attempts to
+// force delete the pods. If there are live registry pods remaining, it returns true, otherwise returns false.
+func detectAndDeleteDeadPods(logger *logrus.Entry, client operatorclient.ClientInterface, pods []*corev1.Pod, sourceNamespace string) (bool, error) {
+	var forceDeletionErrs []error
+	livePodFound := false
+	for _, pod := range pods {
+		if !isPodDead(pod) {
+			livePodFound = true
+			logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Debug("pod is alive")
+			continue
+		}
+		logger.WithFields(logrus.Fields{"pod.namespace": sourceNamespace, "pod.name": pod.GetName()}).Info("force deleting dead pod")
+		if err := client.KubernetesInterface().CoreV1().Pods(sourceNamespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
+			GracePeriodSeconds: ptr.To[int64](0),
+		}); err != nil && !apierrors.IsNotFound(err) {
+			forceDeletionErrs = append(forceDeletionErrs, err)
+		}
+	}
+	if len(forceDeletionErrs) > 0 {
+		return false, errors.Join(forceDeletionErrs...)
+	}
+	return livePodFound, nil
 }
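
For context, here is a minimal, self-contained sketch of the force-deletion pattern the new detectAndDeleteDeadPods helper relies on (grace period zero plus errors.Join aggregation), exercised against a fake clientset. The isDead predicate and the reapDeadPods function below are illustrative assumptions, not OLM's actual isPodDead logic or the helper itself; only the Delete call shape and error joining mirror the diff.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/utils/ptr"
)

// isDead is a hypothetical stand-in predicate: the pod was marked for
// deletion but never actually went away (e.g. its node disappeared).
func isDead(pod *corev1.Pod) bool {
	return pod.GetDeletionTimestamp() != nil
}

// reapDeadPods force-deletes dead pods with a zero grace period and reports
// whether any live pod remains, aggregating per-pod failures via errors.Join.
func reapDeadPods(client kubernetes.Interface, namespace string, pods []*corev1.Pod) (bool, error) {
	var deleteErrs []error
	live := false
	for _, pod := range pods {
		if !isDead(pod) {
			live = true
			continue
		}
		// GracePeriodSeconds of 0 asks the API server to remove the pod immediately.
		if err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.GetName(), metav1.DeleteOptions{
			GracePeriodSeconds: ptr.To[int64](0),
		}); err != nil {
			deleteErrs = append(deleteErrs, err)
		}
	}
	// errors.Join returns nil when the slice is empty.
	return live, errors.Join(deleteErrs...)
}

func main() {
	alive := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "registry-live", Namespace: "olm"}}
	dead := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:              "registry-dead",
		Namespace:         "olm",
		DeletionTimestamp: &metav1.Time{Time: time.Now()},
	}}
	client := fake.NewSimpleClientset(alive, dead)
	liveFound, err := reapDeadPods(client, "olm", []*corev1.Pod{alive, dead})
	fmt.Println(liveFound, err) // true <nil>
}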