
Commit 260bc91

Check for bundle unpack failure via status conditions
Instead of checking for a specific error type when the bundle unpack job fails, a new BundleLookupFailed condition is set to indicate a failed bundle lookup, and the InstallPlan is transitioned to Failed based on the presence of this condition. While the InstallPlan is waiting on a BundleLookupPending condition, the message for that condition is also updated with the init container statuses of the unpack pods, since those can surface image pull errors when the lookup is stalled on a non-existent bundle image. The bundle unpack Job's BackoffLimit is also lowered to fail fast on repeated crashes, and the pod restart policy is set to Never to preserve the container logs after the Job is terminated via the backoff limit.
1 parent 048cdb4 commit 260bc91
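
The catalog operator's InstallPlan sync keys off the new condition rather than a sentinel error type; that consumer side is presumably in one of the changed files not shown on this page. A minimal sketch of the check, assuming the OLM API types live under github.com/operator-framework/api/pkg/operators/v1alpha1 and using a helper name of our own choosing:

import (
	corev1 "k8s.io/api/core/v1"
	operatorsv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1"
)

// bundleLookupFailed reports whether any bundle lookup recorded on the
// InstallPlan carries a true BundleLookupFailed condition; the operator can
// then move the plan to the Failed phase instead of retrying the unpack.
func bundleLookupFailed(plan *operatorsv1alpha1.InstallPlan) bool {
	for _, lookup := range plan.Status.BundleLookups {
		// "BundleLookupFailed" is the condition type added in bundle_unpacker.go below.
		if lookup.GetCondition("BundleLookupFailed").Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}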

File tree

6 files changed (+189, -79 lines)


cmd/catalog/main.go

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ var (
 		"profiling", false, "serve profiling data (on port 8080)")

 	installPlanTimeout  = flag.Duration("install-plan-retry-timeout", 1*time.Minute, "time since first attempt at which plan execution errors are considered fatal")
-	bundleUnpackTimeout = flag.Duration("bundle-unpack-timeout", catalog.DefaultBundleUnpackTimeout, "The time duration after which the bundle unpack Job for an installplan will be aborted and the installplan will be Failed. 0 is considered as having no timeout.")
+	bundleUnpackTimeout = flag.Duration("bundle-unpack-timeout", 10*time.Minute, "The time duration after which the bundle unpack Job for an installplan will be aborted and the installplan will be Failed. 0 is considered as having no timeout.")
 )

 func init() {
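
The flag only sets the upper bound; per its help text a zero duration disables the deadline entirely. A small sketch of how the value plausibly maps onto the unpack Job, assuming wiring along the lines of the ActiveDeadlineSeconds assignment visible in the bundle_unpacker.go hunks below (the helper name is ours, not the commit's):

import (
	"time"

	batchv1 "k8s.io/api/batch/v1"
)

// applyUnpackTimeout sets the unpack Job's deadline from the configured
// --bundle-unpack-timeout value; a zero (or negative) duration leaves
// ActiveDeadlineSeconds unset, i.e. no timeout.
func applyUnpackTimeout(job *batchv1.Job, timeout time.Duration) {
	if timeout <= 0 {
		return
	}
	t := int64(timeout.Seconds())
	job.Spec.ActiveDeadlineSeconds = &t
}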

pkg/controller/bundle/bundle_unpacker.go

Lines changed: 112 additions & 11 deletions
@@ -15,6 +15,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	k8slabels "k8s.io/apimachinery/pkg/labels"
 	"k8s.io/client-go/kubernetes"
 	listersbatchv1 "k8s.io/client-go/listers/batch/v1"
 	listerscorev1 "k8s.io/client-go/listers/core/v1"
@@ -26,6 +27,12 @@ import (
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver/projection"
 )

+const (
+	// TODO: Move to operator-framework/api/pkg/operators/v1alpha1
+	// BundleLookupFailed describes conditions types for when BundleLookups fail
+	BundleLookupFailed operatorsv1alpha1.BundleLookupConditionType = "BundleLookupFailed"
+)
+
 type BundleUnpackResult struct {
 	*operatorsv1alpha1.BundleLookup

@@ -76,7 +83,12 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
 			Name: cmRef.Name,
 		},
 		Spec: corev1.PodSpec{
-			RestartPolicy:    corev1.RestartPolicyOnFailure,
+			// With restartPolicy = "OnFailure" when the spec.backoffLimit is reached, the job controller will delete all
+			// the job's pods to stop them from crashlooping forever.
+			// By setting restartPolicy = "Never" the pods don't get cleaned up since they're not running after a failure.
+			// Keeping the pods around after failures helps in inspecting the logs of a failed bundle unpack job.
+			// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
+			RestartPolicy:    corev1.RestartPolicyNever,
 			ImagePullSecrets: secrets,
 			Containers: []corev1.Container{
 				{
@@ -172,6 +184,15 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
 		job.Spec.ActiveDeadlineSeconds = &t
 	}

+	// By default the BackoffLimit is set to 6 which with exponential backoff 10s + 20s + 40s ...
+	// translates to ~10m of waiting time.
+	// We want to fail faster than that when we have repeated failures from the bundle unpack pod
+	// so we set it to 3 which is ~1m of waiting time
+	// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
+	// TODO (haseeb): Should this be configurable as well?
+	backOffLimit := int32(3)
+	job.Spec.BackoffLimit = &backOffLimit
+
 	return job
 }
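
Taken together, the two hunks above change how the unpack Job behaves on repeated failures: it gives up after three retries (roughly 10s + 20s + 40s of backoff, about a minute) and keeps the failed pods around for log inspection. A condensed sketch of just those knobs, with a helper name of our own (the real job() also configures volumes, init containers, resource requirements, and so on):

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// failFastJobSpec shows only the retry/restart knobs set by this commit.
func failFastJobSpec() batchv1.JobSpec {
	backoffLimit := int32(3) // fail after ~1m of exponential backoff instead of the default ~10m
	return batchv1.JobSpec{
		BackoffLimit: &backoffLimit,
		Template: corev1.PodTemplateSpec{
			Spec: corev1.PodSpec{
				// Never rather than OnFailure, so failed pods (and their logs)
				// survive the Job hitting its backoff limit.
				RestartPolicy: corev1.RestartPolicyNever,
			},
		},
	}
}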

@@ -188,6 +209,7 @@ type ConfigMapUnpacker struct {
 	csLister   listersoperatorsv1alpha1.CatalogSourceLister
 	cmLister   listerscorev1.ConfigMapLister
 	jobLister  listersbatchv1.JobLister
+	podLister  listerscorev1.PodLister
 	roleLister listersrbacv1.RoleLister
 	rbLister   listersrbacv1.RoleBindingLister
 	loader     *configmap.BundleLoader
@@ -252,6 +274,12 @@ func WithJobLister(jobLister listersbatchv1.JobLister) ConfigMapUnpackerOption {
 	}
 }

+func WithPodLister(podLister listerscorev1.PodLister) ConfigMapUnpackerOption {
+	return func(unpacker *ConfigMapUnpacker) {
+		unpacker.podLister = podLister
+	}
+}
+
 func WithRoleLister(roleLister listersrbacv1.RoleLister) ConfigMapUnpackerOption {
 	return func(unpacker *ConfigMapUnpacker) {
 		unpacker.roleLister = roleLister
@@ -290,6 +318,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
 		err = fmt.Errorf("configmap lister is nil")
 	case c.jobLister == nil:
 		err = fmt.Errorf("job lister is nil")
+	case c.podLister == nil:
+		err = fmt.Errorf("pod lister is nil")
 	case c.roleLister == nil:
 		err = fmt.Errorf("role lister is nil")
 	case c.rbLister == nil:
@@ -306,6 +336,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
 const (
 	CatalogSourceMissingReason  = "CatalogSourceMissing"
 	CatalogSourceMissingMessage = "referenced catalogsource not found"
+	JobFailedReason             = "JobFailed"
+	JobFailedMessage            = "unpack job has failed"
 	JobIncompleteReason         = "JobIncomplete"
 	JobIncompleteMessage        = "unpack job not completed"
 	JobNotStartedReason         = "JobNotStarted"
@@ -317,11 +349,21 @@ const (
 func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup) (result *BundleUnpackResult, err error) {
 	result = newBundleUnpackResult(lookup)

+	// if bundle lookup failed condition already present, then there is nothing more to do
+	failedCond := result.GetCondition(BundleLookupFailed)
+	if failedCond.Status == corev1.ConditionTrue {
+		return result, nil
+	}
+
 	// if pending condition is missing, bundle has already been unpacked
 	cond := result.GetCondition(operatorsv1alpha1.BundleLookupPending)
 	if cond.Status == corev1.ConditionUnknown {
 		return result, nil
 	}
+	// if pending condition is false, bundle unpack has already failed so nothing more to do
+	if cond.Status == corev1.ConditionFalse {
+		return result, nil
+	}

 	now := c.now()

@@ -379,23 +421,49 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
 	// Return a BundleJobError so we can mark the InstallPlan as Failed
 	isFailed, jobCond := jobConditionTrue(job, batchv1.JobFailed)
 	if isFailed {
-		cond.Status = corev1.ConditionTrue
-		cond.Reason = jobCond.Reason
-		cond.Message = jobCond.Message
+		// Add the BundleLookupFailed condition with the message and reason from the job failure
+		failedCond.Status = corev1.ConditionTrue
+		failedCond.Reason = jobCond.Reason
+		failedCond.Message = jobCond.Message
+		failedCond.LastTransitionTime = &now
+		result.SetCondition(failedCond)
+
+		// BundleLookupPending is false with reason being job failed
+		cond.Status = corev1.ConditionFalse
+		cond.Reason = JobFailedReason
+		cond.Message = JobFailedMessage
 		cond.LastTransitionTime = &now
 		result.SetCondition(cond)

-		err = NewBundleJobError(fmt.Sprintf("Bundle extract Job failed with Reason: %v, and Message: %v", jobCond.Reason, jobCond.Message))
 		return
 	}

 	isComplete, _ := jobConditionTrue(job, batchv1.JobComplete)
-	if !isComplete && (cond.Status != corev1.ConditionTrue || cond.Reason != JobIncompleteReason) {
-		cond.Status = corev1.ConditionTrue
-		cond.Reason = JobIncompleteReason
-		cond.Message = JobIncompleteMessage
-		cond.LastTransitionTime = &now
-		result.SetCondition(cond)
+	if !isComplete {
+
+		// In the case of an image pull failure for a non-existent image the bundle unpack job
+		// can stay pending until the ActiveDeadlineSeconds timeout ~10m
+		// To indicate why it's pending we inspect the container statuses of the
+		// unpack Job pods to surface that information on the bundle lookup conditions
+		pendingMessage := JobIncompleteMessage
+		var pendingContainerStatusMsgs string
+		pendingContainerStatusMsgs, err = c.pendingContainerStatusMessages(job)
+		if err != nil {
+			return
+		}
+
+		if pendingContainerStatusMsgs != "" {
+			pendingMessage = pendingMessage + ": " + pendingContainerStatusMsgs
+		}
+
+		// Update BundleLookupPending condition if there are any changes
+		if cond.Status != corev1.ConditionTrue || cond.Reason != JobIncompleteReason || cond.Message != pendingMessage {
+			cond.Status = corev1.ConditionTrue
+			cond.Reason = JobIncompleteReason
+			cond.Message = pendingMessage
+			cond.LastTransitionTime = &now
+			result.SetCondition(cond)
+		}

 		return
 	}
@@ -423,6 +491,39 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
 	return
 }

+func (c *ConfigMapUnpacker) pendingContainerStatusMessages(job *batchv1.Job) (string, error) {
+	containerStatusMessages := ""
+	// List pods for unpack job
+	podLabel := map[string]string{"job-name": job.GetName()}
+	pods, listErr := c.podLister.Pods(job.GetNamespace()).List(k8slabels.SelectorFromSet(podLabel))
+	if listErr != nil {
+		return containerStatusMessages, fmt.Errorf("Failed to list pods for job(%s): %v", job.GetName(), listErr)
+	}
+
+	// Ideally there should be just 1 pod running but inspect all pods in the pending phase
+	// to see if any are stuck on an ImagePullBackOff or ErrImagePull error
+	for _, pod := range pods {
+		if pod.Status.Phase != corev1.PodPending {
+			// skip status check for non-pending pods
+			continue
+		}
+
+		for _, ic := range pod.Status.InitContainerStatuses {
+			if ic.Ready {
+				// only check non-ready containers for their waiting reasons
+				continue
+			}
+
+			// Aggregate the wait reasons for all pending containers
+			containerStatusMessages = containerStatusMessages +
+				fmt.Sprintf("Unpack pod(%s/%s) container(%s) is pending. Reason: %s, Message: %s \n",
+					pod.Namespace, pod.Name, ic.Name, ic.State.Waiting.Reason, ic.State.Waiting.Message)
+		}
+	}
+
+	return containerStatusMessages, nil
+}
+
 func (c *ConfigMapUnpacker) ensureConfigmap(csRef *corev1.ObjectReference, name string) (cm *corev1.ConfigMap, err error) {
 	fresh := &corev1.ConfigMap{}
 	fresh.SetNamespace(csRef.Namespace)
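
To see what the new pendingContainerStatusMessages helper surfaces in practice, consider an unpack pod stuck pulling a bundle image that does not exist. The pod, container, and image names below are made up for illustration; only the label selector and message format come from the code above:

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// examplePendingPod models an unpack Job pod whose init container is stuck in
// ImagePullBackOff; the Job controller labels such pods with "job-name", which
// is how pendingContainerStatusMessages finds them.
func examplePendingPod() *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "olm",
			Name:      "unpack-abc123-xyz",
			Labels:    map[string]string{"job-name": "unpack-abc123"},
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodPending,
			InitContainerStatuses: []corev1.ContainerStatus{{
				Name:  "pull",
				Ready: false,
				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{
					Reason:  "ImagePullBackOff",
					Message: `Back-off pulling image "quay.io/example/missing-bundle:v0.0.1"`,
				}},
			}},
		},
	}
}

With a pod like this behind the Job, the BundleLookupPending condition message becomes roughly:
unpack job not completed: Unpack pod(olm/unpack-abc123-xyz) container(pull) is pending. Reason: ImagePullBackOff, Message: Back-off pulling image "quay.io/example/missing-bundle:v0.0.1"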

pkg/controller/bundle/bundle_unpacker_test.go

Lines changed: 9 additions & 3 deletions
@@ -38,6 +38,7 @@ func TestConfigMapUnpacker(t *testing.T) {
 	now := func() metav1.Time {
 		return start
 	}
+	backoffLimit := int32(3)

 	type fields struct {
 		objs []runtime.Object
@@ -192,12 +193,13 @@ func TestConfigMapUnpacker(t *testing.T) {
 						},
 					},
 					Spec: batchv1.JobSpec{
+						BackoffLimit: &backoffLimit,
 						Template: corev1.PodTemplateSpec{
 							ObjectMeta: metav1.ObjectMeta{
 								Name: pathHash,
 							},
 							Spec: corev1.PodSpec{
-								RestartPolicy:    corev1.RestartPolicyOnFailure,
+								RestartPolicy:    corev1.RestartPolicyNever,
 								ImagePullSecrets: []corev1.LocalObjectReference{{Name: "my-secret"}},
 								Containers: []corev1.Container{
 									{
@@ -368,12 +370,13 @@ func TestConfigMapUnpacker(t *testing.T) {
 						},
 					},
 					Spec: batchv1.JobSpec{
+						BackoffLimit: &backoffLimit,
 						Template: corev1.PodTemplateSpec{
 							ObjectMeta: metav1.ObjectMeta{
 								Name: pathHash,
 							},
 							Spec: corev1.PodSpec{
-								RestartPolicy: corev1.RestartPolicyOnFailure,
+								RestartPolicy: corev1.RestartPolicyNever,
 								Containers: []corev1.Container{
 									{
 										Name: "extract",
@@ -582,12 +585,13 @@ func TestConfigMapUnpacker(t *testing.T) {
 						},
 					},
 					Spec: batchv1.JobSpec{
+						BackoffLimit: &backoffLimit,
 						Template: corev1.PodTemplateSpec{
 							ObjectMeta: metav1.ObjectMeta{
 								Name: pathHash,
 							},
 							Spec: corev1.PodSpec{
-								RestartPolicy: corev1.RestartPolicyOnFailure,
+								RestartPolicy: corev1.RestartPolicyNever,
 								Containers: []corev1.Container{
 									{
 										Name: "extract",
@@ -761,6 +765,7 @@ func TestConfigMapUnpacker(t *testing.T) {
 			factory := informers.NewSharedInformerFactory(client, period)
 			cmLister := factory.Core().V1().ConfigMaps().Lister()
 			jobLister := factory.Batch().V1().Jobs().Lister()
+			podLister := factory.Core().V1().Pods().Lister()
 			roleLister := factory.Rbac().V1().Roles().Lister()
 			rbLister := factory.Rbac().V1().RoleBindings().Lister()

@@ -781,6 +786,7 @@ func TestConfigMapUnpacker(t *testing.T) {
 				WithCatalogSourceLister(csLister),
 				WithConfigMapLister(cmLister),
 				WithJobLister(jobLister),
+				WithPodLister(podLister),
 				WithRoleLister(roleLister),
 				WithRoleBindingLister(rbLister),
 				WithOPMImage(opmImage),

pkg/controller/bundle/errors.go

Lines changed: 0 additions & 28 deletions
This file was deleted.
