
Commit 260bc91

Check for bundle unpack failure via status conditions
Instead of checking for a specific error type when the bundle unpack job fails, a new BundleLookupFailed condition is set to indicate a failed bundle lookup, and the InstallPlan is transitioned to Failed based on the presence of this condition. While the InstallPlan is waiting on a BundleLookupPending condition, the message for that condition is also updated with the init container statuses of the unpack pods, since those can surface image pull errors when the lookup is stalled on a non-existent bundle image. The bundle unpack Job's BackoffLimit is also lowered to fail fast on repeated crashes, and the pod restart policy is set to Never to preserve the container logs after the Job is terminated via the backoff limit.
1 parent 048cdb4 commit 260bc91
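
The catalog operator's InstallPlan sync keys off the new condition rather than a sentinel error type; that consumer side is presumably in one of the changed files not shown on this page. A minimal sketch of the check, assuming the OLM API types live under github.com/operator-framework/api/pkg/operators/v1alpha1 and using a helper name of our own choosing:

import (
	corev1 "k8s.io/api/core/v1"
	operatorsv1alpha1 "github.com/operator-framework/api/pkg/operators/v1alpha1"
)

// bundleLookupFailed reports whether any bundle lookup recorded on the
// InstallPlan carries a true BundleLookupFailed condition; the operator can
// then move the plan to the Failed phase instead of retrying the unpack.
func bundleLookupFailed(plan *operatorsv1alpha1.InstallPlan) bool {
	for _, lookup := range plan.Status.BundleLookups {
		// "BundleLookupFailed" is the condition type added in bundle_unpacker.go below.
		if lookup.GetCondition("BundleLookupFailed").Status == corev1.ConditionTrue {
			return true
		}
	}
	return false
}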

File tree

6 files changed (+189, -79 lines)


cmd/catalog/main.go

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ var (
 		"profiling", false, "serve profiling data (on port 8080)")

 	installPlanTimeout  = flag.Duration("install-plan-retry-timeout", 1*time.Minute, "time since first attempt at which plan execution errors are considered fatal")
-	bundleUnpackTimeout = flag.Duration("bundle-unpack-timeout", catalog.DefaultBundleUnpackTimeout, "The time duration after which the bundle unpack Job for an installplan will be aborted and the installplan will be Failed. 0 is considered as having no timeout.")
+	bundleUnpackTimeout = flag.Duration("bundle-unpack-timeout", 10*time.Minute, "The time duration after which the bundle unpack Job for an installplan will be aborted and the installplan will be Failed. 0 is considered as having no timeout.")
 )

 func init() {
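
The flag only sets the upper bound; per its help text a zero duration disables the deadline entirely. A small sketch of how the value plausibly maps onto the unpack Job, assuming wiring along the lines of the ActiveDeadlineSeconds assignment visible in the bundle_unpacker.go hunks below (the helper name is ours, not the commit's):

import (
	"time"

	batchv1 "k8s.io/api/batch/v1"
)

// applyUnpackTimeout sets the unpack Job's deadline from the configured
// --bundle-unpack-timeout value; a zero (or negative) duration leaves
// ActiveDeadlineSeconds unset, i.e. no timeout.
func applyUnpackTimeout(job *batchv1.Job, timeout time.Duration) {
	if timeout <= 0 {
		return
	}
	t := int64(timeout.Seconds())
	job.Spec.ActiveDeadlineSeconds = &t
}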

pkg/controller/bundle/bundle_unpacker.go

Lines changed: 112 additions & 11 deletions
@@ -15,6 +15,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	k8slabels "k8s.io/apimachinery/pkg/labels"
 	"k8s.io/client-go/kubernetes"
 	listersbatchv1 "k8s.io/client-go/listers/batch/v1"
 	listerscorev1 "k8s.io/client-go/listers/core/v1"
@@ -26,6 +27,12 @@ import (
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver/projection"
 )

+const (
+	// TODO: Move to operator-framework/api/pkg/operators/v1alpha1
+	// BundleLookupFailed describes conditions types for when BundleLookups fail
+	BundleLookupFailed operatorsv1alpha1.BundleLookupConditionType = "BundleLookupFailed"
+)
+
 type BundleUnpackResult struct {
 	*operatorsv1alpha1.BundleLookup

@@ -76,7 +83,12 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
 			Name: cmRef.Name,
 		},
 		Spec: corev1.PodSpec{
-			RestartPolicy:    corev1.RestartPolicyOnFailure,
+			// With restartPolicy = "OnFailure" when the spec.backoffLimit is reached, the job controller will delete all
+			// the job's pods to stop them from crashlooping forever.
+			// By setting restartPolicy = "Never" the pods don't get cleaned up since they're not running after a failure.
+			// Keeping the pods around after failures helps in inspecting the logs of a failed bundle unpack job.
+			// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
+			RestartPolicy:    corev1.RestartPolicyNever,
 			ImagePullSecrets: secrets,
 			Containers: []corev1.Container{
 				{
@@ -172,6 +184,15 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
 		job.Spec.ActiveDeadlineSeconds = &t
 	}

+	// By default the BackoffLimit is set to 6 which with exponential backoff 10s + 20s + 40s ...
+	// translates to ~10m of waiting time.
+	// We want to fail faster than that when we have repeated failures from the bundle unpack pod
+	// so we set it to 3 which is ~1m of waiting time
+	// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
+	// TODO (haseeb): Should this be configurable as well?
+	backOffLimit := int32(3)
+	job.Spec.BackoffLimit = &backOffLimit
+
 	return job
 }
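
Taken together, the two hunks above change how the unpack Job behaves on repeated failures: it gives up after three retries (roughly 10s + 20s + 40s of backoff, about a minute) and keeps the failed pods around for log inspection. A condensed sketch of just those knobs, with a helper name of our own (the real job() also configures volumes, init containers, resource requirements, and so on):

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// failFastJobSpec shows only the retry/restart knobs set by this commit.
func failFastJobSpec() batchv1.JobSpec {
	backoffLimit := int32(3) // fail after ~1m of exponential backoff instead of the default ~10m
	return batchv1.JobSpec{
		BackoffLimit: &backoffLimit,
		Template: corev1.PodTemplateSpec{
			Spec: corev1.PodSpec{
				// Never rather than OnFailure, so failed pods (and their logs)
				// survive the Job hitting its backoff limit.
				RestartPolicy: corev1.RestartPolicyNever,
			},
		},
	}
}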

@@ -188,6 +209,7 @@ type ConfigMapUnpacker struct {
 	csLister   listersoperatorsv1alpha1.CatalogSourceLister
 	cmLister   listerscorev1.ConfigMapLister
 	jobLister  listersbatchv1.JobLister
+	podLister  listerscorev1.PodLister
 	roleLister listersrbacv1.RoleLister
 	rbLister   listersrbacv1.RoleBindingLister
 	loader     *configmap.BundleLoader
@@ -252,6 +274,12 @@ func WithJobLister(jobLister listersbatchv1.JobLister) ConfigMapUnpackerOption {
 	}
 }

+func WithPodLister(podLister listerscorev1.PodLister) ConfigMapUnpackerOption {
+	return func(unpacker *ConfigMapUnpacker) {
+		unpacker.podLister = podLister
+	}
+}
+
 func WithRoleLister(roleLister listersrbacv1.RoleLister) ConfigMapUnpackerOption {
 	return func(unpacker *ConfigMapUnpacker) {
 		unpacker.roleLister = roleLister
@@ -290,6 +318,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
 		err = fmt.Errorf("configmap lister is nil")
 	case c.jobLister == nil:
 		err = fmt.Errorf("job lister is nil")
+	case c.podLister == nil:
+		err = fmt.Errorf("pod lister is nil")
 	case c.roleLister == nil:
 		err = fmt.Errorf("role lister is nil")
 	case c.rbLister == nil:
@@ -306,6 +336,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
 const (
 	CatalogSourceMissingReason  = "CatalogSourceMissing"
 	CatalogSourceMissingMessage = "referenced catalogsource not found"
+	JobFailedReason             = "JobFailed"
+	JobFailedMessage            = "unpack job has failed"
 	JobIncompleteReason         = "JobIncomplete"
 	JobIncompleteMessage        = "unpack job not completed"
 	JobNotStartedReason         = "JobNotStarted"
@@ -317,11 +349,21 @@ const (
 func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup) (result *BundleUnpackResult, err error) {
 	result = newBundleUnpackResult(lookup)

+	// if bundle lookup failed condition already present, then there is nothing more to do
+	failedCond := result.GetCondition(BundleLookupFailed)
+	if failedCond.Status == corev1.ConditionTrue {
+		return result, nil
+	}
+
 	// if pending condition is missing, bundle has already been unpacked
 	cond := result.GetCondition(operatorsv1alpha1.BundleLookupPending)
 	if cond.Status == corev1.ConditionUnknown {
 		return result, nil
 	}
+	// if pending condition is false, bundle unpack has already failed so nothing more to do
+	if cond.Status == corev1.ConditionFalse {
+		return result, nil
+	}

 	now := c.now()

@@ -379,23 +421,49 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
 	// Return a BundleJobError so we can mark the InstallPlan as Failed
 	isFailed, jobCond := jobConditionTrue(job, batchv1.JobFailed)
 	if isFailed {
-		cond.Status = corev1.ConditionTrue
-		cond.Reason = jobCond.Reason
-		cond.Message = jobCond.Message
+		// Add the BundleLookupFailed condition with the message and reason from the job failure
+		failedCond.Status = corev1.ConditionTrue
+		failedCond.Reason = jobCond.Reason
+		failedCond.Message = jobCond.Message
+		failedCond.LastTransitionTime = &now
+		result.SetCondition(failedCond)
+
+		// BundleLookupPending is false with reason being job failed
+		cond.Status = corev1.ConditionFalse
+		cond.Reason = JobFailedReason
+		cond.Message = JobFailedMessage
 		cond.LastTransitionTime = &now
 		result.SetCondition(cond)

-		err = NewBundleJobError(fmt.Sprintf("Bundle extract Job failed with Reason: %v, and Message: %v", jobCond.Reason, jobCond.Message))
 		return
 	}

 	isComplete, _ := jobConditionTrue(job, batchv1.JobComplete)
-	if !isComplete && (cond.Status != corev1.ConditionTrue || cond.Reason != JobIncompleteReason) {
-		cond.Status = corev1.ConditionTrue
-		cond.Reason = JobIncompleteReason
-		cond.Message = JobIncompleteMessage
-		cond.LastTransitionTime = &now
-		result.SetCondition(cond)
+	if !isComplete {
+
+		// In the case of an image pull failure for a non-existent image the bundle unpack job
+		// can stay pending until the ActiveDeadlineSeconds timeout ~10m
+		// To indicate why it's pending we inspect the container statuses of the
+		// unpack Job pods to surface that information on the bundle lookup conditions
+		pendingMessage := JobIncompleteMessage
+		var pendingContainerStatusMsgs string
+		pendingContainerStatusMsgs, err = c.pendingContainerStatusMessages(job)
+		if err != nil {
+			return
+		}
+
+		if pendingContainerStatusMsgs != "" {
+			pendingMessage = pendingMessage + ": " + pendingContainerStatusMsgs
+		}
+
+		// Update BundleLookupPending condition if there are any changes
+		if cond.Status != corev1.ConditionTrue || cond.Reason != JobIncompleteReason || cond.Message != pendingMessage {
+			cond.Status = corev1.ConditionTrue
+			cond.Reason = JobIncompleteReason
+			cond.Message = pendingMessage
+			cond.LastTransitionTime = &now
+			result.SetCondition(cond)
+		}

 		return
 	}
@@ -423,6 +491,39 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
 	return
 }

+func (c *ConfigMapUnpacker) pendingContainerStatusMessages(job *batchv1.Job) (string, error) {
+	containerStatusMessages := ""
+	// List pods for unpack job
+	podLabel := map[string]string{"job-name": job.GetName()}
+	pods, listErr := c.podLister.Pods(job.GetNamespace()).List(k8slabels.SelectorFromSet(podLabel))
+	if listErr != nil {
+		return containerStatusMessages, fmt.Errorf("Failed to list pods for job(%s): %v", job.GetName(), listErr)
+	}
+
+	// Ideally there should be just 1 pod running but inspect all pods in the pending phase
+	// to see if any are stuck on an ImagePullBackOff or ErrImagePull error
+	for _, pod := range pods {
+		if pod.Status.Phase != corev1.PodPending {
+			// skip status check for non-pending pods
+			continue
+		}
+
+		for _, ic := range pod.Status.InitContainerStatuses {
+			if ic.Ready {
+				// only check non-ready containers for their waiting reasons
+				continue
+			}
+
+			// Aggregate the wait reasons for all pending containers
+			containerStatusMessages = containerStatusMessages +
+				fmt.Sprintf("Unpack pod(%s/%s) container(%s) is pending. Reason: %s, Message: %s \n",
+					pod.Namespace, pod.Name, ic.Name, ic.State.Waiting.Reason, ic.State.Waiting.Message)
+		}
+	}
+
+	return containerStatusMessages, nil
+}
+
 func (c *ConfigMapUnpacker) ensureConfigmap(csRef *corev1.ObjectReference, name string) (cm *corev1.ConfigMap, err error) {
 	fresh := &corev1.ConfigMap{}
 	fresh.SetNamespace(csRef.Namespace)
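
To see what the new pendingContainerStatusMessages helper surfaces in practice, consider an unpack pod stuck pulling a bundle image that does not exist. The pod, container, and image names below are made up for illustration; only the label selector and message format come from the code above:

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// examplePendingPod models an unpack Job pod whose init container is stuck in
// ImagePullBackOff; the Job controller labels such pods with "job-name", which
// is how pendingContainerStatusMessages finds them.
func examplePendingPod() *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "olm",
			Name:      "unpack-abc123-xyz",
			Labels:    map[string]string{"job-name": "unpack-abc123"},
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodPending,
			InitContainerStatuses: []corev1.ContainerStatus{{
				Name:  "pull",
				Ready: false,
				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{
					Reason:  "ImagePullBackOff",
					Message: `Back-off pulling image "quay.io/example/missing-bundle:v0.0.1"`,
				}},
			}},
		},
	}
}

With a pod like this behind the Job, the BundleLookupPending condition message becomes roughly:
unpack job not completed: Unpack pod(olm/unpack-abc123-xyz) container(pull) is pending. Reason: ImagePullBackOff, Message: Back-off pulling image "quay.io/example/missing-bundle:v0.0.1"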

pkg/controller/bundle/bundle_unpacker_test.go

Lines changed: 9 additions & 3 deletions
@@ -38,6 +38,7 @@ func TestConfigMapUnpacker(t *testing.T) {
 	now := func() metav1.Time {
 		return start
 	}
+	backoffLimit := int32(3)

 	type fields struct {
 		objs []runtime.Object
@@ -192,12 +193,13 @@ func TestConfigMapUnpacker(t *testing.T) {
 						},
 					},
 					Spec: batchv1.JobSpec{
+						BackoffLimit: &backoffLimit,
 						Template: corev1.PodTemplateSpec{
 							ObjectMeta: metav1.ObjectMeta{
 								Name: pathHash,
 							},
 							Spec: corev1.PodSpec{
-								RestartPolicy:    corev1.RestartPolicyOnFailure,
+								RestartPolicy:    corev1.RestartPolicyNever,
 								ImagePullSecrets: []corev1.LocalObjectReference{{Name: "my-secret"}},
 								Containers: []corev1.Container{
 									{
@@ -368,12 +370,13 @@ func TestConfigMapUnpacker(t *testing.T) {
 						},
 					},
 					Spec: batchv1.JobSpec{
+						BackoffLimit: &backoffLimit,
 						Template: corev1.PodTemplateSpec{
 							ObjectMeta: metav1.ObjectMeta{
 								Name: pathHash,
 							},
 							Spec: corev1.PodSpec{
-								RestartPolicy: corev1.RestartPolicyOnFailure,
+								RestartPolicy: corev1.RestartPolicyNever,
 								Containers: []corev1.Container{
 									{
 										Name: "extract",
@@ -582,12 +585,13 @@ func TestConfigMapUnpacker(t *testing.T) {
 						},
 					},
 					Spec: batchv1.JobSpec{
+						BackoffLimit: &backoffLimit,
 						Template: corev1.PodTemplateSpec{
 							ObjectMeta: metav1.ObjectMeta{
 								Name: pathHash,
 							},
 							Spec: corev1.PodSpec{
-								RestartPolicy: corev1.RestartPolicyOnFailure,
+								RestartPolicy: corev1.RestartPolicyNever,
 								Containers: []corev1.Container{
 									{
 										Name: "extract",
@@ -761,6 +765,7 @@ func TestConfigMapUnpacker(t *testing.T) {
 			factory := informers.NewSharedInformerFactory(client, period)
 			cmLister := factory.Core().V1().ConfigMaps().Lister()
 			jobLister := factory.Batch().V1().Jobs().Lister()
+			podLister := factory.Core().V1().Pods().Lister()
 			roleLister := factory.Rbac().V1().Roles().Lister()
 			rbLister := factory.Rbac().V1().RoleBindings().Lister()

@@ -781,6 +786,7 @@ func TestConfigMapUnpacker(t *testing.T) {
 				WithCatalogSourceLister(csLister),
 				WithConfigMapLister(cmLister),
 				WithJobLister(jobLister),
+				WithPodLister(podLister),
 				WithRoleLister(roleLister),
 				WithRoleBindingLister(rbLister),
 				WithOPMImage(opmImage),

pkg/controller/bundle/errors.go

Lines changed: 0 additions & 28 deletions
This file was deleted.
