@@ -15,6 +15,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	k8slabels "k8s.io/apimachinery/pkg/labels"
 	"k8s.io/client-go/kubernetes"
 	listersbatchv1 "k8s.io/client-go/listers/batch/v1"
 	listerscorev1 "k8s.io/client-go/listers/core/v1"
@@ -26,6 +27,12 @@ import (
 	"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver/projection"
 )
 
+const (
+	// TODO: Move to operator-framework/api/pkg/operators/v1alpha1
+	// BundleLookupFailed describes the condition type for when BundleLookups fail
+	BundleLookupFailed operatorsv1alpha1.BundleLookupConditionType = "BundleLookupFailed"
+)
+
 type BundleUnpackResult struct {
 	*operatorsv1alpha1.BundleLookup
 
@@ -76,7 +83,12 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
 				Name: cmRef.Name,
 			},
 			Spec: corev1.PodSpec{
-				RestartPolicy: corev1.RestartPolicyOnFailure,
+				// With restartPolicy = "OnFailure", the job controller deletes all of the job's pods
+				// once spec.backoffLimit is reached, to stop them from crashlooping forever.
+				// By setting restartPolicy = "Never", the pods don't get cleaned up, since they're not restarted after a failure.
+				// Keeping the pods around after failures helps in inspecting the logs of a failed bundle unpack job.
+				// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
+				RestartPolicy: corev1.RestartPolicyNever,
 				ImagePullSecrets: secrets,
 				Containers: []corev1.Container{
 					{
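
Reviewer note: with restartPolicy "Never" each retry creates a fresh pod, so failed pods accumulate (they count against the backoffLimit) and remain inspectable, whereas "OnFailure" restarts the same pod in place and the job controller deletes it once the limit is hit. As a sketch of what this enables, a hypothetical debugging helper (not part of this PR) could list the surviving failed pods through the same pod lister introduced below; "job-name" is the label the job controller stamps on the pods it creates:

    // hypothetical helper, same package as the unpacker
    func failedUnpackPods(podLister listerscorev1.PodLister, namespace, jobName string) ([]*corev1.Pod, error) {
    	pods, err := podLister.Pods(namespace).List(k8slabels.SelectorFromSet(map[string]string{"job-name": jobName}))
    	if err != nil {
    		return nil, err
    	}
    	var failed []*corev1.Pod
    	for _, p := range pods {
    		if p.Status.Phase == corev1.PodFailed {
    			failed = append(failed, p)
    		}
    	}
    	return failed, nil
    }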
@@ -172,6 +184,15 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
 		job.Spec.ActiveDeadlineSeconds = &t
 	}
 
+	// By default the BackoffLimit is set to 6, which with exponential backoff (10s + 20s + 40s ...)
+	// translates to ~10m of waiting time.
+	// We want to fail faster than that when we have repeated failures from the bundle unpack pod,
+	// so we set it to 3, which is ~1m of waiting time.
+	// See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
+	// TODO (haseeb): Should this be configurable as well?
+	backOffLimit := int32(3)
+	job.Spec.BackoffLimit = &backOffLimit
+
 	return job
 }
 
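
Reviewer note: a quick sanity check of the timing claim in the comment above, assuming the job controller's documented behavior (10s base delay, doubling per retry, capped at 6m — the cap never kicks in below a limit of 6):

    package main

    import (
    	"fmt"
    	"time"
    )

    // totalBackoff sums the job controller's per-retry delays: 10s, 20s, 40s, ...
    func totalBackoff(limit int) time.Duration {
    	delay, total := 10*time.Second, time.Duration(0)
    	for i := 0; i < limit; i++ {
    		total += delay
    		delay *= 2
    	}
    	return total
    }

    func main() {
    	fmt.Println(totalBackoff(3)) // 1m10s  -> the "~1m" in the comment
    	fmt.Println(totalBackoff(6)) // 10m30s -> the "~10m" default
    }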
@@ -188,6 +209,7 @@ type ConfigMapUnpacker struct {
 	csLister   listersoperatorsv1alpha1.CatalogSourceLister
 	cmLister   listerscorev1.ConfigMapLister
 	jobLister  listersbatchv1.JobLister
+	podLister  listerscorev1.PodLister
 	roleLister listersrbacv1.RoleLister
 	rbLister   listersrbacv1.RoleBindingLister
 	loader     *configmap.BundleLoader
@@ -252,6 +274,12 @@ func WithJobLister(jobLister listersbatchv1.JobLister) ConfigMapUnpackerOption {
 	}
 }
 
+func WithPodLister(podLister listerscorev1.PodLister) ConfigMapUnpackerOption {
+	return func(unpacker *ConfigMapUnpacker) {
+		unpacker.podLister = podLister
+	}
+}
+
 func WithRoleLister(roleLister listersrbacv1.RoleLister) ConfigMapUnpackerOption {
 	return func(unpacker *ConfigMapUnpacker) {
 		unpacker.roleLister = roleLister
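
Reviewer note: the new option slots into the existing functional-options wiring. A minimal sketch, assuming the package's NewConfigmapUnpacker constructor applies the options and then runs validate() (factory names are illustrative, and the remaining required options are elided):

    factory := informers.NewSharedInformerFactory(client, 0) // "k8s.io/client-go/informers"
    unpacker, err := NewConfigmapUnpacker(
    	WithJobLister(factory.Batch().V1().Jobs().Lister()),
    	WithPodLister(factory.Core().V1().Pods().Lister()),
    	WithRoleLister(factory.Rbac().V1().Roles().Lister()),
    	// ... remaining listers/options elided
    )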
@@ -290,6 +318,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
 		err = fmt.Errorf("configmap lister is nil")
 	case c.jobLister == nil:
 		err = fmt.Errorf("job lister is nil")
+	case c.podLister == nil:
+		err = fmt.Errorf("pod lister is nil")
 	case c.roleLister == nil:
 		err = fmt.Errorf("role lister is nil")
 	case c.rbLister == nil:
@@ -306,6 +336,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
 const (
 	CatalogSourceMissingReason  = "CatalogSourceMissing"
 	CatalogSourceMissingMessage = "referenced catalogsource not found"
+	JobFailedReason             = "JobFailed"
+	JobFailedMessage            = "unpack job has failed"
 	JobIncompleteReason         = "JobIncomplete"
 	JobIncompleteMessage        = "unpack job not completed"
 	JobNotStartedReason         = "JobNotStarted"
@@ -317,11 +349,21 @@
 func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup) (result *BundleUnpackResult, err error) {
 	result = newBundleUnpackResult(lookup)
 
+	// if the BundleLookupFailed condition is already present, there is nothing more to do
+	failedCond := result.GetCondition(BundleLookupFailed)
+	if failedCond.Status == corev1.ConditionTrue {
+		return result, nil
+	}
+
 	// if pending condition is missing, bundle has already been unpacked
 	cond := result.GetCondition(operatorsv1alpha1.BundleLookupPending)
 	if cond.Status == corev1.ConditionUnknown {
 		return result, nil
 	}
+	// if the pending condition is false, the bundle unpack has already failed, so there is nothing more to do
+	if cond.Status == corev1.ConditionFalse {
+		return result, nil
+	}
 
 	now := c.now()
 
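
Reviewer note: these three early returns make both the unpacked and the failed states terminal. Restated as a single predicate (a sketch only, mirroring the condition logic in this diff):

    // a lookup is terminal when it has already failed (BundleLookupFailed true,
    // or BundleLookupPending false) or has already been unpacked (pending unknown)
    func unpackIsTerminal(result *BundleUnpackResult) bool {
    	failed := result.GetCondition(BundleLookupFailed)
    	pending := result.GetCondition(operatorsv1alpha1.BundleLookupPending)
    	return failed.Status == corev1.ConditionTrue ||
    		pending.Status == corev1.ConditionUnknown ||
    		pending.Status == corev1.ConditionFalse
    }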
@@ -379,23 +421,49 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
 	// Return a BundleJobError so we can mark the InstallPlan as Failed
 	isFailed, jobCond := jobConditionTrue(job, batchv1.JobFailed)
 	if isFailed {
-		cond.Status = corev1.ConditionTrue
-		cond.Reason = jobCond.Reason
-		cond.Message = jobCond.Message
+		// Add the BundleLookupFailed condition with the message and reason from the job failure
+		failedCond.Status = corev1.ConditionTrue
+		failedCond.Reason = jobCond.Reason
+		failedCond.Message = jobCond.Message
+		failedCond.LastTransitionTime = &now
+		result.SetCondition(failedCond)
+
+		// BundleLookupPending is false, with the reason being the job failure
+		cond.Status = corev1.ConditionFalse
+		cond.Reason = JobFailedReason
+		cond.Message = JobFailedMessage
 		cond.LastTransitionTime = &now
 		result.SetCondition(cond)
 
-		err = NewBundleJobError(fmt.Sprintf("Bundle extract Job failed with Reason: %v, and Message: %v", jobCond.Reason, jobCond.Message))
 		return
 	}
 
 	isComplete, _ := jobConditionTrue(job, batchv1.JobComplete)
-	if !isComplete && (cond.Status != corev1.ConditionTrue || cond.Reason != JobIncompleteReason) {
-		cond.Status = corev1.ConditionTrue
-		cond.Reason = JobIncompleteReason
-		cond.Message = JobIncompleteMessage
-		cond.LastTransitionTime = &now
-		result.SetCondition(cond)
+	if !isComplete {
+
+		// In the case of an image pull failure for a non-existent image, the bundle unpack job
+		// can stay pending until the ActiveDeadlineSeconds timeout (~10m).
+		// To indicate why it's pending, we inspect the container statuses of the
+		// unpack job pods and surface that information on the bundle lookup conditions.
+		pendingMessage := JobIncompleteMessage
+		var pendingContainerStatusMsgs string
+		pendingContainerStatusMsgs, err = c.pendingContainerStatusMessages(job)
+		if err != nil {
+			return
+		}
+
+		if pendingContainerStatusMsgs != "" {
+			pendingMessage = pendingMessage + ": " + pendingContainerStatusMsgs
+		}
+
+		// Update the BundleLookupPending condition only if there are any changes
+		if cond.Status != corev1.ConditionTrue || cond.Reason != JobIncompleteReason || cond.Message != pendingMessage {
+			cond.Status = corev1.ConditionTrue
+			cond.Reason = JobIncompleteReason
+			cond.Message = pendingMessage
+			cond.LastTransitionTime = &now
+			result.SetCondition(cond)
+		}
 
 		return
 	}
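
Reviewer note: with hypothetical names, a lookup stuck on a bad image therefore surfaces a BundleLookupPending condition whose message reads roughly:

    unpack job not completed: Unpack pod(olm/unpack-pod) container(pull) is pending. Reason: ImagePullBackOff, Message: Back-off pulling image "quay.io/example/missing:tag"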
@@ -423,6 +491,44 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
 	return
 }
 
+func (c *ConfigMapUnpacker) pendingContainerStatusMessages(job *batchv1.Job) (string, error) {
+	containerStatusMessages := ""
+	// List pods for the unpack job
+	podLabel := map[string]string{"job-name": job.GetName()}
+	pods, listErr := c.podLister.Pods(job.GetNamespace()).List(k8slabels.SelectorFromSet(podLabel))
+	if listErr != nil {
+		return containerStatusMessages, fmt.Errorf("failed to list pods for job(%s): %v", job.GetName(), listErr)
+	}
+
+	// Ideally there should be just one pod running, but inspect all pods in the pending phase
+	// to see if any are stuck on an ImagePullBackOff or ErrImagePull error
+	for _, pod := range pods {
+		if pod.Status.Phase != corev1.PodPending {
+			// skip status checks for non-pending pods
+			continue
+		}
+
+		for _, ic := range pod.Status.InitContainerStatuses {
+			if ic.Ready {
+				// only check non-ready containers for their waiting reasons
+				continue
+			}
+			if ic.State.Waiting == nil {
+				// a non-ready container that isn't waiting (e.g. still running)
+				// has no waiting reason to surface
+				continue
+			}
+
+			// Aggregate the wait reasons for all pending containers
+			containerStatusMessages = containerStatusMessages +
+				fmt.Sprintf("Unpack pod(%s/%s) container(%s) is pending. Reason: %s, Message: %s\n",
+					pod.Namespace, pod.Name, ic.Name, ic.State.Waiting.Reason, ic.State.Waiting.Message)
+		}
+	}
+
+	return containerStatusMessages, nil
+}
+
 func (c *ConfigMapUnpacker) ensureConfigmap(csRef *corev1.ObjectReference, name string) (cm *corev1.ConfigMap, err error) {
 	fresh := &corev1.ConfigMap{}
 	fresh.SetNamespace(csRef.Namespace)
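
Reviewer note: to see the new helper end to end, a unit-test-style sketch in the same package, backed by an in-memory indexer rather than a live informer (assumes the usual testing/strings/cache imports; "olm", "unpack-job", and the image are illustrative, not from this PR):

    func TestPendingContainerStatusMessages(t *testing.T) {
    	indexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{}) // "k8s.io/client-go/tools/cache"
    	_ = indexer.Add(&corev1.Pod{
    		ObjectMeta: metav1.ObjectMeta{
    			Name:      "unpack-pod",
    			Namespace: "olm",
    			// the job controller stamps this label on every pod it creates,
    			// which is what pendingContainerStatusMessages selects on
    			Labels: map[string]string{"job-name": "unpack-job"},
    		},
    		Status: corev1.PodStatus{
    			Phase: corev1.PodPending,
    			InitContainerStatuses: []corev1.ContainerStatus{{
    				Name: "pull",
    				State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{
    					Reason:  "ImagePullBackOff",
    					Message: `Back-off pulling image "quay.io/example/missing:tag"`,
    				}},
    			}},
    		},
    	})

    	c := &ConfigMapUnpacker{podLister: listerscorev1.NewPodLister(indexer)}
    	msg, err := c.pendingContainerStatusMessages(&batchv1.Job{
    		ObjectMeta: metav1.ObjectMeta{Name: "unpack-job", Namespace: "olm"},
    	})
    	if err != nil || !strings.Contains(msg, "ImagePullBackOff") {
    		t.Fatalf("msg=%q err=%v", msg, err)
    	}
    }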