4
4
"context"
5
5
"crypto/sha256"
6
6
"fmt"
7
+ "strings"
8
+ "time"
7
9
8
10
"github.com/operator-framework/operator-registry/pkg/api"
9
11
"github.com/operator-framework/operator-registry/pkg/configmap"
@@ -14,6 +16,7 @@ import (
14
16
apierrors "k8s.io/apimachinery/pkg/api/errors"
15
17
"k8s.io/apimachinery/pkg/api/resource"
16
18
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19
+ k8slabels "k8s.io/apimachinery/pkg/labels"
17
20
"k8s.io/client-go/kubernetes"
18
21
listersbatchv1 "k8s.io/client-go/listers/batch/v1"
19
22
listerscorev1 "k8s.io/client-go/listers/core/v1"
@@ -25,6 +28,19 @@ import (
25
28
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/registry/resolver/projection"
26
29
)
27
30
31
+ const (
32
+ // TODO: Move to operator-framework/api/pkg/operators/v1alpha1
33
+ // BundleLookupFailed describes conditions types for when BundleLookups fail
34
+ BundleLookupFailed operatorsv1alpha1.BundleLookupConditionType = "BundleLookupFailed"
35
+
36
+ // TODO: This can be a spec field
37
+ // BundleUnpackTimeoutAnnotationKey allows setting a bundle unpack timeout per InstallPlan
38
+ // and overrides the default specified by the --bundle-unpack-timeout flag
39
+ // The time duration should be in the same format as accepted by time.ParseDuration()
40
+ // e.g 1m30s
41
+ BundleUnpackTimeoutAnnotationKey = "operatorframework.io/bundle-unpack-timeout"
42
+ )
43
+
28
44
type BundleUnpackResult struct {
29
45
* operatorsv1alpha1.BundleLookup
30
46
@@ -66,7 +82,7 @@ func newBundleUnpackResult(lookup *operatorsv1alpha1.BundleLookup) *BundleUnpack
66
82
}
67
83
}
68
84
69
- func (c * ConfigMapUnpacker ) job (cmRef * corev1.ObjectReference , bundlePath string , secrets []corev1.LocalObjectReference ) * batchv1.Job {
85
+ func (c * ConfigMapUnpacker ) job (cmRef * corev1.ObjectReference , bundlePath string , secrets []corev1.LocalObjectReference , annotationUnpackTimeout time. Duration ) * batchv1.Job {
70
86
job := & batchv1.Job {
71
87
Spec : batchv1.JobSpec {
72
88
//ttlSecondsAfterFinished: 0 // can use in the future to not have to clean up job
@@ -75,7 +91,12 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
75
91
Name : cmRef .Name ,
76
92
},
77
93
Spec : corev1.PodSpec {
78
- RestartPolicy : corev1 .RestartPolicyOnFailure ,
94
+ // With restartPolicy = "OnFailure" when the spec.backoffLimit is reached, the job controller will delete all
95
+ // the job's pods to stop them from crashlooping forever.
96
+ // By setting restartPolicy = "Never" the pods don't get cleaned up since they're not running after a failure.
97
+ // Keeping the pods around after failures helps in inspecting the logs of a failed bundle unpack job.
98
+ // See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
99
+ RestartPolicy : corev1 .RestartPolicyNever ,
79
100
ImagePullSecrets : secrets ,
80
101
Containers : []corev1.Container {
81
102
{
@@ -165,26 +186,57 @@ func (c *ConfigMapUnpacker) job(cmRef *corev1.ObjectReference, bundlePath string
165
186
job .SetName (cmRef .Name )
166
187
job .SetOwnerReferences ([]metav1.OwnerReference {ownerRef (cmRef )})
167
188
189
+ // By default the BackoffLimit is set to 6 which with exponential backoff 10s + 20s + 40s ...
190
+ // translates to ~10m of waiting time.
191
+ // We want to fail faster than that when we have repeated failures from the bundle unpack pod
192
+ // so we set it to 3 which is ~1m of waiting time
193
+ // See: https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy
194
+ backOffLimit := int32 (3 )
195
+ job .Spec .BackoffLimit = & backOffLimit
196
+
197
+ // Set ActiveDeadlineSeconds as the unpack timeout
198
+ // Don't set a timeout if it is 0
199
+ if c .unpackTimeout != time .Duration (0 ) {
200
+ t := int64 (c .unpackTimeout .Seconds ())
201
+ job .Spec .ActiveDeadlineSeconds = & t
202
+ }
203
+
204
+ // Check annotationUnpackTimeout which is the annotation override for the default unpack timeout
205
+ // A negative timeout means the annotation was unset or malformed so we ignore it
206
+ if annotationUnpackTimeout < time .Duration (0 ) {
207
+ return job
208
+ }
209
+ // 0 means no timeout so we unset ActiveDeadlineSeconds
210
+ if annotationUnpackTimeout == time .Duration (0 ) {
211
+ job .Spec .ActiveDeadlineSeconds = nil
212
+ return job
213
+ }
214
+
215
+ timeoutSeconds := int64 (annotationUnpackTimeout .Seconds ())
216
+ job .Spec .ActiveDeadlineSeconds = & timeoutSeconds
217
+
168
218
return job
169
219
}
170
220
171
221
//go:generate go run github.com/maxbrunsfeld/counterfeiter/v6 . Unpacker
172
222
173
223
type Unpacker interface {
174
- UnpackBundle (lookup * operatorsv1alpha1.BundleLookup ) (result * BundleUnpackResult , err error )
224
+ UnpackBundle (lookup * operatorsv1alpha1.BundleLookup , timeout time. Duration ) (result * BundleUnpackResult , err error )
175
225
}
176
226
177
227
type ConfigMapUnpacker struct {
178
- opmImage string
179
- utilImage string
180
- client kubernetes.Interface
181
- csLister listersoperatorsv1alpha1.CatalogSourceLister
182
- cmLister listerscorev1.ConfigMapLister
183
- jobLister listersbatchv1.JobLister
184
- roleLister listersrbacv1.RoleLister
185
- rbLister listersrbacv1.RoleBindingLister
186
- loader * configmap.BundleLoader
187
- now func () metav1.Time
228
+ opmImage string
229
+ utilImage string
230
+ client kubernetes.Interface
231
+ csLister listersoperatorsv1alpha1.CatalogSourceLister
232
+ cmLister listerscorev1.ConfigMapLister
233
+ jobLister listersbatchv1.JobLister
234
+ podLister listerscorev1.PodLister
235
+ roleLister listersrbacv1.RoleLister
236
+ rbLister listersrbacv1.RoleBindingLister
237
+ loader * configmap.BundleLoader
238
+ now func () metav1.Time
239
+ unpackTimeout time.Duration
188
240
}
189
241
190
242
type ConfigMapUnpackerOption func (* ConfigMapUnpacker )
@@ -202,6 +254,12 @@ func NewConfigmapUnpacker(options ...ConfigMapUnpackerOption) (*ConfigMapUnpacke
202
254
return unpacker , nil
203
255
}
204
256
257
+ func WithUnpackTimeout (timeout time.Duration ) ConfigMapUnpackerOption {
258
+ return func (unpacker * ConfigMapUnpacker ) {
259
+ unpacker .unpackTimeout = timeout
260
+ }
261
+ }
262
+
205
263
func WithOPMImage (opmImage string ) ConfigMapUnpackerOption {
206
264
return func (unpacker * ConfigMapUnpacker ) {
207
265
unpacker .opmImage = opmImage
@@ -238,6 +296,12 @@ func WithJobLister(jobLister listersbatchv1.JobLister) ConfigMapUnpackerOption {
238
296
}
239
297
}
240
298
299
+ func WithPodLister (podLister listerscorev1.PodLister ) ConfigMapUnpackerOption {
300
+ return func (unpacker * ConfigMapUnpacker ) {
301
+ unpacker .podLister = podLister
302
+ }
303
+ }
304
+
241
305
func WithRoleLister (roleLister listersrbacv1.RoleLister ) ConfigMapUnpackerOption {
242
306
return func (unpacker * ConfigMapUnpacker ) {
243
307
unpacker .roleLister = roleLister
@@ -276,6 +340,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
276
340
err = fmt .Errorf ("configmap lister is nil" )
277
341
case c .jobLister == nil :
278
342
err = fmt .Errorf ("job lister is nil" )
343
+ case c .podLister == nil :
344
+ err = fmt .Errorf ("pod lister is nil" )
279
345
case c .roleLister == nil :
280
346
err = fmt .Errorf ("role lister is nil" )
281
347
case c .rbLister == nil :
@@ -292,6 +358,8 @@ func (c *ConfigMapUnpacker) validate() (err error) {
292
358
const (
293
359
CatalogSourceMissingReason = "CatalogSourceMissing"
294
360
CatalogSourceMissingMessage = "referenced catalogsource not found"
361
+ JobFailedReason = "JobFailed"
362
+ JobFailedMessage = "unpack job has failed"
295
363
JobIncompleteReason = "JobIncomplete"
296
364
JobIncompleteMessage = "unpack job not completed"
297
365
JobNotStartedReason = "JobNotStarted"
@@ -300,25 +368,32 @@ const (
300
368
NotUnpackedMessage = "bundle contents have not yet been persisted to installplan status"
301
369
)
302
370
303
- func (c * ConfigMapUnpacker ) UnpackBundle (lookup * operatorsv1alpha1.BundleLookup ) (result * BundleUnpackResult , err error ) {
371
+ func (c * ConfigMapUnpacker ) UnpackBundle (lookup * operatorsv1alpha1.BundleLookup , timeout time.Duration ) (result * BundleUnpackResult , err error ) {
372
+
304
373
result = newBundleUnpackResult (lookup )
305
374
306
- // if pending condition is missing, bundle has already been unpacked
307
- cond := result .GetCondition (operatorsv1alpha1 .BundleLookupPending )
308
- if cond .Status == corev1 .ConditionUnknown {
375
+ // if bundle lookup failed condition already present, then there is nothing more to do
376
+ failedCond := result .GetCondition (BundleLookupFailed )
377
+ if failedCond .Status == corev1 .ConditionTrue {
378
+ return result , nil
379
+ }
380
+
381
+ // if pending condition is not true then bundle has already been unpacked(unknown)
382
+ pendingCond := result .GetCondition (operatorsv1alpha1 .BundleLookupPending )
383
+ if pendingCond .Status != corev1 .ConditionTrue {
309
384
return result , nil
310
385
}
311
386
312
387
now := c .now ()
313
388
314
389
var cs * operatorsv1alpha1.CatalogSource
315
390
if cs , err = c .csLister .CatalogSources (result .CatalogSourceRef .Namespace ).Get (result .CatalogSourceRef .Name ); err != nil {
316
- if apierrors .IsNotFound (err ) && cond .Reason != CatalogSourceMissingReason {
317
- cond .Status = corev1 .ConditionTrue
318
- cond .Reason = CatalogSourceMissingReason
319
- cond .Message = CatalogSourceMissingMessage
320
- cond .LastTransitionTime = & now
321
- result .SetCondition (cond )
391
+ if apierrors .IsNotFound (err ) && pendingCond .Reason != CatalogSourceMissingReason {
392
+ pendingCond .Status = corev1 .ConditionTrue
393
+ pendingCond .Reason = CatalogSourceMissingReason
394
+ pendingCond .Message = CatalogSourceMissingMessage
395
+ pendingCond .LastTransitionTime = & now
396
+ result .SetCondition (pendingCond )
322
397
err = nil
323
398
}
324
399
@@ -356,17 +431,50 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
356
431
secrets = append (secrets , corev1.LocalObjectReference {Name : secretName })
357
432
}
358
433
var job * batchv1.Job
359
- job , err = c .ensureJob (cmRef , result .Path , secrets )
360
- if err != nil {
434
+ job , err = c .ensureJob (cmRef , result .Path , secrets , timeout )
435
+ if err != nil || job == nil {
436
+ // ensureJob can return nil if the job present does not match the expected job (spec and ownerefs)
437
+ // The current job is deleted in that case so UnpackBundle needs to be retried
438
+ return
439
+ }
440
+
441
+ // Check if bundle unpack job has failed due to a timeout
442
+ // Return a BundleJobError so we can mark the InstallPlan as Failed
443
+ if jobCond , isFailed := getCondition (job , batchv1 .JobFailed ); isFailed {
444
+ // Add the BundleLookupFailed condition with the message and reason from the job failure
445
+ failedCond .Status = corev1 .ConditionTrue
446
+ failedCond .Reason = jobCond .Reason
447
+ failedCond .Message = jobCond .Message
448
+ failedCond .LastTransitionTime = & now
449
+ result .SetCondition (failedCond )
450
+
361
451
return
362
452
}
363
453
364
- if ! jobConditionTrue (job , batchv1 .JobComplete ) && (cond .Status != corev1 .ConditionTrue || cond .Reason != JobIncompleteReason ) {
365
- cond .Status = corev1 .ConditionTrue
366
- cond .Reason = JobIncompleteReason
367
- cond .Message = JobIncompleteMessage
368
- cond .LastTransitionTime = & now
369
- result .SetCondition (cond )
454
+ if _ , isComplete := getCondition (job , batchv1 .JobComplete ); ! isComplete {
455
+ // In the case of an image pull failure for a non-existent image the bundle unpack job
456
+ // can stay pending until the ActiveDeadlineSeconds timeout ~10m
457
+ // To indicate why it's pending we inspect the container statuses of the
458
+ // unpack Job pods to surface that information on the bundle lookup conditions
459
+ pendingMessage := JobIncompleteMessage
460
+ var pendingContainerStatusMsgs string
461
+ pendingContainerStatusMsgs , err = c .pendingContainerStatusMessages (job )
462
+ if err != nil {
463
+ return
464
+ }
465
+
466
+ if pendingContainerStatusMsgs != "" {
467
+ pendingMessage = pendingMessage + ": " + pendingContainerStatusMsgs
468
+ }
469
+
470
+ // Update BundleLookupPending condition if there are any changes
471
+ if pendingCond .Status != corev1 .ConditionTrue || pendingCond .Reason != JobIncompleteReason || pendingCond .Message != pendingMessage {
472
+ pendingCond .Status = corev1 .ConditionTrue
473
+ pendingCond .Reason = JobIncompleteReason
474
+ pendingCond .Message = pendingMessage
475
+ pendingCond .LastTransitionTime = & now
476
+ result .SetCondition (pendingCond )
477
+ }
370
478
371
479
return
372
480
}
@@ -394,6 +502,39 @@ func (c *ConfigMapUnpacker) UnpackBundle(lookup *operatorsv1alpha1.BundleLookup)
394
502
return
395
503
}
396
504
505
+ func (c * ConfigMapUnpacker ) pendingContainerStatusMessages (job * batchv1.Job ) (string , error ) {
506
+ containerStatusMessages := []string {}
507
+ // List pods for unpack job
508
+ podLabel := map [string ]string {"job-name" : job .GetName ()}
509
+ pods , listErr := c .podLister .Pods (job .GetNamespace ()).List (k8slabels .SelectorFromSet (podLabel ))
510
+ if listErr != nil {
511
+ return "" , fmt .Errorf ("Failed to list pods for job(%s): %v" , job .GetName (), listErr )
512
+ }
513
+
514
+ // Ideally there should be just 1 pod running but inspect all pods in the pending phase
515
+ // to see if any are stuck on an ImagePullBackOff or ErrImagePull error
516
+ for _ , pod := range pods {
517
+ if pod .Status .Phase != corev1 .PodPending {
518
+ // skip status check for non-pending pods
519
+ continue
520
+ }
521
+
522
+ for _ , ic := range pod .Status .InitContainerStatuses {
523
+ if ic .Ready {
524
+ // only check non-ready containers for their waiting reasons
525
+ continue
526
+ }
527
+
528
+ // Aggregate the wait reasons for all pending containers
529
+ containerStatusMessages = append (containerStatusMessages ,
530
+ fmt .Sprintf ("Unpack pod(%s/%s) container(%s) is pending. Reason: %s, Message: %s" ,
531
+ pod .Namespace , pod .Name , ic .Name , ic .State .Waiting .Reason , ic .State .Waiting .Message ))
532
+ }
533
+ }
534
+
535
+ return strings .Join (containerStatusMessages , " | " ), nil
536
+ }
537
+
397
538
func (c * ConfigMapUnpacker ) ensureConfigmap (csRef * corev1.ObjectReference , name string ) (cm * corev1.ConfigMap , err error ) {
398
539
fresh := & corev1.ConfigMap {}
399
540
fresh .SetNamespace (csRef .Namespace )
@@ -408,8 +549,8 @@ func (c *ConfigMapUnpacker) ensureConfigmap(csRef *corev1.ObjectReference, name
408
549
return
409
550
}
410
551
411
- func (c * ConfigMapUnpacker ) ensureJob (cmRef * corev1.ObjectReference , bundlePath string , secrets []corev1.LocalObjectReference ) (job * batchv1.Job , err error ) {
412
- fresh := c .job (cmRef , bundlePath , secrets )
552
+ func (c * ConfigMapUnpacker ) ensureJob (cmRef * corev1.ObjectReference , bundlePath string , secrets []corev1.LocalObjectReference , timeout time. Duration ) (job * batchv1.Job , err error ) {
553
+ fresh := c .job (cmRef , bundlePath , secrets , timeout )
413
554
job , err = c .jobLister .Jobs (fresh .GetNamespace ()).Get (fresh .GetName ())
414
555
if err != nil {
415
556
if apierrors .IsNotFound (err ) {
@@ -540,16 +681,19 @@ func ownerRef(ref *corev1.ObjectReference) metav1.OwnerReference {
540
681
}
541
682
}
542
683
543
- // jobConditionTrue returns true if the given job has the given condition with the given condition type true, and returns false otherwise.
544
- func jobConditionTrue (job * batchv1.Job , conditionType batchv1.JobConditionType ) bool {
684
+ // getCondition returns true if the given job has the given condition with the given condition type true, and returns false otherwise.
685
+ // Also returns the condition if true
686
+ func getCondition (job * batchv1.Job , conditionType batchv1.JobConditionType ) (condition * batchv1.JobCondition , isTrue bool ) {
545
687
if job == nil {
546
- return false
688
+ return
547
689
}
548
690
549
691
for _ , cond := range job .Status .Conditions {
550
692
if cond .Type == conditionType && cond .Status == corev1 .ConditionTrue {
551
- return true
693
+ condition = & cond
694
+ isTrue = true
695
+ return
552
696
}
553
697
}
554
- return false
698
+ return
555
699
}
0 commit comments