Skip to content

Commit b5633f2

Browse files
committed
(fix)InstallPlan: Do not tranisition IP to failed on OG/SA failure
In operator-framework#2077, a new phase `Failed` was introduced for InstallPlans, and failure in detecting a valid OperatorGroup(OG) or a Service Account(SA) for the namespace the InstallPlan was being created in would transition the InstallPlan to the `Failed` state, i.e failure to detected these resources when the InstallPlan was reconciled the first time was considered a permanant failure. This is a regression from the previous behavior of InstallPlans where failure to detect OG/SA would requeue the InstallPlan for reconciliation, so creating the required resources before the retry limit of the informer queue was reached would transition the InstallPlan from the `Installing` phase to the `Complete` phase(unless the bundle unpacking step failed, in which case operator-framework#2093 introduced transitioning the InstallPlan to the `Failed` phase). This regression introduced oddities for users who has infra built that applies a set of manifests simultaneously to install an operator that includes a Subscription to an operator (that creates InstallPlans) along with the required OG/SAs. In those cases, whenever there was a delay in the reconciliation of the OG/SA, the InstallPlan would be transitioned to a state of permanant faliure. This PR: * Removes the logic that transitioned the InstallPlan to `Failed`. Instead, the InstallPlan will again be requeued for any reconciliation error. * Introduces logic to bubble up reconciliation error through the InstallPlan's status.Conditions, eg: When no OperatorGroup is detected: ``` conditions: - lastTransitionTime: "2021-06-23T18:16:00Z" lastUpdateTime: "2021-06-23T18:16:16Z" message: attenuated service account query failed - no operator group found that is managing this namespace reason: InstallCheckFailed status: "False" type: Installed ``` Then when a valid OperatorGroup is created: ``` conditions: - lastTransitionTime: "2021-06-23T18:33:37Z" lastUpdateTime: "2021-06-23T18:33:37Z" status: "True" type: Installed ``` Signed-off-by: Anik Bhattacharjee <[email protected]>
1 parent cd40303 commit b5633f2

File tree

4 files changed

+58
-225
lines changed

4 files changed

+58
-225
lines changed

doc/design/architecture.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,12 @@ None --> Planning +------>------->------> Installing +---> Complete
102102
| Phase | Description |
103103
|------------------|------------------------------------------------------------------------------------------------|
104104
| None | initial phase, once seen by the Operator, it is immediately transitioned to `Planning` |
105-
| Planning | dependencies between resources are being resolved, to be stored in the InstallPlan `Status` |
105+
| Planning | dependencies between resources are being resolved, to be stored in the InstallPlan `Status` |
106106
| RequiresApproval | occurs when using manual approval, will not transition phase until `approved` field is true |
107-
| Installing | resolved resources in the InstallPlan `Status` block are being created |
107+
| Installing | waiting for reconciliation of required resource(OG/SA etc), or resolved resources in the
108+
InstallPlan `Status` block are being created |
108109
| Complete | all resolved resources in the `Status` block exist |
109-
| Failed | occurs when resources fail to install or there is an invalid OperatorGroup |
110+
| Failed | occurs when resources fail to install or when bundle unpacking fails |
110111

111112
### Subscription Control Loop
112113

pkg/controller/operators/catalog/operator.go

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1387,27 +1387,23 @@ func (o *Operator) syncInstallPlans(obj interface{}) (syncError error) {
13871387

13881388
querier := o.serviceAccountQuerier.NamespaceQuerier(plan.GetNamespace())
13891389
ref, err := querier()
1390-
if err != nil {
1391-
1392-
// Retry sync if non-fatal error
1393-
if !scoped.IsOperatorGroupError(err) {
1394-
syncError = fmt.Errorf("attenuated service account query failed - %v", err)
1395-
return
1396-
}
1397-
1398-
// Mark the InstallPlan as failed for a fatal Operator Group related error
1399-
logger.Infof("attenuated service account query failed - %v", err)
1400-
ipFailError := fmt.Errorf("invalid operator group - %v", err)
14011390

1402-
if err := o.transitionInstallPlanToFailed(plan, logger, v1alpha1.InstallPlanReasonInstallCheckFailed, ipFailError.Error()); err != nil {
1403-
// retry for failure to update status
1391+
if err != nil {
1392+
// Set status condition/message and retry sync if any error
1393+
ipFailError := fmt.Errorf("attenuated service account query failed - %v", err)
1394+
logger.Infof(ipFailError.Error())
1395+
now := o.now()
1396+
out := plan.DeepCopy()
1397+
out.Status.SetCondition(v1alpha1.ConditionFailed(v1alpha1.InstallPlanInstalled,
1398+
v1alpha1.InstallPlanReasonInstallCheckFailed, ipFailError.Error(), &now))
1399+
_, err := o.client.OperatorsV1alpha1().InstallPlans(plan.GetNamespace()).UpdateStatus(context.TODO(), out, metav1.UpdateOptions{})
1400+
if err != nil {
1401+
logger = logger.WithField("updateError", err.Error())
1402+
logger.Errorf("error updating InstallPlan status")
14041403
syncError = err
14051404
return
14061405
}
1407-
1408-
// Requeue subscription to propagate SubscriptionInstallPlanFailed condtion to subscription
1409-
o.requeueSubscriptionForInstallPlan(plan, logger)
1410-
1406+
syncError = ipFailError
14111407
return
14121408
}
14131409

pkg/controller/operators/catalog/operator_test.go

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -166,12 +166,12 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
166166
)
167167

168168
tests := []struct {
169-
testName string
170-
err error
171-
in *v1alpha1.InstallPlan
172-
expectedPhase v1alpha1.InstallPlanPhase
173-
174-
clientObjs []runtime.Object
169+
testName string
170+
err error
171+
in *v1alpha1.InstallPlan
172+
expectedPhase v1alpha1.InstallPlanPhase
173+
expectedCondition *v1alpha1.InstallPlanCondition
174+
clientObjs []runtime.Object
175175
}{
176176
{
177177
testName: "NoStatus",
@@ -182,16 +182,20 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
182182
{
183183
// This checks that an installplan is marked as failed when no operatorgroup is present
184184
testName: "HasSteps/NoOperatorGroup",
185-
err: nil,
186-
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
187-
in: ipWithSteps,
185+
err: fmt.Errorf("attenuated service account query failed - no operator group found that is managing this namespace"),
186+
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
187+
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
188+
Message: "no operator group found that is managing this namespace"},
189+
in: ipWithSteps,
188190
},
189191
{
190192
// This checks that an installplan is marked as failed when multiple operator groups are present for the same namespace
191193
testName: "HasSteps/TooManyOperatorGroups",
192-
err: nil,
193-
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
194+
err: fmt.Errorf("attenuated service account query failed - more than one operator group(s) are managing this namespace count=2"),
195+
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
194196
in: ipWithSteps,
197+
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
198+
Message: "more than one operator group(s) are managing this namespace count=2"},
195199
clientObjs: []runtime.Object{
196200
operatorGroup("og1", "sa", namespace,
197201
&corev1.ObjectReference{
@@ -210,9 +214,11 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
210214
{
211215
// This checks that an installplan is marked as failed when no service account is synced for the operator group, i.e the service account ref doesn't exist
212216
testName: "HasSteps/NonExistentServiceAccount",
213-
err: nil,
214-
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
215-
in: ipWithSteps,
217+
err: fmt.Errorf("attenuated service account query failed - please make sure the service account exists. sa=sa1 operatorgroup=ns/og"),
218+
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
219+
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
220+
Message: "please make sure the service account exists. sa=sa1 operatorgroup=ns/og"},
221+
in: ipWithSteps,
216222
clientObjs: []runtime.Object{
217223
operatorGroup("og", "sa1", namespace, nil),
218224
},
@@ -236,6 +242,11 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
236242
require.NoError(t, err)
237243

238244
require.Equal(t, tt.expectedPhase, ip.Status.Phase)
245+
246+
if tt.expectedCondition != nil {
247+
require.True(t, hasExpectedCondition(ip, *tt.expectedCondition))
248+
}
249+
239250
})
240251
}
241252
}
@@ -1830,3 +1841,15 @@ func operatorGroup(ogName, saName, namespace string, saRef *corev1.ObjectReferen
18301841
},
18311842
}
18321843
}
1844+
1845+
func hasExpectedCondition(ip *v1alpha1.InstallPlan, expectedCondition v1alpha1.InstallPlanCondition) bool {
1846+
1847+
hasCondition := false
1848+
for _, cond := range ip.Status.Conditions {
1849+
if cond.Type == expectedCondition.Type && cond.Message == expectedCondition.Message && cond.Status == expectedCondition.Status {
1850+
hasCondition = true
1851+
break
1852+
}
1853+
}
1854+
return hasCondition
1855+
}

test/e2e/installplan_e2e_test.go

Lines changed: 4 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -3145,162 +3145,6 @@ var _ = Describe("Install Plan", func() {
31453145
require.Equal(GinkgoT(), 1, len(ips.Items), "If this test fails it should be taken seriously and not treated as a flake. \n%v", ips.Items)
31463146
})
31473147

3148-
It("should fail an InstallPlan when no OperatorGroup is present", func() {
3149-
3150-
ns := &corev1.Namespace{}
3151-
ns.SetName(genName("ns-"))
3152-
3153-
c := newKubeClient()
3154-
crc := newCRClient()
3155-
3156-
// Create a namespace
3157-
ns, err := c.KubernetesInterface().CoreV1().Namespaces().Create(context.TODO(), ns, metav1.CreateOptions{})
3158-
Expect(err).NotTo(HaveOccurred())
3159-
3160-
deleteOpts := &metav1.DeleteOptions{}
3161-
defer func() {
3162-
err := c.KubernetesInterface().CoreV1().Namespaces().Delete(context.TODO(), ns.GetName(), *deleteOpts)
3163-
Expect(err).NotTo(HaveOccurred())
3164-
}()
3165-
3166-
// Create InstallPlan
3167-
installPlanName := "ip"
3168-
ip := newInstallPlanWithDummySteps(installPlanName, ns.GetName(), operatorsv1alpha1.InstallPlanPhaseNone)
3169-
outIP, err := crc.OperatorsV1alpha1().InstallPlans(ns.GetName()).Create(context.TODO(), ip, metav1.CreateOptions{})
3170-
Expect(err).NotTo(HaveOccurred())
3171-
Expect(outIP).NotTo(BeNil())
3172-
3173-
// The status gets ignored on create so we need to update it else the InstallPlan sync ignores
3174-
// InstallPlans without any steps or bundle lookups
3175-
outIP.Status = ip.Status
3176-
_, err = crc.OperatorsV1alpha1().InstallPlans(ns.GetName()).UpdateStatus(context.TODO(), outIP, metav1.UpdateOptions{})
3177-
Expect(err).NotTo(HaveOccurred())
3178-
3179-
fetchedInstallPlan, err := fetchInstallPlanWithNamespace(GinkgoT(), crc, installPlanName, ns.GetName(), buildInstallPlanPhaseCheckFunc(operatorsv1alpha1.InstallPlanPhaseFailed))
3180-
Expect(err).NotTo(HaveOccurred())
3181-
Expect(fetchedInstallPlan).NotTo(BeNil())
3182-
ctx.Ctx().Logf(fmt.Sprintf("Install plan %s fetched with phase %s", fetchedInstallPlan.GetName(), fetchedInstallPlan.Status.Phase))
3183-
ctx.Ctx().Logf(fmt.Sprintf("Install plan %s fetched with conditions %+v", fetchedInstallPlan.GetName(), fetchedInstallPlan.Status.Conditions))
3184-
})
3185-
3186-
It("should fail an InstallPlan when multiple OperatorGroups are present", func() {
3187-
3188-
ns := &corev1.Namespace{}
3189-
ns.SetName(genName("ns-"))
3190-
3191-
c := newKubeClient()
3192-
crc := newCRClient()
3193-
3194-
// Create a namespace
3195-
ns, err := c.KubernetesInterface().CoreV1().Namespaces().Create(context.TODO(), ns, metav1.CreateOptions{})
3196-
Expect(err).NotTo(HaveOccurred())
3197-
deleteOpts := &metav1.DeleteOptions{}
3198-
defer func() {
3199-
err = c.KubernetesInterface().CoreV1().Namespaces().Delete(context.TODO(), ns.GetName(), *deleteOpts)
3200-
Expect(err).ToNot(HaveOccurred())
3201-
}()
3202-
3203-
// Create 2 operatorgroups in the same namespace
3204-
og1 := &operatorsv1.OperatorGroup{
3205-
ObjectMeta: metav1.ObjectMeta{
3206-
Name: "og1",
3207-
},
3208-
Spec: operatorsv1.OperatorGroupSpec{
3209-
TargetNamespaces: []string{ns.GetName()},
3210-
},
3211-
}
3212-
og2 := &operatorsv1.OperatorGroup{
3213-
ObjectMeta: metav1.ObjectMeta{
3214-
Name: "og2",
3215-
},
3216-
Spec: operatorsv1.OperatorGroupSpec{
3217-
TargetNamespaces: []string{ns.GetName()},
3218-
},
3219-
}
3220-
_, err = crc.OperatorsV1().OperatorGroups(ns.GetName()).Create(context.TODO(), og1, metav1.CreateOptions{})
3221-
Expect(err).ToNot(HaveOccurred())
3222-
_, err = crc.OperatorsV1().OperatorGroups(ns.GetName()).Create(context.TODO(), og2, metav1.CreateOptions{})
3223-
Expect(err).ToNot(HaveOccurred())
3224-
3225-
installPlanName := "ip"
3226-
ip := newInstallPlanWithDummySteps(installPlanName, ns.GetName(), operatorsv1alpha1.InstallPlanPhaseNone)
3227-
outIP, err := crc.OperatorsV1alpha1().InstallPlans(ns.GetName()).Create(context.TODO(), ip, metav1.CreateOptions{})
3228-
Expect(err).NotTo(HaveOccurred())
3229-
Expect(outIP).NotTo(BeNil())
3230-
3231-
// The status gets ignored on create so we need to update it else the InstallPlan sync ignores
3232-
// InstallPlans without any steps or bundle lookups
3233-
outIP.Status = ip.Status
3234-
_, err = crc.OperatorsV1alpha1().InstallPlans(ns.GetName()).UpdateStatus(context.TODO(), outIP, metav1.UpdateOptions{})
3235-
Expect(err).NotTo(HaveOccurred())
3236-
3237-
fetchedInstallPlan, err := fetchInstallPlanWithNamespace(GinkgoT(), crc, installPlanName, ns.GetName(), buildInstallPlanPhaseCheckFunc(operatorsv1alpha1.InstallPlanPhaseFailed))
3238-
Expect(err).ToNot(HaveOccurred())
3239-
Expect(fetchedInstallPlan).NotTo(BeNil())
3240-
ctx.Ctx().Logf(fmt.Sprintf("Install plan %s fetched with status %s", fetchedInstallPlan.GetName(), fetchedInstallPlan.Status.Phase))
3241-
ctx.Ctx().Logf(fmt.Sprintf("Install plan %s fetched with conditions %+v", fetchedInstallPlan.GetName(), fetchedInstallPlan.Status.Conditions))
3242-
})
3243-
3244-
It("should fail an InstallPlan when an OperatorGroup specifies an unsynced ServiceAccount with no ServiceAccountRef", func() {
3245-
3246-
ns := &corev1.Namespace{}
3247-
ns.SetName(genName("ns-"))
3248-
3249-
c := newKubeClient()
3250-
crc := newCRClient()
3251-
3252-
// Create a namespace
3253-
ns, err := c.KubernetesInterface().CoreV1().Namespaces().Create(context.TODO(), ns, metav1.CreateOptions{})
3254-
Expect(err).NotTo(HaveOccurred())
3255-
deleteOpts := &metav1.DeleteOptions{}
3256-
defer func() {
3257-
err = c.KubernetesInterface().CoreV1().Namespaces().Delete(context.TODO(), ns.GetName(), *deleteOpts)
3258-
Expect(err).ToNot(HaveOccurred())
3259-
}()
3260-
3261-
// OperatorGroup with serviceaccount specified
3262-
og := &operatorsv1.OperatorGroup{
3263-
ObjectMeta: metav1.ObjectMeta{
3264-
Name: "og",
3265-
},
3266-
Spec: operatorsv1.OperatorGroupSpec{
3267-
TargetNamespaces: []string{ns.GetName()},
3268-
ServiceAccountName: "foobar",
3269-
},
3270-
}
3271-
og, err = crc.OperatorsV1().OperatorGroups(ns.GetName()).Create(context.TODO(), og, metav1.CreateOptions{})
3272-
Expect(err).ToNot(HaveOccurred())
3273-
Expect(og).NotTo(BeNil())
3274-
3275-
// Update OperatorGroup status with namespace but no service account reference
3276-
now := metav1.Now()
3277-
og.Status = operatorsv1.OperatorGroupStatus{
3278-
Namespaces: []string{ns.GetName()},
3279-
ServiceAccountRef: nil,
3280-
LastUpdated: &now,
3281-
}
3282-
_, err = crc.OperatorsV1().OperatorGroups(ns.GetName()).UpdateStatus(context.TODO(), og, metav1.UpdateOptions{})
3283-
Expect(err).ToNot(HaveOccurred())
3284-
3285-
installPlanName := "ip"
3286-
ip := newInstallPlanWithDummySteps(installPlanName, ns.GetName(), operatorsv1alpha1.InstallPlanPhaseNone)
3287-
outIP, err := crc.OperatorsV1alpha1().InstallPlans(ns.GetName()).Create(context.TODO(), ip, metav1.CreateOptions{})
3288-
Expect(err).NotTo(HaveOccurred())
3289-
Expect(outIP).NotTo(BeNil())
3290-
3291-
// The status gets ignored on create so we need to update it else the InstallPlan sync ignores
3292-
// InstallPlans without any steps or bundle lookups
3293-
outIP.Status = ip.Status
3294-
_, err = crc.OperatorsV1alpha1().InstallPlans(ns.GetName()).UpdateStatus(context.TODO(), outIP, metav1.UpdateOptions{})
3295-
Expect(err).NotTo(HaveOccurred())
3296-
3297-
fetchedInstallPlan, err := fetchInstallPlanWithNamespace(GinkgoT(), crc, installPlanName, ns.GetName(), buildInstallPlanPhaseCheckFunc(operatorsv1alpha1.InstallPlanPhaseFailed))
3298-
Expect(err).NotTo(HaveOccurred())
3299-
Expect(fetchedInstallPlan).NotTo(BeNil())
3300-
ctx.Ctx().Logf(fmt.Sprintf("Install plan %s fetched with status %s", fetchedInstallPlan.GetName(), fetchedInstallPlan.Status.Phase))
3301-
ctx.Ctx().Logf(fmt.Sprintf("Install plan %s fetched with conditions %+v", fetchedInstallPlan.GetName(), fetchedInstallPlan.Status.Conditions))
3302-
})
3303-
33043148
When("waiting on the bundle unpacking job", func() {
33053149
var (
33063150
ns *corev1.Namespace
@@ -3345,7 +3189,7 @@ var _ = Describe("Install Plan", func() {
33453189
return ctx.Ctx().Client().Create(context.Background(), og)
33463190
}, timeout, interval).Should(Succeed(), "could not create OperatorGroup")
33473191

3348-
// Wait for the OperatorGroup to be synced so the InstallPlan doesn't fail due to an invalid OperatorGroup
3192+
// Wait for the OperatorGroup to be synced so the InstallPlan doesn't have to be resynced due to an invalid OperatorGroup
33493193
Eventually(
33503194
func() ([]string, error) {
33513195
err := ctx.Ctx().Client().Get(context.Background(), client.ObjectKeyFromObject(og), og)
@@ -3641,7 +3485,7 @@ var _ = Describe("Install Plan", func() {
36413485

36423486
// Wait for the OperatorGroup to be synced and have a status.ServiceAccountRef
36433487
// before moving on. Otherwise the catalog operator treats it as an invalid OperatorGroup
3644-
// and fails the InstallPlan
3488+
// and the InstallPlan is resynced
36453489
Eventually(func() (*corev1.ObjectReference, error) {
36463490
outOG, err := crc.OperatorsV1().OperatorGroups(ns.GetName()).Get(context.TODO(), og.Name, metav1.GetOptions{})
36473491
if err != nil {
@@ -3897,7 +3741,7 @@ var _ = Describe("Install Plan", func() {
38973741

38983742
// Wait for the OperatorGroup to be synced and have a status.ServiceAccountRef
38993743
// before moving on. Otherwise the catalog operator treats it as an invalid OperatorGroup
3900-
// and fails the InstallPlan
3744+
// and the InstallPlan is resynced
39013745
Eventually(func() (*corev1.ObjectReference, error) {
39023746
outOG, err := crc.OperatorsV1().OperatorGroups(ns.GetName()).Get(context.TODO(), og.Name, metav1.GetOptions{})
39033747
if err != nil {
@@ -4072,7 +3916,7 @@ func validateCRDVersions(t GinkgoTInterface, c operatorclient.ClientInterface, n
40723916

40733917
func buildInstallPlanPhaseCheckFunc(phases ...operatorsv1alpha1.InstallPlanPhase) checkInstallPlanFunc {
40743918
return func(fip *operatorsv1alpha1.InstallPlan) bool {
4075-
ctx.Ctx().Logf("installplan is %s", fip.Status.Phase)
3919+
ctx.Ctx().Logf("installplan %v is in phase %v", fip.GetName(), fip.Status.Phase)
40763920
satisfiesAny := false
40773921
for _, phase := range phases {
40783922
satisfiesAny = satisfiesAny || fip.Status.Phase == phase
@@ -4298,34 +4142,3 @@ func newCSV(name, namespace, replaces string, version semver.Version, owned []ap
42984142

42994143
return csv
43004144
}
4301-
4302-
func newInstallPlanWithDummySteps(name, namespace string, phase operatorsv1alpha1.InstallPlanPhase) *operatorsv1alpha1.InstallPlan {
4303-
return &operatorsv1alpha1.InstallPlan{
4304-
ObjectMeta: metav1.ObjectMeta{
4305-
Name: name,
4306-
Namespace: namespace,
4307-
},
4308-
Spec: operatorsv1alpha1.InstallPlanSpec{
4309-
ClusterServiceVersionNames: []string{"foobar"},
4310-
Approval: operatorsv1alpha1.ApprovalAutomatic,
4311-
Approved: true,
4312-
},
4313-
Status: operatorsv1alpha1.InstallPlanStatus{
4314-
CatalogSources: []string{"catalog"},
4315-
Phase: phase,
4316-
Plan: []*operatorsv1alpha1.Step{
4317-
{
4318-
Resource: operatorsv1alpha1.StepResource{
4319-
CatalogSource: "catalog",
4320-
CatalogSourceNamespace: namespace,
4321-
Group: "",
4322-
Version: "v1",
4323-
Kind: "Foo",
4324-
Name: "bar",
4325-
},
4326-
Status: operatorsv1alpha1.StepStatusUnknown,
4327-
},
4328-
},
4329-
},
4330-
}
4331-
}

0 commit comments

Comments
 (0)