Skip to content

Commit 01239e5

Browse files
committed
(fix)InstallPlan: Do not tranisition IP to failed on OG/SA failure
In #2077, a new phase `Failed` was introduced for InstallPlans, and failure in detecting a valid OperatorGroup(OG) or a Service Account(SA) for the namespace the InstallPlan was being created in would transition the InstallPlan to the `Failed` state, i.e failure to detected these resources when the InstallPlan was reconciled the first time was considered a permanant failure. This is a regression from the previous behavior of InstallPlans where failure to detect OG/SA would requeue the InstallPlan for reconciliation, so creating the required resources before the retry limit of the informer queue was reached would transition the InstallPlan from the `Installing` phase to the `Complete` phase(unless the bundle unpacking step failed, in which case #2093 introduced transitioning the InstallPlan to the `Failed` phase). This regression introduced oddities for users who has infra built that applies a set of manifests simultaneously to install an operator that includes a Subscription to an operator (that creates InstallPlans) along with the required OG/SAs. In those cases, whenever there was a delay in the reconciliation of the OG/SA, the InstallPlan would be transitioned to a state of permanant faliure. This PR: * Removes the logic that transitioned the InstallPlan to `Failed`. Instead, the InstallPlan will again be requeued for any reconciliation error. * Introduces logic to bubble up reconciliation error through the InstallPlan's status.Conditions, eg: When no OperatorGroup is detected: ``` conditions: - lastTransitionTime: "2021-06-23T18:16:00Z" lastUpdateTime: "2021-06-23T18:16:16Z" message: attenuated service account query failed - no operator group found that is managing this namespace reason: InstallCheckFailed status: "False" type: Installed ``` Then when a valid OperatorGroup is created: ``` conditions: - lastTransitionTime: "2021-06-23T18:33:37Z" lastUpdateTime: "2021-06-23T18:33:37Z" status: "True" type: Installed ``` Signed-off-by: Anik Bhattacharjee <[email protected]>
1 parent cd40303 commit 01239e5

File tree

4 files changed

+167
-183
lines changed

4 files changed

+167
-183
lines changed

doc/design/architecture.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,12 @@ None --> Planning +------>------->------> Installing +---> Complete
102102
| Phase | Description |
103103
|------------------|------------------------------------------------------------------------------------------------|
104104
| None | initial phase, once seen by the Operator, it is immediately transitioned to `Planning` |
105-
| Planning | dependencies between resources are being resolved, to be stored in the InstallPlan `Status` |
105+
| Planning | dependencies between resources are being resolved, to be stored in the InstallPlan `Status` |
106106
| RequiresApproval | occurs when using manual approval, will not transition phase until `approved` field is true |
107-
| Installing | resolved resources in the InstallPlan `Status` block are being created |
107+
| Installing | waiting for reconciliation of required resource(OG/SA etc), or resolved resources in the
108+
InstallPlan `Status` block are being created |
108109
| Complete | all resolved resources in the `Status` block exist |
109-
| Failed | occurs when resources fail to install or there is an invalid OperatorGroup |
110+
| Failed | occurs when resources fail to install or when bundle unpacking fails |
110111

111112
### Subscription Control Loop
112113

pkg/controller/operators/catalog/operator.go

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1387,28 +1387,36 @@ func (o *Operator) syncInstallPlans(obj interface{}) (syncError error) {
13871387

13881388
querier := o.serviceAccountQuerier.NamespaceQuerier(plan.GetNamespace())
13891389
ref, err := querier()
1390-
if err != nil {
1391-
1392-
// Retry sync if non-fatal error
1393-
if !scoped.IsOperatorGroupError(err) {
1394-
syncError = fmt.Errorf("attenuated service account query failed - %v", err)
1395-
return
1396-
}
1397-
1398-
// Mark the InstallPlan as failed for a fatal Operator Group related error
1399-
logger.Infof("attenuated service account query failed - %v", err)
1400-
ipFailError := fmt.Errorf("invalid operator group - %v", err)
14011390

1402-
if err := o.transitionInstallPlanToFailed(plan, logger, v1alpha1.InstallPlanReasonInstallCheckFailed, ipFailError.Error()); err != nil {
1403-
// retry for failure to update status
1391+
out := plan.DeepCopy()
1392+
if err != nil {
1393+
// Set status condition/message and retry sync if any error
1394+
ipFailError := fmt.Errorf("attenuated service account query failed - %v", err)
1395+
logger.Infof(ipFailError.Error())
1396+
err := o.setInstallPlanInstalledCond(out, v1alpha1.InstallPlanReasonInstallCheckFailed, err.Error(), logger)
1397+
if err != nil {
14041398
syncError = err
14051399
return
14061400
}
1407-
1408-
// Requeue subscription to propagate SubscriptionInstallPlanFailed condtion to subscription
1409-
o.requeueSubscriptionForInstallPlan(plan, logger)
1410-
1401+
syncError = ipFailError
14111402
return
1403+
} else {
1404+
// reset condition/message if it had been set in previous sync. This condition is being reset since any delay in the next steps
1405+
// (bundle unpacking/plan step errors being retried for a duration) could lead to this condition sticking around, even after
1406+
// the serviceAccountQuerier returns no error since the error has been resolved (by creating the required resources), which would
1407+
// be confusing to the user
1408+
1409+
// NOTE: this makes the assumption that the InstallPlanInstalledCheckFailed reason is only set in the previous if clause, which is
1410+
// true in the current iteration of the catalog operator. Any future implementation change that aims at setting the reason as
1411+
// InstallPlanInstalledCheckFailed must make sure that either this assumption is not breached, or the condition being set elsewhere
1412+
// is not being unset here unintentionally.
1413+
if cond := out.Status.GetCondition(v1alpha1.InstallPlanInstalled); cond.Reason == v1alpha1.InstallPlanReasonInstallCheckFailed {
1414+
err := o.setInstallPlanInstalledCond(out, v1alpha1.InstallPlanConditionReason(corev1.ConditionUnknown), "", logger)
1415+
if err != nil {
1416+
syncError = err
1417+
return
1418+
}
1419+
}
14121420
}
14131421

14141422
if ref != nil {
@@ -1553,6 +1561,18 @@ func (o *Operator) requeueSubscriptionForInstallPlan(plan *v1alpha1.InstallPlan,
15531561
}
15541562
}
15551563

1564+
func (o *Operator) setInstallPlanInstalledCond(ip *v1alpha1.InstallPlan, reason v1alpha1.InstallPlanConditionReason, message string, logger *logrus.Entry) error {
1565+
now := o.now()
1566+
ip.Status.SetCondition(v1alpha1.ConditionFailed(v1alpha1.InstallPlanInstalled, reason, message, &now))
1567+
_, err := o.client.OperatorsV1alpha1().InstallPlans(ip.GetNamespace()).UpdateStatus(context.TODO(), ip, metav1.UpdateOptions{})
1568+
if err != nil {
1569+
logger = logger.WithField("updateError", err.Error())
1570+
logger.Errorf("error updating InstallPlan status")
1571+
return err
1572+
}
1573+
return nil
1574+
}
1575+
15561576
type installPlanTransitioner interface {
15571577
ExecutePlan(*v1alpha1.InstallPlan) error
15581578
}

pkg/controller/operators/catalog/operator_test.go

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -166,12 +166,12 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
166166
)
167167

168168
tests := []struct {
169-
testName string
170-
err error
171-
in *v1alpha1.InstallPlan
172-
expectedPhase v1alpha1.InstallPlanPhase
173-
174-
clientObjs []runtime.Object
169+
testName string
170+
err error
171+
in *v1alpha1.InstallPlan
172+
expectedPhase v1alpha1.InstallPlanPhase
173+
expectedCondition *v1alpha1.InstallPlanCondition
174+
clientObjs []runtime.Object
175175
}{
176176
{
177177
testName: "NoStatus",
@@ -180,18 +180,22 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
180180
in: installPlan("p", namespace, v1alpha1.InstallPlanPhaseNone),
181181
},
182182
{
183-
// This checks that an installplan is marked as failed when no operatorgroup is present
183+
// This checks that an installplan's status.Condition contains a condition with error message when no operatorgroup is present
184184
testName: "HasSteps/NoOperatorGroup",
185-
err: nil,
186-
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
187-
in: ipWithSteps,
185+
err: fmt.Errorf("attenuated service account query failed - no operator group found that is managing this namespace"),
186+
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
187+
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
188+
Message: "no operator group found that is managing this namespace"},
189+
in: ipWithSteps,
188190
},
189191
{
190-
// This checks that an installplan is marked as failed when multiple operator groups are present for the same namespace
192+
// This checks that an installplan's status.Condition contains a condition with error message when multiple operator groups are present for the same namespace
191193
testName: "HasSteps/TooManyOperatorGroups",
192-
err: nil,
193-
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
194+
err: fmt.Errorf("attenuated service account query failed - more than one operator group(s) are managing this namespace count=2"),
195+
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
194196
in: ipWithSteps,
197+
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
198+
Message: "more than one operator group(s) are managing this namespace count=2"},
195199
clientObjs: []runtime.Object{
196200
operatorGroup("og1", "sa", namespace,
197201
&corev1.ObjectReference{
@@ -208,11 +212,13 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
208212
},
209213
},
210214
{
211-
// This checks that an installplan is marked as failed when no service account is synced for the operator group, i.e the service account ref doesn't exist
215+
// This checks that an installplan's status.Condition contains a condition with error message when no service account is synced for the operator group, i.e the service account ref doesn't exist
212216
testName: "HasSteps/NonExistentServiceAccount",
213-
err: nil,
214-
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
215-
in: ipWithSteps,
217+
err: fmt.Errorf("attenuated service account query failed - please make sure the service account exists. sa=sa1 operatorgroup=ns/og"),
218+
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
219+
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
220+
Message: "please make sure the service account exists. sa=sa1 operatorgroup=ns/og"},
221+
in: ipWithSteps,
216222
clientObjs: []runtime.Object{
217223
operatorGroup("og", "sa1", namespace, nil),
218224
},
@@ -236,6 +242,11 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
236242
require.NoError(t, err)
237243

238244
require.Equal(t, tt.expectedPhase, ip.Status.Phase)
245+
246+
if tt.expectedCondition != nil {
247+
require.True(t, hasExpectedCondition(ip, *tt.expectedCondition))
248+
}
249+
239250
})
240251
}
241252
}
@@ -1830,3 +1841,12 @@ func operatorGroup(ogName, saName, namespace string, saRef *corev1.ObjectReferen
18301841
},
18311842
}
18321843
}
1844+
1845+
func hasExpectedCondition(ip *v1alpha1.InstallPlan, expectedCondition v1alpha1.InstallPlanCondition) bool {
1846+
for _, cond := range ip.Status.Conditions {
1847+
if cond.Type == expectedCondition.Type && cond.Message == expectedCondition.Message && cond.Status == expectedCondition.Status {
1848+
return true
1849+
}
1850+
}
1851+
return false
1852+
}

0 commit comments

Comments
 (0)