Skip to content

(fix)InstallPlan: Do not tranisition IP to failed on OG/SA failure #2215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions doc/design/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,12 @@ None --> Planning +------>------->------> Installing +---> Complete
| Phase | Description |
|------------------|------------------------------------------------------------------------------------------------|
| None | initial phase, once seen by the Operator, it is immediately transitioned to `Planning` |
| Planning | dependencies between resources are being resolved, to be stored in the InstallPlan `Status` |
| Planning | dependencies between resources are being resolved, to be stored in the InstallPlan `Status` |
| RequiresApproval | occurs when using manual approval, will not transition phase until `approved` field is true |
| Installing | resolved resources in the InstallPlan `Status` block are being created |
| Installing | waiting for reconciliation of required resource(OG/SA etc), or resolved resources in the
InstallPlan `Status` block are being created |
| Complete | all resolved resources in the `Status` block exist |
| Failed | occurs when resources fail to install or there is an invalid OperatorGroup |
| Failed | occurs when resources fail to install or when bundle unpacking fails |

### Subscription Control Loop

Expand Down
54 changes: 37 additions & 17 deletions pkg/controller/operators/catalog/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1387,28 +1387,36 @@ func (o *Operator) syncInstallPlans(obj interface{}) (syncError error) {

querier := o.serviceAccountQuerier.NamespaceQuerier(plan.GetNamespace())
ref, err := querier()
if err != nil {

// Retry sync if non-fatal error
if !scoped.IsOperatorGroupError(err) {
syncError = fmt.Errorf("attenuated service account query failed - %v", err)
return
}

// Mark the InstallPlan as failed for a fatal Operator Group related error
logger.Infof("attenuated service account query failed - %v", err)
ipFailError := fmt.Errorf("invalid operator group - %v", err)

if err := o.transitionInstallPlanToFailed(plan, logger, v1alpha1.InstallPlanReasonInstallCheckFailed, ipFailError.Error()); err != nil {
// retry for failure to update status
out := plan.DeepCopy()
if err != nil {
// Set status condition/message and retry sync if any error
ipFailError := fmt.Errorf("attenuated service account query failed - %v", err)
logger.Infof(ipFailError.Error())
_, err := o.setInstallPlanInstalledCond(out, v1alpha1.InstallPlanReasonInstallCheckFailed, err.Error(), logger)
if err != nil {
syncError = err
return
}

// Requeue subscription to propagate SubscriptionInstallPlanFailed condtion to subscription
o.requeueSubscriptionForInstallPlan(plan, logger)

syncError = ipFailError
return
} else {
// reset condition/message if it had been set in previous sync. This condition is being reset since any delay in the next steps
// (bundle unpacking/plan step errors being retried for a duration) could lead to this condition sticking around, even after
// the serviceAccountQuerier returns no error since the error has been resolved (by creating the required resources), which would
// be confusing to the user
Comment on lines +1404 to +1407
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably add a unit test case to test this path and check that this holds, i.e pass an IP with the condtionInstalling=false and reason InstallPlanInstalledCheckFailed and check that the condition gets reset with an OG present.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added an e2e test for this instead since an e2e test feels more natural for this.


// NOTE: this makes the assumption that the InstallPlanInstalledCheckFailed reason is only set in the previous if clause, which is
// true in the current iteration of the catalog operator. Any future implementation change that aims at setting the reason as
// InstallPlanInstalledCheckFailed must make sure that either this assumption is not breached, or the condition being set elsewhere
// is not being unset here unintentionally.
if cond := out.Status.GetCondition(v1alpha1.InstallPlanInstalled); cond.Reason == v1alpha1.InstallPlanReasonInstallCheckFailed {
plan, err = o.setInstallPlanInstalledCond(out, v1alpha1.InstallPlanConditionReason(corev1.ConditionUnknown), "", logger)
if err != nil {
syncError = err
return
}
}
}

if ref != nil {
Expand Down Expand Up @@ -1553,6 +1561,18 @@ func (o *Operator) requeueSubscriptionForInstallPlan(plan *v1alpha1.InstallPlan,
}
}

func (o *Operator) setInstallPlanInstalledCond(ip *v1alpha1.InstallPlan, reason v1alpha1.InstallPlanConditionReason, message string, logger *logrus.Entry) (*v1alpha1.InstallPlan, error) {
now := o.now()
ip.Status.SetCondition(v1alpha1.ConditionFailed(v1alpha1.InstallPlanInstalled, reason, message, &now))
outIP, err := o.client.OperatorsV1alpha1().InstallPlans(ip.GetNamespace()).UpdateStatus(context.TODO(), ip, metav1.UpdateOptions{})
if err != nil {
logger = logger.WithField("updateError", err.Error())
logger.Errorf("error updating InstallPlan status")
return nil, nil
}
return outIP, nil
}

type installPlanTransitioner interface {
ExecutePlan(*v1alpha1.InstallPlan) error
}
Expand Down
54 changes: 37 additions & 17 deletions pkg/controller/operators/catalog/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
)

tests := []struct {
testName string
err error
in *v1alpha1.InstallPlan
expectedPhase v1alpha1.InstallPlanPhase

clientObjs []runtime.Object
testName string
err error
in *v1alpha1.InstallPlan
expectedPhase v1alpha1.InstallPlanPhase
expectedCondition *v1alpha1.InstallPlanCondition
clientObjs []runtime.Object
}{
{
testName: "NoStatus",
Expand All @@ -180,18 +180,22 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
in: installPlan("p", namespace, v1alpha1.InstallPlanPhaseNone),
},
{
// This checks that an installplan is marked as failed when no operatorgroup is present
// This checks that an installplan's status.Condition contains a condition with error message when no operatorgroup is present
testName: "HasSteps/NoOperatorGroup",
err: nil,
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
in: ipWithSteps,
err: fmt.Errorf("attenuated service account query failed - no operator group found that is managing this namespace"),
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
Message: "no operator group found that is managing this namespace"},
in: ipWithSteps,
},
{
// This checks that an installplan is marked as failed when multiple operator groups are present for the same namespace
// This checks that an installplan's status.Condition contains a condition with error message when multiple operator groups are present for the same namespace
testName: "HasSteps/TooManyOperatorGroups",
err: nil,
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
err: fmt.Errorf("attenuated service account query failed - more than one operator group(s) are managing this namespace count=2"),
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
in: ipWithSteps,
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
Message: "more than one operator group(s) are managing this namespace count=2"},
clientObjs: []runtime.Object{
operatorGroup("og1", "sa", namespace,
&corev1.ObjectReference{
Expand All @@ -208,11 +212,13 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
},
},
{
// This checks that an installplan is marked as failed when no service account is synced for the operator group, i.e the service account ref doesn't exist
// This checks that an installplan's status.Condition contains a condition with error message when no service account is synced for the operator group, i.e the service account ref doesn't exist
testName: "HasSteps/NonExistentServiceAccount",
err: nil,
expectedPhase: v1alpha1.InstallPlanPhaseFailed,
in: ipWithSteps,
err: fmt.Errorf("attenuated service account query failed - please make sure the service account exists. sa=sa1 operatorgroup=ns/og"),
expectedPhase: v1alpha1.InstallPlanPhaseInstalling,
expectedCondition: &v1alpha1.InstallPlanCondition{Type: v1alpha1.InstallPlanInstalled, Status: corev1.ConditionFalse, Reason: v1alpha1.InstallPlanReasonInstallCheckFailed,
Message: "please make sure the service account exists. sa=sa1 operatorgroup=ns/og"},
in: ipWithSteps,
clientObjs: []runtime.Object{
operatorGroup("og", "sa1", namespace, nil),
},
Expand All @@ -236,6 +242,11 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
require.NoError(t, err)

require.Equal(t, tt.expectedPhase, ip.Status.Phase)

if tt.expectedCondition != nil {
require.True(t, hasExpectedCondition(ip, *tt.expectedCondition))
}

})
}
}
Expand Down Expand Up @@ -1830,3 +1841,12 @@ func operatorGroup(ogName, saName, namespace string, saRef *corev1.ObjectReferen
},
}
}

func hasExpectedCondition(ip *v1alpha1.InstallPlan, expectedCondition v1alpha1.InstallPlanCondition) bool {
for _, cond := range ip.Status.Conditions {
if cond.Type == expectedCondition.Type && cond.Message == expectedCondition.Message && cond.Status == expectedCondition.Status {
return true
}
}
return false
}
Loading