Bug 1859178: fix(installplans): GC older installplans #1669

Merged
77 changes: 77 additions & 0 deletions pkg/controller/operators/catalog/operator.go
@@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"reflect"
"sort"
"strings"
"sync"
"time"
@@ -22,6 +23,7 @@ import (
extinf "k8s.io/apiextensions-apiserver/pkg/client/informers/externalversions"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
utilclock "k8s.io/apimachinery/pkg/util/clock"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
@@ -67,6 +69,8 @@ const (
roleKind = "Role"
roleBindingKind = "RoleBinding"
generatedByKey = "olm.generated-by"
maxInstallPlanCount = 5
maxDeletesPerSweep = 5
)

// Operator represents a Kubernetes operator that executes InstallPlans by
@@ -777,6 +781,8 @@ func (o *Operator) syncResolvingNamespace(obj interface{}) error {
"id": queueinformer.NewLoopID(),
})

o.gcInstallPlans(logger, namespace)

// get the set of sources that should be used for resolution and best-effort get their connections working
logger.Debug("resolving sources")

@@ -1180,6 +1186,77 @@ func (o *Operator) unpackBundles(plan *v1alpha1.InstallPlan) (bool, *v1alpha1.In
return unpacked, out, nil
}

// gcInstallPlans garbage collects installplans that are too old
// installplans are ownerrefd to all subscription inputs, so they will not otherwise
// be GCd unless all inputs have been deleted.
func (o *Operator) gcInstallPlans(log logrus.FieldLogger, namespace string) {
allIps, err := o.lister.OperatorsV1alpha1().InstallPlanLister().InstallPlans(namespace).List(labels.Everything())
if err != nil {
log.Warn("unable to list installplans for GC")
}

if len(allIps) <= maxInstallPlanCount {
return
}

// we only consider maxDeletesPerSweep more than the allowed number of installplans for delete at one time
ips := allIps
if len(ips) > maxInstallPlanCount+maxDeletesPerSweep {
ips = allIps[:maxInstallPlanCount+maxDeletesPerSweep]
Review comment (Member): So here we limit the number of extra installplans removed to 8 at a time? Makes sense, but for the clusters with 1000s of installplans they could be waiting a while. I was thinking of a more async process that goes through and deletes these extra IPs, eventually reporting status back to OLM. It's definitely more work though, and this is an urgent bugfix.

Review comment (Member): I don't fully understand the slicing with indices here. Is this array ordered/sorted? Otherwise, why just randomly pick a bunch of InstallPlans from the first 10 in the array?

Review comment (Member): Also, there are a few clusters with InstallPlans on the order of 1000s; I know one of them has at least 1200. Given the sync period of 15 minutes (though in reality the namespace sync happens more often due to resource changes) and the limit of 5 IPs per sweep, would this take a bit too much time to clean up, especially in the case of an idle cluster? In fact, I would prefer to keep the InstallPlans that are currently referenced in the Subscriptions and clean up the others at a higher rate. Maybe this can be changed in the future instead of now, given the urgent need of getting this in.
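Editor's note, a rough back-of-the-envelope illustration of the concern above, assuming a sweep only happens on the 15-minute resync (which understates the real sync rate):

(1200 - maxInstallPlanCount) / maxDeletesPerSweep = (1200 - 5) / 5 = 239 sweeps
239 sweeps x 15 min/sweep ≈ 3,585 min ≈ 60 hours

In practice the namespace syncs more often than the resync period, so cleanup would finish sooner than this worst case.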

Review comment (@njhale, Jul 23, 2020), replying to the slicing question above: IIUC, for each sweep we take up to maxDeletesPerSweep InstallPlans to delete and leave the rest for the next sweep. We're essentially only sorting a chunk of the list on each sweep, and after X sweeps we'll have sorted the whole thing.

}

byGen := map[int][]*v1alpha1.InstallPlan{}
for _, ip := range ips {
gen, ok := byGen[ip.Spec.Generation]
if !ok {
gen = make([]*v1alpha1.InstallPlan, 0)
}
byGen[ip.Spec.Generation] = append(gen, ip)
}

gens := make([]int, 0)
for i := range byGen {
gens = append(gens, i)
}

sort.Ints(gens)

toDelete := make([]*v1alpha1.InstallPlan, 0)

for _, i := range gens {
g := byGen[i]

if len(ips)-len(toDelete) <= maxInstallPlanCount {
break
}

// if removing all installplans at this generation doesn't dip below the max, safe to delete all of them
if len(ips)-len(toDelete)-len(g) >= maxInstallPlanCount {
toDelete = append(toDelete, g...)
continue
}

// CreationTimestamp sorting shouldn't ever be hit unless there is a bug that causes installplans to be
// generated without bumping the generation. It is here as a safeguard only.

// sort by creation time
sort.Slice(g, func(i, j int) bool {
if !g[i].CreationTimestamp.Equal(&g[j].CreationTimestamp) {
return g[i].CreationTimestamp.Before(&g[j].CreationTimestamp)
}
// final fallback to lexicographic sort, in case many installplans are created with the same timestamp
return g[i].GetName() < g[j].GetName()
})
toDelete = append(toDelete, g[:len(ips)-len(toDelete)-maxInstallPlanCount]...)
}

for _, i := range toDelete {
if err := o.client.OperatorsV1alpha1().InstallPlans(namespace).Delete(context.TODO(), i.GetName(), metav1.DeleteOptions{}); err != nil {
Review comment (Member): I'm thinking we want to set the DeleteOption here?

Review comment (@exdx, Jul 22, 2020): If the InstallPlan has no dependents then an Orphan PropagationPolicy might be more efficient?
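Editor's note, a minimal sketch of what that suggestion could look like, using the metav1 package (k8s.io/apimachinery/pkg/apis/meta/v1) already imported in this file; illustrative only, not part of the PR:

// Illustrative only: orphan any dependents rather than cascading the delete.
orphan := metav1.DeletePropagationOrphan
opts := metav1.DeleteOptions{PropagationPolicy: &orphan}
// The Delete call above would then take opts instead of the empty metav1.DeleteOptions{}.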

log.WithField("deleting", i.GetName()).WithError(err).Warn("error GCing old installplan - may have already been deleted")
}
}
}

func (o *Operator) syncInstallPlans(obj interface{}) (syncError error) {
plan, ok := obj.(*v1alpha1.InstallPlan)
if !ok {
82 changes: 81 additions & 1 deletion pkg/controller/operators/catalog/operator_test.go
@@ -6,8 +6,11 @@ import (
"errors"
"fmt"
"io/ioutil"
"math/rand"
"reflect"
"strings"
"testing"
"testing/quick"
"time"

"github.com/sirupsen/logrus"
@@ -28,6 +31,7 @@ import (
"k8s.io/apimachinery/pkg/types"
utilclock "k8s.io/apimachinery/pkg/util/clock"
utilyaml "k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/apiserver/pkg/storage/names"
fakedynamic "k8s.io/client-go/dynamic/fake"
"k8s.io/client-go/informers"
k8sfake "k8s.io/client-go/kubernetes/fake"
@@ -168,7 +172,7 @@ func TestSyncInstallPlanUnhappy(t *testing.T) {
},
},
),
err: fmt.Errorf("attenuated service account query failed - no operator group found that is managing this namespace"),
err: fmt.Errorf("attenuated service account query failed - no operator group found that is managing this namespace"),
},
}

@@ -186,6 +190,82 @@
}
}

type ipSet []v1alpha1.InstallPlan

func (ipSet) Generate(rand *rand.Rand, size int) reflect.Value {
ips := []v1alpha1.InstallPlan{}

// each i is the generation value
for i := 0; i < rand.Intn(size)+1; i++ {

// generate a few at each generation to account for bugs that don't increment the generation
for j := 0; j < rand.Intn(3); j++ {
ips = append(ips, v1alpha1.InstallPlan{
ObjectMeta: metav1.ObjectMeta{
Namespace: "ns",
Name: names.SimpleNameGenerator.GenerateName(fmt.Sprintf("%d", i)),
},
Spec: v1alpha1.InstallPlanSpec{
Generation: i,
},
})
}
}
return reflect.ValueOf(ipSet(ips))
}

func TestGCInstallPlans(t *testing.T) {
f := func(ips ipSet) bool {
if len(ips) == 0 {
return true
}
ctx, cancel := context.WithCancel(context.TODO())
defer cancel()

var maxGen int64 = 0
for _, i := range ips {
if g := i.Generation; g > maxGen {
maxGen = g
}
}
objs := make([]runtime.Object, 0)
for _, i := range ips {
objs = append(objs, i.DeepCopy())
}
op, err := NewFakeOperator(ctx, "ns", []string{"ns"}, withClientObjs(objs...))
require.NoError(t, err)

out := make([]v1alpha1.InstallPlan, 0)
for {
op.gcInstallPlans(logrus.New(), "ns")
require.NoError(t, err)

outList, err := op.client.OperatorsV1alpha1().InstallPlans("ns").List(ctx, metav1.ListOptions{})
require.NoError(t, err)
out = outList.Items

if len(out) <= maxInstallPlanCount {
break
}
}

keptMax := false
for _, o := range out {
if o.Generation == maxGen {
keptMax = true
break
}
}
require.True(t, keptMax)

if len(ips) < maxInstallPlanCount {
return len(out) == len(ips)
}
return len(out) == maxInstallPlanCount
}
require.NoError(t, quick.Check(f, nil))
}

func TestExecutePlan(t *testing.T) {
namespace := "ns"
