[ws-manager-mk2] Support workspace snapshots #16471

Merged (9 commits) on Feb 23, 2023

Changes from all commits
122 changes: 122 additions & 0 deletions components/ws-daemon/pkg/controller/snapshot_controller.go
@@ -0,0 +1,122 @@
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controller

import (
"context"
"fmt"

"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"

workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

// SnapshotReconciler reconciles a Snapshot object
type SnapshotReconciler struct {
client.Client
nodeName string
operations *WorkspaceOperations
}

func NewSnapshotController(c client.Client, nodeName string, wso *WorkspaceOperations) *SnapshotReconciler {
return &SnapshotReconciler{
Client: c,
nodeName: nodeName,
operations: wso,
}
}

// SetupWithManager sets up the controller with the Manager.
func (r *SnapshotReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
Named("snapshot").
For(&workspacev1.Snapshot{}).
Member:

We changed this recently for the other controllers; let's explicitly name each controller, as the name shows up in the metrics and allows us to differentiate between controllers for the same resource. (By default the controller name is the name of the resource it owns.)

Suggested change:

-		For(&workspacev1.Snapshot{}).
+		Named("snapshot").
+		For(&workspacev1.Snapshot{}).

WithEventFilter(snapshotEventFilter(r.nodeName)).
Complete(r)
}

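// snapshotEventFilter ensures that each ws-daemon instance only reconciles
// Snapshots scheduled onto its own node. Updates and deletes are ignored:
// the reconciler itself drives all status changes.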
func snapshotEventFilter(nodeName string) predicate.Predicate {
return predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
if ss, ok := e.Object.(*workspacev1.Snapshot); ok {
return ss.Spec.NodeName == nodeName
}
return false
},
UpdateFunc: func(ue event.UpdateEvent) bool {
return false
},
DeleteFunc: func(de event.DeleteEvent) bool {
return false
},
}
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=snapshots,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=snapshots/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=snapshots/finalizers,verbs=update

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/reconcile
func (ssc *SnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)

var snapshot workspacev1.Snapshot
if err := ssc.Client.Get(ctx, req.NamespacedName, &snapshot); err != nil {
return ctrl.Result{}, client.IgnoreNotFound(err)
}

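// Nothing to do if the snapshot was already taken (or already failed);
// reconciliation may be triggered again for the same, finished object.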
if snapshot.Status.Completed {
return ctrl.Result{}, nil
}

snapshotURL, snapshotName, snapshotErr := ssc.operations.SnapshotIDs(snapshot.Spec.WorkspaceID)
if snapshotErr != nil {
return ctrl.Result{}, snapshotErr
}

err := retry.RetryOnConflict(retryParams, func() error {
err := ssc.Client.Get(ctx, req.NamespacedName, &snapshot)
if err != nil {
return err
}

snapshot.Status.URL = snapshotURL
return ssc.Client.Status().Update(ctx, &snapshot)
})

if err != nil {
log.Error(err, "could not set snapshot url", "workspace", snapshot.Spec.WorkspaceID)
return ctrl.Result{}, err
}

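// Take the snapshot. A failure here is not returned to the controller;
// it is recorded on the Snapshot's status below so the resource still
// reaches Completed.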
snapshotErr = ssc.operations.TakeSnapshot(ctx, snapshot.Spec.WorkspaceID, snapshotName)
err = retry.RetryOnConflict(retryParams, func() error {
err := ssc.Client.Get(ctx, req.NamespacedName, &snapshot)
if err != nil {
return err
}

snapshot.Status.Completed = true
if snapshotErr != nil {
snapshot.Status.Error = fmt.Errorf("could not take snapshot: %w", snapshotErr).Error()
}

return ssc.Status().Update(ctx, &snapshot)
})

if err != nil {
log.Error(err, "could not set completion status for snapshot", "workspace", snapshot.Spec.WorkspaceID)
}

return ctrl.Result{}, err
Member:

Do we delete the Snapshot resource somewhere after some time? It might make sense to set a completion timestamp, and then clean up any snapshots completed e.g. > 30 mins ago.

Member (author):

The owner of the snapshot is the workspace, so the snapshot gets deleted when the workspace gets deleted (cascading delete), which I think is pretty nice. We can inspect the snapshots taken by a workspace while it is running, and we ensure that the snapshot CR does not outlive the workspace, so we never try to take a snapshot from a stopped workspace.

Member:

👍

}
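
The creating side of this protocol is not part of the diff. As a minimal sketch of what the ownership model discussed above implies, the snippet below creates a Snapshot owned by its Workspace; createSnapshot, the naming scheme, and the ws.Status.Runtime.NodeName field path are assumptions for illustration, not code from this PR:

package example

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

// createSnapshot creates a Snapshot CR for a running workspace. The owner
// reference is what makes the cascading delete described above work: when
// the Workspace object is deleted, the Snapshot is garbage-collected with it.
func createSnapshot(ctx context.Context, c client.Client, scheme *runtime.Scheme, ws *workspacev1.Workspace) (*workspacev1.Snapshot, error) {
	snapshot := &workspacev1.Snapshot{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("snapshot-%s", ws.Name), // hypothetical naming scheme
			Namespace: ws.Namespace,
		},
		Spec: workspacev1.SnapshotSpec{
			NodeName:    ws.Status.Runtime.NodeName, // assumed field path; the spec only requires the workspace's node
			WorkspaceID: ws.Name,
		},
	}
	if err := controllerutil.SetControllerReference(ws, snapshot, scheme); err != nil {
		return nil, err
	}
	if err := c.Create(ctx, snapshot); err != nil {
		return nil, err
	}
	return snapshot, nil
}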
components/ws-daemon/pkg/controller/workspace_controller.go

@@ -15,9 +15,7 @@ import (
 	csapi "github.com/gitpod-io/gitpod/content-service/api"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/container"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/content"
-	"github.com/gitpod-io/gitpod/ws-daemon/pkg/internal/session"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/iws"
-	"github.com/gitpod-io/gitpod/ws-daemon/pkg/quota"
 	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
 	"github.com/opentracing/opentracing-go"
 	"github.com/prometheus/client_golang/prometheus"
@@ -52,40 +50,18 @@ type WorkspaceControllerOpts struct {
 type WorkspaceController struct {
 	client.Client
 	NodeName   string
-	opts       *WorkspaceControllerOpts
-	operations WorkspaceOperations
+	operations *WorkspaceOperations
 	metrics    *workspaceMetrics
 }
 
-func NewWorkspaceController(c client.Client, opts WorkspaceControllerOpts) (*WorkspaceController, error) {
-	xfs, err := quota.NewXFS(opts.ContentConfig.WorkingArea)
-	if err != nil {
-		return nil, err
-	}
-	store, err := session.NewStore(context.Background(), opts.ContentConfig.WorkingArea, content.WorkspaceLifecycleHooks(
-		opts.ContentConfig,
-		func(instanceID string) bool { return true },
-		&iws.Uidmapper{Config: opts.UIDMapperConfig, Runtime: opts.ContainerRuntime},
-		xfs,
-		opts.CGroupMountPoint,
-	))
-	if err != nil {
-		return nil, err
-	}
-
+func NewWorkspaceController(c client.Client, nodeName string, ops *WorkspaceOperations, reg prometheus.Registerer) (*WorkspaceController, error) {
 	metrics := newWorkspaceMetrics()
-	opts.MetricsRegistry.Register(metrics)
-
-	ops, err := NewWorkspaceOperations(opts.ContentConfig, store, opts.MetricsRegistry)
-	if err != nil {
-		return nil, err
-	}
+	reg.Register(metrics)
 
 	return &WorkspaceController{
 		Client:     c,
-		NodeName:   opts.NodeName,
-		opts:       &opts,
-		operations: *ops,
+		NodeName:   nodeName,
+		operations: ops,
 		metrics:    metrics,
 	}, nil
 }
49 changes: 46 additions & 3 deletions components/ws-daemon/pkg/controller/workspace_operations.go
@@ -200,7 +200,7 @@ func (wso *WorkspaceOperations) DisposeWorkspace(ctx context.Context, opts DisposeOptions
 		}
 	}
 
-	err = wso.uploadWorkspaceContent(ctx, sess)
+	err = wso.uploadWorkspaceContent(ctx, sess, storage.DefaultBackup)
 	if err != nil {
 		return false, nil, xerrors.Errorf("final backup failed for workspace %s", opts.Meta.InstanceId)
 	}
@@ -227,6 +227,50 @@ func (wso *WorkspaceOperations) DisposeWorkspace(ctx context.Context, opts DisposeOptions
 	return false, repo, nil
 }
 
+func (wso *WorkspaceOperations) SnapshotIDs(workspaceID string) (snapshotUrl, snapshotName string, err error) {
+	sess := wso.store.Get(workspaceID)
+	if sess == nil {
+		return "", "", fmt.Errorf("cannot find workspace %s during SnapshotIDs", workspaceID)
+	}
+
+	baseName := fmt.Sprintf("snapshot-%d", time.Now().UnixNano())
+	snapshotName = baseName + ".tar"
+
+	rs, ok := sess.NonPersistentAttrs[session.AttrRemoteStorage].(storage.DirectAccess)
+	if rs == nil || !ok {
+		return "", "", fmt.Errorf("no remote storage configured")
+	}
+
+	return rs.Qualify(snapshotName), snapshotName, nil
+}
+
+func (wso *WorkspaceOperations) TakeSnapshot(ctx context.Context, workspaceID, snapshotName string) (err error) {
+	//nolint:ineffassign
+	span, ctx := opentracing.StartSpanFromContext(ctx, "TakeSnapshot")
+	span.SetTag("workspace", workspaceID)
+	defer tracing.FinishSpan(span, &err)
+
+	if workspaceID == "" {
+		return fmt.Errorf("workspaceID is required")
+	}
+
+	sess := wso.store.Get(workspaceID)
+	if sess == nil {
+		return fmt.Errorf("cannot find workspace %s during TakeSnapshot", workspaceID)
+	}
+
+	if sess.RemoteStorageDisabled {
+		return fmt.Errorf("workspace has no remote storage")
+	}
+
+	err = wso.uploadWorkspaceContent(ctx, sess, snapshotName)
+	if err != nil {
+		return fmt.Errorf("snapshot failed for workspace %s", workspaceID)
+	}
+
+	return nil
+}
+
 func (wso *WorkspaceOperations) uploadWorkspaceLogs(ctx context.Context, opts DisposeOptions) (err error) {
 	// currently we're only uploading prebuild log files
 	logFiles, err := logs.ListPrebuildLogFiles(ctx, opts.WorkspaceLocation)
@@ -271,8 +315,7 @@ func (wso *WorkspaceOperations) uploadWorkspaceLogs(ctx context.Context, opts DisposeOptions
 	return err
 }
 
-func (wso *WorkspaceOperations) uploadWorkspaceContent(ctx context.Context, sess *session.Workspace) error {
-	var backupName = storage.DefaultBackup
+func (wso *WorkspaceOperations) uploadWorkspaceContent(ctx context.Context, sess *session.Workspace, backupName string) error {
 	// Avoid too many simultaneous backups in order to avoid excessive memory utilization.
 	var timedOut bool
 	waitStart := time.Now()
38 changes: 30 additions & 8 deletions components/ws-daemon/pkg/daemon/daemon.go
@@ -35,8 +35,10 @@ import (
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/diskguard"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/dispatch"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/hosts"
+	"github.com/gitpod-io/gitpod/ws-daemon/pkg/internal/session"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/iws"
 	"github.com/gitpod-io/gitpod/ws-daemon/pkg/netlimit"
+	"github.com/gitpod-io/gitpod/ws-daemon/pkg/quota"
 )
 
 var (
@@ -185,21 +187,41 @@ func NewDaemon(config Config) (*Daemon, error) {
 		contentCfg.WorkingArea += config.WorkspaceController.WorkingAreaSuffix
 		contentCfg.WorkingAreaNode += config.WorkspaceController.WorkingAreaSuffix
 
-		wsctrl, err := controller.NewWorkspaceController(mgr.GetClient(), controller.WorkspaceControllerOpts{
-			NodeName:         nodename,
-			ContentConfig:    contentCfg,
-			UIDMapperConfig:  config.Uidmapper,
-			ContainerRuntime: containerRuntime,
-			CGroupMountPoint: config.CPULimit.CGroupBasePath,
-			MetricsRegistry:  wrappedReg,
-		})
+		xfs, err := quota.NewXFS(contentCfg.WorkingArea)
+		if err != nil {
+			return nil, err
+		}
+
+		store, err := session.NewStore(context.Background(), contentCfg.WorkingArea, content.WorkspaceLifecycleHooks(
+			contentCfg,
+			func(instanceID string) bool { return true },
+			&iws.Uidmapper{Config: config.Uidmapper, Runtime: containerRuntime},
+			xfs,
+			config.CPULimit.CGroupBasePath,
+		))
+		if err != nil {
+			return nil, err
+		}
+
+		workspaceOps, err := controller.NewWorkspaceOperations(contentCfg, store, wrappedReg)
+		if err != nil {
+			return nil, err
+		}
+
+		wsctrl, err := controller.NewWorkspaceController(mgr.GetClient(), nodename, workspaceOps, wrappedReg)
 		if err != nil {
 			return nil, err
 		}
 		err = wsctrl.SetupWithManager(mgr)
 		if err != nil {
 			return nil, err
 		}
+
+		ssctrl := controller.NewSnapshotController(mgr.GetClient(), nodename, workspaceOps)
+		err = ssctrl.SetupWithManager(mgr)
+		if err != nil {
+			return nil, err
+		}
 	}
 
 	dsptch, err := dispatch.NewDispatch(containerRuntime, clientset, config.Runtime.KubernetesNamespace, nodename, listener...)
68 changes: 68 additions & 0 deletions components/ws-manager-api/go/crd/v1/snapshot_types.go
@@ -0,0 +1,68 @@
// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package v1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// SnapshotSpec defines the desired state of the snapshot
type SnapshotSpec struct {
// +kubebuilder:validation:Required
NodeName string `json:"nodeName"`

// +kubebuilder:validation:Required
WorkspaceID string `json:"workspaceID"`
}

// SnapshotStatus defines the observed state of the snapshot
type SnapshotStatus struct {
// // +kubebuilder:validation:Optional
// Conditions []metav1.Condition `json:"conditions"`

// Error is the error observed during snapshot creation, if any
// +kubebuilder:validation:Optional
Error string `json:"error,omitempty"`

// URL contains the URL of the snapshot
// +kubebuilder:validation:Optional
URL string `json:"url,omitempty"`

// Completed indicates whether the snapshot operation has completed, either by taking the snapshot or through failure
// +kubebuilder:validation:Required
Completed bool `json:"completed"`
}

//+kubebuilder:object:root=true
//+kubebuilder:subresource:status
//+kubebuilder:resource:shortName=snapshot
// Custom print columns on the Custom Resource Definition. These are the columns
// showing up when doing e.g. `kubectl get snapshots`.
// Columns with priority > 0 will only show up with `-o wide`.
//+kubebuilder:printcolumn:name="Workspace",type="string",JSONPath=".spec.workspaceID"
//+kubebuilder:printcolumn:name="URL",type="string",JSONPath=".status.url",priority=10
//+kubebuilder:printcolumn:name="Completed",type="boolean",JSONPath=".status.completed"

// Snapshot is the Schema for the snapshot API
type Snapshot struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Spec SnapshotSpec `json:"spec,omitempty"`
Status SnapshotStatus `json:"status,omitempty"`
}

//+kubebuilder:object:root=true

// SnapshotList contains a list of Snapshots
type SnapshotList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
Items []Snapshot `json:"items"`
}

func init() {
SchemeBuilder.Register(&Snapshot{}, &SnapshotList{})
}
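
For completeness, a minimal consumer-side sketch of how a caller might wait for the controller above to finish, using only the fields defined in this file; waitForSnapshot and the one-second polling interval are assumptions for illustration (a real caller would more likely use a watch or informer):

package example

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

// waitForSnapshot polls a Snapshot until the ws-daemon marks it Completed,
// then returns the snapshot URL, or the error recorded on its status.
func waitForSnapshot(ctx context.Context, c client.Client, key types.NamespacedName) (string, error) {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		var ss workspacev1.Snapshot
		if err := c.Get(ctx, key, &ss); err != nil {
			return "", err
		}
		if ss.Status.Completed {
			if ss.Status.Error != "" {
				return "", fmt.Errorf("snapshot failed: %s", ss.Status.Error)
			}
			return ss.Status.URL, nil
		}

		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-ticker.C:
		}
	}
}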