Skip to content

Commit f9e6a63

Browse files
authored
[ws-manager-mk2] Add node utilization metrics (#19105)
* [ws-manager-mk2] Node capacity metrics * CPU use millis, fix stopped, collect * Refactor to utilization metric, rm total, add type
1 parent 9737e97 commit f9e6a63

File tree

1 file changed

+136
-3
lines changed

1 file changed

+136
-3
lines changed

components/ws-manager-mk2/controllers/metrics.go

Lines changed: 136 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ import (
1515
"github.com/go-logr/logr"
1616
lru "github.com/hashicorp/golang-lru"
1717
"github.com/prometheus/client_golang/prometheus"
18+
corev1 "k8s.io/api/core/v1"
1819
"sigs.k8s.io/controller-runtime/pkg/client"
20+
"sigs.k8s.io/controller-runtime/pkg/log"
1921
)
2022

2123
const (
@@ -29,6 +31,7 @@ const (
2931
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
3032
workspaceRestoresTotal string = "workspace_restores_total"
3133
workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
34+
workspaceNodeUtilization string = "workspace_node_utilization"
3235
)
3336

3437
type StopReason string
@@ -58,6 +61,8 @@ type controllerMetrics struct {
5861
workspacePhases *phaseTotalVec
5962
timeoutSettings *timeoutSettingsVec
6063

64+
workspaceNodeUtilization *nodeUtilizationVec
65+
6166
// used to prevent recording metrics multiple times
6267
cache *lru.Cache
6368
}
@@ -127,9 +132,10 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
127132
Help: "total number of workspace restore failures",
128133
}, []string{"type", "class"}),
129134

130-
workspacePhases: newPhaseTotalVec(r),
131-
timeoutSettings: newTimeoutSettingsVec(r),
132-
cache: cache,
135+
workspacePhases: newPhaseTotalVec(r),
136+
timeoutSettings: newTimeoutSettingsVec(r),
137+
workspaceNodeUtilization: newNodeUtilizationVec(r),
138+
cache: cache,
133139
}, nil
134140
}
135141

@@ -298,6 +304,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
298304

299305
m.workspacePhases.Describe(ch)
300306
m.timeoutSettings.Describe(ch)
307+
m.workspaceNodeUtilization.Describe(ch)
301308
}
302309

303310
// Collect implements Collector.
@@ -315,6 +322,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
315322

316323
m.workspacePhases.Collect(ch)
317324
m.timeoutSettings.Collect(ch)
325+
m.workspaceNodeUtilization.Collect(ch)
318326
}
319327

320328
// phaseTotalVec returns a gauge vector counting the workspaces per phase
@@ -458,3 +466,128 @@ func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {
458466

459467
ch <- metric
460468
}
469+
470+
// nodeUtilizationVec provides metrics per workspace node on:
471+
// - the amount of cpu/memory requested by workspaces on the node (size of the workspace class)
472+
// CPU is measured in cores, memory in bytes.
473+
// Differentiates between headless and regular workspace nodes using the type label.
474+
// Useful to determine node utilization and capacity.
475+
type nodeUtilizationVec struct {
476+
name string
477+
desc *prometheus.Desc
478+
reconciler *WorkspaceReconciler
479+
}
480+
481+
func newNodeUtilizationVec(r *WorkspaceReconciler) *nodeUtilizationVec {
482+
name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceNodeUtilization)
483+
desc := prometheus.NewDesc(
484+
name,
485+
"Amount of resources requested by workspaces on the node (cpu/memory, workspace type)",
486+
[]string{"node", "resource", "type"},
487+
prometheus.Labels(map[string]string{}),
488+
)
489+
return &nodeUtilizationVec{
490+
name: name,
491+
reconciler: r,
492+
desc: desc,
493+
}
494+
}
495+
496+
// Describe implements Collector. It will send exactly one Desc to the provided channel.
497+
func (n *nodeUtilizationVec) Describe(ch chan<- *prometheus.Desc) {
498+
ch <- n.desc
499+
}
500+
501+
// Collect implements Collector.
502+
func (n *nodeUtilizationVec) Collect(ch chan<- prometheus.Metric) {
503+
ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
504+
defer cancel()
505+
506+
var nodes corev1.NodeList
507+
err := n.reconciler.List(ctx, &nodes)
508+
if err != nil {
509+
log.FromContext(ctx).Error(err, "cannot list nodes for node utilization metric")
510+
return
511+
}
512+
513+
var (
514+
nodeUtilization = make(map[string]map[corev1.ResourceName]float64)
515+
nodeTypes = make(map[string]string)
516+
)
517+
for _, node := range nodes.Items {
518+
isRegular := node.Labels["gitpod.io/workload_workspace_regular"] == "true"
519+
isHeadless := node.Labels["gitpod.io/workload_workspace_headless"] == "true"
520+
if !isRegular && !isHeadless {
521+
// Ignore non-workspace nodes.
522+
continue
523+
}
524+
525+
nodeUtilization[node.Name] = map[corev1.ResourceName]float64{
526+
corev1.ResourceCPU: 0,
527+
corev1.ResourceMemory: 0,
528+
}
529+
nodeTypes[node.Name] = "regular"
530+
if !isRegular && isHeadless {
531+
// In case a node is both regular and headless (e.g. a preview env), mark it as regular.
532+
nodeTypes[node.Name] = "headless"
533+
}
534+
}
535+
536+
var workspaces workspacev1.WorkspaceList
537+
if err = n.reconciler.List(ctx, &workspaces, client.InNamespace(n.reconciler.Config.Namespace)); err != nil {
538+
log.FromContext(ctx).Error(err, "cannot list workspaces for node utilization metric")
539+
return
540+
}
541+
542+
// Aggregate workspace resource requests per node.
543+
for _, ws := range workspaces.Items {
544+
if ws.Status.Runtime == nil {
545+
continue
546+
}
547+
nodeName := ws.Status.Runtime.NodeName
548+
if nodeName == "" {
549+
// Not yet scheduled.
550+
continue
551+
}
552+
553+
if ws.Status.Phase == workspacev1.WorkspacePhaseStopped {
554+
// Stopped, no longer consuming resources on the node.
555+
continue
556+
}
557+
558+
if _, ok := nodeUtilization[nodeName]; !ok {
559+
nodeUtilization[nodeName] = map[corev1.ResourceName]float64{
560+
corev1.ResourceCPU: 0,
561+
corev1.ResourceMemory: 0,
562+
}
563+
}
564+
565+
class, ok := n.reconciler.Config.WorkspaceClasses[ws.Spec.Class]
566+
if !ok {
567+
log.FromContext(ctx).Error(err, "cannot find workspace class for node utilization metric", "class", ws.Spec.Class)
568+
continue
569+
}
570+
571+
requests, err := class.Container.Requests.ResourceList()
572+
if err != nil {
573+
log.FromContext(ctx).Error(err, "cannot get resource requests for node utilization metric", "class", ws.Spec.Class)
574+
continue
575+
}
576+
577+
nodeUtilization[nodeName][corev1.ResourceCPU] += float64(requests.Cpu().MilliValue()) / 1000.0
578+
nodeUtilization[nodeName][corev1.ResourceMemory] += float64(requests.Memory().Value())
579+
}
580+
581+
for nodeName, metrics := range nodeUtilization {
582+
for resource, value := range metrics {
583+
nodeType := nodeTypes[nodeName]
584+
metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, value, nodeName, resource.String(), nodeType)
585+
if err != nil {
586+
log.FromContext(ctx).Error(err, "cannot create node utilization metric", "node", nodeName, "resource", resource.String(), "type", nodeType)
587+
continue
588+
}
589+
590+
ch <- metric
591+
}
592+
}
593+
}

0 commit comments

Comments
 (0)