@@ -15,7 +15,9 @@ import (
15
15
"github.com/go-logr/logr"
16
16
lru "github.com/hashicorp/golang-lru"
17
17
"github.com/prometheus/client_golang/prometheus"
18
+ corev1 "k8s.io/api/core/v1"
18
19
"sigs.k8s.io/controller-runtime/pkg/client"
20
+ "sigs.k8s.io/controller-runtime/pkg/log"
19
21
)
20
22
21
23
const (
@@ -29,6 +31,7 @@ const (
29
31
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
30
32
workspaceRestoresTotal string = "workspace_restores_total"
31
33
workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
34
+ workspaceNodeUtilization string = "workspace_node_utilization"
32
35
)
33
36
34
37
type StopReason string
@@ -58,6 +61,8 @@ type controllerMetrics struct {
58
61
workspacePhases * phaseTotalVec
59
62
timeoutSettings * timeoutSettingsVec
60
63
64
+ workspaceNodeUtilization * nodeUtilizationVec
65
+
61
66
// used to prevent recording metrics multiple times
62
67
cache * lru.Cache
63
68
}
@@ -127,9 +132,10 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
127
132
Help : "total number of workspace restore failures" ,
128
133
}, []string {"type" , "class" }),
129
134
130
- workspacePhases : newPhaseTotalVec (r ),
131
- timeoutSettings : newTimeoutSettingsVec (r ),
132
- cache : cache ,
135
+ workspacePhases : newPhaseTotalVec (r ),
136
+ timeoutSettings : newTimeoutSettingsVec (r ),
137
+ workspaceNodeUtilization : newNodeUtilizationVec (r ),
138
+ cache : cache ,
133
139
}, nil
134
140
}
135
141
@@ -298,6 +304,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
298
304
299
305
m .workspacePhases .Describe (ch )
300
306
m .timeoutSettings .Describe (ch )
307
+ m .workspaceNodeUtilization .Describe (ch )
301
308
}
302
309
303
310
// Collect implements Collector.
@@ -315,6 +322,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
315
322
316
323
m .workspacePhases .Collect (ch )
317
324
m .timeoutSettings .Collect (ch )
325
+ m .workspaceNodeUtilization .Collect (ch )
318
326
}
319
327
320
328
// phaseTotalVec returns a gauge vector counting the workspaces per phase
@@ -458,3 +466,128 @@ func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {
458
466
459
467
ch <- metric
460
468
}
469
+
470
+ // nodeUtilizationVec provides metrics per workspace node on:
471
+ // - the amount of cpu/memory requested by workspaces on the node (size of the workspace class)
472
+ // CPU is measured in cores, memory in bytes.
473
+ // Differentiates between headless and regular workspace nodes using the type label.
474
+ // Useful to determine node utilization and capacity.
475
+ type nodeUtilizationVec struct {
476
+ name string
477
+ desc * prometheus.Desc
478
+ reconciler * WorkspaceReconciler
479
+ }
480
+
481
+ func newNodeUtilizationVec (r * WorkspaceReconciler ) * nodeUtilizationVec {
482
+ name := prometheus .BuildFQName (metricsNamespace , metricsWorkspaceSubsystem , workspaceNodeUtilization )
483
+ desc := prometheus .NewDesc (
484
+ name ,
485
+ "Amount of resources requested by workspaces on the node (cpu/memory, workspace type)" ,
486
+ []string {"node" , "resource" , "type" },
487
+ prometheus .Labels (map [string ]string {}),
488
+ )
489
+ return & nodeUtilizationVec {
490
+ name : name ,
491
+ reconciler : r ,
492
+ desc : desc ,
493
+ }
494
+ }
495
+
496
+ // Describe implements Collector. It will send exactly one Desc to the provided channel.
497
+ func (n * nodeUtilizationVec ) Describe (ch chan <- * prometheus.Desc ) {
498
+ ch <- n .desc
499
+ }
500
+
501
+ // Collect implements Collector.
502
+ func (n * nodeUtilizationVec ) Collect (ch chan <- prometheus.Metric ) {
503
+ ctx , cancel := context .WithTimeout (context .Background (), kubernetesOperationTimeout )
504
+ defer cancel ()
505
+
506
+ var nodes corev1.NodeList
507
+ err := n .reconciler .List (ctx , & nodes )
508
+ if err != nil {
509
+ log .FromContext (ctx ).Error (err , "cannot list nodes for node utilization metric" )
510
+ return
511
+ }
512
+
513
+ var (
514
+ nodeUtilization = make (map [string ]map [corev1.ResourceName ]float64 )
515
+ nodeTypes = make (map [string ]string )
516
+ )
517
+ for _ , node := range nodes .Items {
518
+ isRegular := node .Labels ["gitpod.io/workload_workspace_regular" ] == "true"
519
+ isHeadless := node .Labels ["gitpod.io/workload_workspace_headless" ] == "true"
520
+ if ! isRegular && ! isHeadless {
521
+ // Ignore non-workspace nodes.
522
+ continue
523
+ }
524
+
525
+ nodeUtilization [node .Name ] = map [corev1.ResourceName ]float64 {
526
+ corev1 .ResourceCPU : 0 ,
527
+ corev1 .ResourceMemory : 0 ,
528
+ }
529
+ nodeTypes [node .Name ] = "regular"
530
+ if ! isRegular && isHeadless {
531
+ // In case a node is both regular and headless (e.g. a preview env), mark it as regular.
532
+ nodeTypes [node .Name ] = "headless"
533
+ }
534
+ }
535
+
536
+ var workspaces workspacev1.WorkspaceList
537
+ if err = n .reconciler .List (ctx , & workspaces , client .InNamespace (n .reconciler .Config .Namespace )); err != nil {
538
+ log .FromContext (ctx ).Error (err , "cannot list workspaces for node utilization metric" )
539
+ return
540
+ }
541
+
542
+ // Aggregate workspace resource requests per node.
543
+ for _ , ws := range workspaces .Items {
544
+ if ws .Status .Runtime == nil {
545
+ continue
546
+ }
547
+ nodeName := ws .Status .Runtime .NodeName
548
+ if nodeName == "" {
549
+ // Not yet scheduled.
550
+ continue
551
+ }
552
+
553
+ if ws .Status .Phase == workspacev1 .WorkspacePhaseStopped {
554
+ // Stopped, no longer consuming resources on the node.
555
+ continue
556
+ }
557
+
558
+ if _ , ok := nodeUtilization [nodeName ]; ! ok {
559
+ nodeUtilization [nodeName ] = map [corev1.ResourceName ]float64 {
560
+ corev1 .ResourceCPU : 0 ,
561
+ corev1 .ResourceMemory : 0 ,
562
+ }
563
+ }
564
+
565
+ class , ok := n .reconciler .Config .WorkspaceClasses [ws .Spec .Class ]
566
+ if ! ok {
567
+ log .FromContext (ctx ).Error (err , "cannot find workspace class for node utilization metric" , "class" , ws .Spec .Class )
568
+ continue
569
+ }
570
+
571
+ requests , err := class .Container .Requests .ResourceList ()
572
+ if err != nil {
573
+ log .FromContext (ctx ).Error (err , "cannot get resource requests for node utilization metric" , "class" , ws .Spec .Class )
574
+ continue
575
+ }
576
+
577
+ nodeUtilization [nodeName ][corev1.ResourceCPU ] += float64 (requests .Cpu ().MilliValue ()) / 1000.0
578
+ nodeUtilization [nodeName ][corev1.ResourceMemory ] += float64 (requests .Memory ().Value ())
579
+ }
580
+
581
+ for nodeName , metrics := range nodeUtilization {
582
+ for resource , value := range metrics {
583
+ nodeType := nodeTypes [nodeName ]
584
+ metric , err := prometheus .NewConstMetric (n .desc , prometheus .GaugeValue , value , nodeName , resource .String (), nodeType )
585
+ if err != nil {
586
+ log .FromContext (ctx ).Error (err , "cannot create node utilization metric" , "node" , nodeName , "resource" , resource .String (), "type" , nodeType )
587
+ continue
588
+ }
589
+
590
+ ch <- metric
591
+ }
592
+ }
593
+ }
0 commit comments