@@ -31,7 +31,7 @@ const (
31
31
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
32
32
workspaceRestoresTotal string = "workspace_restores_total"
33
33
workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
34
- workspaceNodeCapacity string = "workspace_node_capacity "
34
+ workspaceNodeUtilization string = "workspace_node_utilization "
35
35
)
36
36
37
37
type StopReason string
@@ -61,7 +61,7 @@ type controllerMetrics struct {
61
61
workspacePhases * phaseTotalVec
62
62
timeoutSettings * timeoutSettingsVec
63
63
64
- workspaceNodeCapacity * nodeCapacityVec
64
+ workspaceNodeUtilization * nodeUtilizationVec
65
65
66
66
// used to prevent recording metrics multiple times
67
67
cache * lru.Cache
@@ -132,10 +132,10 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
132
132
Help : "total number of workspace restore failures" ,
133
133
}, []string {"type" , "class" }),
134
134
135
- workspacePhases : newPhaseTotalVec (r ),
136
- timeoutSettings : newTimeoutSettingsVec (r ),
137
- workspaceNodeCapacity : newNodeCapacityVec (r ),
138
- cache : cache ,
135
+ workspacePhases : newPhaseTotalVec (r ),
136
+ timeoutSettings : newTimeoutSettingsVec (r ),
137
+ workspaceNodeUtilization : newNodeUtilizationVec (r ),
138
+ cache : cache ,
139
139
}, nil
140
140
}
141
141
@@ -304,7 +304,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
304
304
305
305
m .workspacePhases .Describe (ch )
306
306
m .timeoutSettings .Describe (ch )
307
- m .workspaceNodeCapacity .Describe (ch )
307
+ m .workspaceNodeUtilization .Describe (ch )
308
308
}
309
309
310
310
// Collect implements Collector.
@@ -322,7 +322,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
322
322
323
323
m .workspacePhases .Collect (ch )
324
324
m .timeoutSettings .Collect (ch )
325
- m .workspaceNodeCapacity .Collect (ch )
325
+ m .workspaceNodeUtilization .Collect (ch )
326
326
}
327
327
328
328
// phaseTotalVec returns a gauge vector counting the workspaces per phase
@@ -467,80 +467,79 @@ func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {
467
467
ch <- metric
468
468
}
469
469
470
- type nodeCapacityVec struct {
470
+ // nodeUtilizationVec provides metrics per workspace node on:
471
+ // - the amount of cpu/memory requested by workspaces on the node (size of the workspace class)
472
+ // CPU is measured in cores, memory in bytes.
473
+ // Differentiates between headless and regular workspace nodes using the type label.
474
+ // Useful to determine node utilization and capacity.
475
+ type nodeUtilizationVec struct {
471
476
name string
472
477
desc * prometheus.Desc
473
478
reconciler * WorkspaceReconciler
474
479
}
475
480
476
- func newNodeCapacityVec (r * WorkspaceReconciler ) * nodeCapacityVec {
477
- name := prometheus .BuildFQName (metricsNamespace , metricsWorkspaceSubsystem , workspaceNodeCapacity )
481
+ func newNodeUtilizationVec (r * WorkspaceReconciler ) * nodeUtilizationVec {
482
+ name := prometheus .BuildFQName (metricsNamespace , metricsWorkspaceSubsystem , workspaceNodeUtilization )
478
483
desc := prometheus .NewDesc (
479
484
name ,
480
- "Amount of resource capacity on the node (cpu/memory, metric for `total` on the node vs `requested` by workspaces )" ,
481
- []string {"node" , "resource" , "metric " },
485
+ "Amount of resources requested by workspaces on the node (cpu/memory, workspace type )" ,
486
+ []string {"node" , "resource" , "type " },
482
487
prometheus .Labels (map [string ]string {}),
483
488
)
484
- return & nodeCapacityVec {
489
+ return & nodeUtilizationVec {
485
490
name : name ,
486
491
reconciler : r ,
487
492
desc : desc ,
488
493
}
489
494
}
490
495
491
496
// Describe implements Collector. It will send exactly one Desc to the provided channel.
492
- func (n * nodeCapacityVec ) Describe (ch chan <- * prometheus.Desc ) {
497
+ func (n * nodeUtilizationVec ) Describe (ch chan <- * prometheus.Desc ) {
493
498
ch <- n .desc
494
499
}
495
500
496
501
// Collect implements Collector.
497
- func (n * nodeCapacityVec ) Collect (ch chan <- prometheus.Metric ) {
502
+ func (n * nodeUtilizationVec ) Collect (ch chan <- prometheus.Metric ) {
498
503
ctx , cancel := context .WithTimeout (context .Background (), kubernetesOperationTimeout )
499
504
defer cancel ()
500
505
501
506
var nodes corev1.NodeList
502
507
err := n .reconciler .List (ctx , & nodes )
503
508
if err != nil {
504
- log .FromContext (ctx ).Error (err , "cannot list nodes for node capacity metric" )
509
+ log .FromContext (ctx ).Error (err , "cannot list nodes for node utilization metric" )
505
510
return
506
511
}
507
512
508
- nodeMap := make (map [string ]corev1.Node )
513
+ var (
514
+ nodeUtilization = make (map [string ]map [corev1.ResourceName ]float64 )
515
+ nodeTypes = make (map [string ]string )
516
+ )
509
517
for _ , node := range nodes .Items {
510
- // Only collect metrics for workspace nodes.
511
- if node .Labels ["gitpod.io/workload_workspace_regular" ] != "true" && node .Labels ["gitpod.io/workload_workspace_headless" ] != "true" {
518
+ isRegular := node .Labels ["gitpod.io/workload_workspace_regular" ] == "true"
519
+ isHeadless := node .Labels ["gitpod.io/workload_workspace_headless" ] == "true"
520
+ if ! isRegular && ! isHeadless {
521
+ // Ignore non-workspace nodes.
512
522
continue
513
523
}
514
524
515
- nodeMap [node .Name ] = node
516
-
517
- // Record node total capacity.
518
- for _ , resource := range []corev1.ResourceName {corev1 .ResourceCPU , corev1 .ResourceMemory } {
519
- capacity := node .Status .Capacity [resource ]
520
- var value int64
521
- if resource == corev1 .ResourceCPU {
522
- value = capacity .MilliValue ()
523
- } else {
524
- value = capacity .Value ()
525
- }
526
- metric , err := prometheus .NewConstMetric (n .desc , prometheus .GaugeValue , float64 (value ), node .Name , resource .String (), "total" )
527
- if err != nil {
528
- log .FromContext (ctx ).Error (err , "cannot create node capacity metric" , "node" , node .Name , "resource" , resource .String (), "metric" , "total" )
529
- continue
530
- }
531
-
532
- ch <- metric
525
+ nodeUtilization [node .Name ] = map [corev1.ResourceName ]float64 {
526
+ corev1 .ResourceCPU : 0 ,
527
+ corev1 .ResourceMemory : 0 ,
528
+ }
529
+ nodeTypes [node .Name ] = "regular"
530
+ if ! isRegular && isHeadless {
531
+ // Only mark the node as headless if it is not also regular; a node that is both (e.g. a preview env) stays marked as regular.
532
+ nodeTypes [node .Name ] = "headless"
533
533
}
534
534
}
535
535
536
536
var workspaces workspacev1.WorkspaceList
537
537
if err = n .reconciler .List (ctx , & workspaces , client .InNamespace (n .reconciler .Config .Namespace )); err != nil {
538
- log .FromContext (ctx ).Error (err , "cannot list workspaces for node capacity metric" )
538
+ log .FromContext (ctx ).Error (err , "cannot list workspaces for node utilization metric" )
539
539
return
540
540
}
541
541
542
- // we're only interested in the total capacity of the node
543
- nodeCapacity := make (map [string ]map [corev1.ResourceName ]int64 )
542
+ // Aggregate workspace resource requests per node.
544
543
for _ , ws := range workspaces .Items {
545
544
if ws .Status .Runtime == nil {
546
545
continue
@@ -556,34 +555,35 @@ func (n *nodeCapacityVec) Collect(ch chan<- prometheus.Metric) {
556
555
continue
557
556
}
558
557
559
- if _ , ok := nodeCapacity [nodeName ]; ! ok {
560
- nodeCapacity [nodeName ] = map [corev1.ResourceName ]int64 {
558
+ if _ , ok := nodeUtilization [nodeName ]; ! ok {
559
+ nodeUtilization [nodeName ] = map [corev1.ResourceName ]float64 {
561
560
corev1 .ResourceCPU : 0 ,
562
561
corev1 .ResourceMemory : 0 ,
563
562
}
564
563
}
565
564
566
565
class , ok := n .reconciler .Config .WorkspaceClasses [ws .Spec .Class ]
567
566
if ! ok {
568
- log .FromContext (ctx ).Error (err , "cannot find workspace class for node capacity metric" , "class" , ws .Spec .Class )
567
+ log .FromContext (ctx ).Error (err , "cannot find workspace class for node utilization metric" , "class" , ws .Spec .Class )
569
568
continue
570
569
}
571
570
572
571
requests , err := class .Container .Requests .ResourceList ()
573
572
if err != nil {
574
- log .FromContext (ctx ).Error (err , "cannot get resource requests for node capacity metric" , "class" , ws .Spec .Class )
573
+ log .FromContext (ctx ).Error (err , "cannot get resource requests for node utilization metric" , "class" , ws .Spec .Class )
575
574
continue
576
575
}
577
576
578
- nodeCapacity [nodeName ][corev1.ResourceCPU ] += requests .Cpu ().MilliValue ()
579
- nodeCapacity [nodeName ][corev1.ResourceMemory ] += requests .Memory ().Value ()
577
+ nodeUtilization [nodeName ][corev1.ResourceCPU ] += float64 ( requests .Cpu ().MilliValue ()) / 1000.0
578
+ nodeUtilization [nodeName ][corev1.ResourceMemory ] += float64 ( requests .Memory ().Value () )
580
579
}
581
580
582
- for nodeName , metrics := range nodeCapacity {
581
+ for nodeName , metrics := range nodeUtilization {
583
582
for resource , value := range metrics {
584
- metric , err := prometheus .NewConstMetric (n .desc , prometheus .GaugeValue , float64 (value ), nodeName , resource .String (), "requested" )
583
+ nodeType := nodeTypes [nodeName ]
584
+ metric , err := prometheus .NewConstMetric (n .desc , prometheus .GaugeValue , value , nodeName , resource .String (), nodeType )
585
585
if err != nil {
586
- log .FromContext (ctx ).Error (err , "cannot create node capacity metric" , "node" , nodeName , "resource" , resource .String (), "metric " , "requested" )
586
+ log .FromContext (ctx ).Error (err , "cannot create node utilization metric" , "node" , nodeName , "resource" , resource .String (), "type " , nodeType )
587
587
continue
588
588
}
589
589
0 commit comments