Skip to content

Commit 04b9fbb

Browse files
committed
Refactor to utilization metric, rm total, add type
1 parent 542da01 commit 04b9fbb

File tree

1 file changed

+50
-50
lines changed

1 file changed

+50
-50
lines changed

components/ws-manager-mk2/controllers/metrics.go

Lines changed: 50 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ const (
3131
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
3232
workspaceRestoresTotal string = "workspace_restores_total"
3333
workspaceRestoresFailureTotal string = "workspace_restores_failure_total"
34-
workspaceNodeCapacity string = "workspace_node_capacity"
34+
workspaceNodeUtilization string = "workspace_node_utilization"
3535
)
3636

3737
type StopReason string
@@ -61,7 +61,7 @@ type controllerMetrics struct {
6161
workspacePhases *phaseTotalVec
6262
timeoutSettings *timeoutSettingsVec
6363

64-
workspaceNodeCapacity *nodeCapacityVec
64+
workspaceNodeUtilization *nodeUtilizationVec
6565

6666
// used to prevent recording metrics multiple times
6767
cache *lru.Cache
@@ -132,10 +132,10 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
132132
Help: "total number of workspace restore failures",
133133
}, []string{"type", "class"}),
134134

135-
workspacePhases: newPhaseTotalVec(r),
136-
timeoutSettings: newTimeoutSettingsVec(r),
137-
workspaceNodeCapacity: newNodeCapacityVec(r),
138-
cache: cache,
135+
workspacePhases: newPhaseTotalVec(r),
136+
timeoutSettings: newTimeoutSettingsVec(r),
137+
workspaceNodeUtilization: newNodeUtilizationVec(r),
138+
cache: cache,
139139
}, nil
140140
}
141141

@@ -304,7 +304,7 @@ func (m *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
304304

305305
m.workspacePhases.Describe(ch)
306306
m.timeoutSettings.Describe(ch)
307-
m.workspaceNodeCapacity.Describe(ch)
307+
m.workspaceNodeUtilization.Describe(ch)
308308
}
309309

310310
// Collect implements Collector.
@@ -322,7 +322,7 @@ func (m *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
322322

323323
m.workspacePhases.Collect(ch)
324324
m.timeoutSettings.Collect(ch)
325-
m.workspaceNodeCapacity.Collect(ch)
325+
m.workspaceNodeUtilization.Collect(ch)
326326
}
327327

328328
// phaseTotalVec returns a gauge vector counting the workspaces per phase
@@ -467,80 +467,79 @@ func (m *maintenanceEnabledGauge) Collect(ch chan<- prometheus.Metric) {
467467
ch <- metric
468468
}
469469

470-
type nodeCapacityVec struct {
470+
// nodeUtilizationVec provides metrics per workspace node on:
471+
// - the amount of cpu/memory requested by workspaces on the node (summed from the resource requests of each workspace's class, not measured usage)
472+
// CPU is measured in cores, memory in bytes.
473+
// Differentiates between headless and regular workspace nodes using the type label.
474+
// Useful to determine node utilization and capacity.
475+
type nodeUtilizationVec struct {
471476
name string
472477
desc *prometheus.Desc
473478
reconciler *WorkspaceReconciler
474479
}
475480

476-
func newNodeCapacityVec(r *WorkspaceReconciler) *nodeCapacityVec {
477-
name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceNodeCapacity)
481+
func newNodeUtilizationVec(r *WorkspaceReconciler) *nodeUtilizationVec {
482+
name := prometheus.BuildFQName(metricsNamespace, metricsWorkspaceSubsystem, workspaceNodeUtilization)
478483
desc := prometheus.NewDesc(
479484
name,
480-
"Amount of resource capacity on the node (cpu/memory, metric for `total` on the node vs `requested` by workspaces)",
481-
[]string{"node", "resource", "metric"},
485+
"Amount of resources requested by workspaces on the node (cpu/memory, workspace type)",
486+
[]string{"node", "resource", "type"},
482487
prometheus.Labels(map[string]string{}),
483488
)
484-
return &nodeCapacityVec{
489+
return &nodeUtilizationVec{
485490
name: name,
486491
reconciler: r,
487492
desc: desc,
488493
}
489494
}
490495

491496
// Describe implements Collector. It will send exactly one Desc to the provided channel.
492-
func (n *nodeCapacityVec) Describe(ch chan<- *prometheus.Desc) {
497+
func (n *nodeUtilizationVec) Describe(ch chan<- *prometheus.Desc) {
493498
ch <- n.desc
494499
}
495500

496501
// Collect implements Collector.
497-
func (n *nodeCapacityVec) Collect(ch chan<- prometheus.Metric) {
502+
func (n *nodeUtilizationVec) Collect(ch chan<- prometheus.Metric) {
498503
ctx, cancel := context.WithTimeout(context.Background(), kubernetesOperationTimeout)
499504
defer cancel()
500505

501506
var nodes corev1.NodeList
502507
err := n.reconciler.List(ctx, &nodes)
503508
if err != nil {
504-
log.FromContext(ctx).Error(err, "cannot list nodes for node capacity metric")
509+
log.FromContext(ctx).Error(err, "cannot list nodes for node utilization metric")
505510
return
506511
}
507512

508-
nodeMap := make(map[string]corev1.Node)
513+
var (
514+
nodeUtilization = make(map[string]map[corev1.ResourceName]float64)
515+
nodeTypes = make(map[string]string)
516+
)
509517
for _, node := range nodes.Items {
510-
// Only collect metrics for workspace nodes.
511-
if node.Labels["gitpod.io/workload_workspace_regular"] != "true" && node.Labels["gitpod.io/workload_workspace_headless"] != "true" {
518+
isRegular := node.Labels["gitpod.io/workload_workspace_regular"] == "true"
519+
isHeadless := node.Labels["gitpod.io/workload_workspace_headless"] == "true"
520+
if !isRegular && !isHeadless {
521+
// Ignore non-workspace nodes.
512522
continue
513523
}
514524

515-
nodeMap[node.Name] = node
516-
517-
// Record node total capacity.
518-
for _, resource := range []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory} {
519-
capacity := node.Status.Capacity[resource]
520-
var value int64
521-
if resource == corev1.ResourceCPU {
522-
value = capacity.MilliValue()
523-
} else {
524-
value = capacity.Value()
525-
}
526-
metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, float64(value), node.Name, resource.String(), "total")
527-
if err != nil {
528-
log.FromContext(ctx).Error(err, "cannot create node capacity metric", "node", node.Name, "resource", resource.String(), "metric", "total")
529-
continue
530-
}
531-
532-
ch <- metric
525+
nodeUtilization[node.Name] = map[corev1.ResourceName]float64{
526+
corev1.ResourceCPU: 0,
527+
corev1.ResourceMemory: 0,
528+
}
529+
nodeTypes[node.Name] = "regular"
530+
if !isRegular && isHeadless {
531+
// Headless-only node. A node that is both regular and headless (e.g. a preview env) keeps the "regular" type set above.
532+
nodeTypes[node.Name] = "headless"
533533
}
534534
}
535535

536536
var workspaces workspacev1.WorkspaceList
537537
if err = n.reconciler.List(ctx, &workspaces, client.InNamespace(n.reconciler.Config.Namespace)); err != nil {
538-
log.FromContext(ctx).Error(err, "cannot list workspaces for node capacity metric")
538+
log.FromContext(ctx).Error(err, "cannot list workspaces for node utilization metric")
539539
return
540540
}
541541

542-
// we're only interested in the total capacity of the node
543-
nodeCapacity := make(map[string]map[corev1.ResourceName]int64)
542+
// Aggregate workspace resource requests per node.
544543
for _, ws := range workspaces.Items {
545544
if ws.Status.Runtime == nil {
546545
continue
@@ -556,34 +555,35 @@ func (n *nodeCapacityVec) Collect(ch chan<- prometheus.Metric) {
556555
continue
557556
}
558557

559-
if _, ok := nodeCapacity[nodeName]; !ok {
560-
nodeCapacity[nodeName] = map[corev1.ResourceName]int64{
558+
if _, ok := nodeUtilization[nodeName]; !ok {
559+
nodeUtilization[nodeName] = map[corev1.ResourceName]float64{
561560
corev1.ResourceCPU: 0,
562561
corev1.ResourceMemory: 0,
563562
}
564563
}
565564

566565
class, ok := n.reconciler.Config.WorkspaceClasses[ws.Spec.Class]
567566
if !ok {
568-
log.FromContext(ctx).Error(err, "cannot find workspace class for node capacity metric", "class", ws.Spec.Class)
567+
log.FromContext(ctx).Error(err, "cannot find workspace class for node utilization metric", "class", ws.Spec.Class)
569568
continue
570569
}
571570

572571
requests, err := class.Container.Requests.ResourceList()
573572
if err != nil {
574-
log.FromContext(ctx).Error(err, "cannot get resource requests for node capacity metric", "class", ws.Spec.Class)
573+
log.FromContext(ctx).Error(err, "cannot get resource requests for node utilization metric", "class", ws.Spec.Class)
575574
continue
576575
}
577576

578-
nodeCapacity[nodeName][corev1.ResourceCPU] += requests.Cpu().MilliValue()
579-
nodeCapacity[nodeName][corev1.ResourceMemory] += requests.Memory().Value()
577+
nodeUtilization[nodeName][corev1.ResourceCPU] += float64(requests.Cpu().MilliValue()) / 1000.0
578+
nodeUtilization[nodeName][corev1.ResourceMemory] += float64(requests.Memory().Value())
580579
}
581580

582-
for nodeName, metrics := range nodeCapacity {
581+
for nodeName, metrics := range nodeUtilization {
583582
for resource, value := range metrics {
584-
metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, float64(value), nodeName, resource.String(), "requested")
583+
nodeType := nodeTypes[nodeName]
584+
metric, err := prometheus.NewConstMetric(n.desc, prometheus.GaugeValue, value, nodeName, resource.String(), nodeType)
585585
if err != nil {
586-
log.FromContext(ctx).Error(err, "cannot create node capacity metric", "node", nodeName, "resource", resource.String(), "metric", "requested")
586+
log.FromContext(ctx).Error(err, "cannot create node utilization metric", "node", nodeName, "resource", resource.String(), "type", nodeType)
587587
continue
588588
}
589589

0 commit comments

Comments
 (0)