Skip to content

📖 document exposed metrics #811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 48 additions & 35 deletions pkg/metrics/client_go_adapter.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,78 +29,91 @@ import (
// that client-go registers metrics. We copy the names and formats
// from Kubernetes so that we match the core controllers.

// RestClientSubsystem is the Prometheus subsystem under which the rest
// client metrics are grouped.
const RestClientSubsystem = "rest_client"

// Metric name keys used by the rest client, relative to RestClientSubsystem.
const (
	// LatencyKey names the request latency histogram.
	LatencyKey = "request_latency_seconds"
	// ResultKey names the HTTP request counter.
	ResultKey = "requests_total"
)

// ReflectorSubsystem is the Prometheus subsystem under which the reflector
// metrics are grouped.
const ReflectorSubsystem = "reflector"

// Metric name keys used by the reflectors, relative to ReflectorSubsystem.
const (
	// Keys for metrics describing API list calls.
	ListsTotalKey    = "lists_total"
	ListsDurationKey = "list_duration_seconds"
	ItemsPerListKey  = "items_per_list"

	// Keys for metrics describing API watch calls.
	WatchesTotalKey      = "watches_total"
	ShortWatchesTotalKey = "short_watches_total"
	WatchDurationKey     = "watch_duration_seconds"
	ItemsPerWatchKey     = "items_per_watch"

	// LastResourceVersionKey names the gauge holding the last resource
	// version seen.
	LastResourceVersionKey = "last_resource_version"
)

var (
// client metrics

requestLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_request_latency_seconds",
Help: "Request latency in seconds. Broken down by verb and URL.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
},
[]string{"verb", "url"},
)

requestResult = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "rest_client_requests_total",
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
},
[]string{"code", "method", "host"},
)
requestLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: RestClientSubsystem,
Name: LatencyKey,
Help: "Request latency in seconds. Broken down by verb and URL.",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
}, []string{"verb", "url"})

requestResult = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: RestClientSubsystem,
Name: ResultKey,
Help: "Number of HTTP requests, partitioned by status code, method, and host.",
}, []string{"code", "method", "host"})

// reflector metrics

// TODO(directxman12): update these to be histograms once the metrics overhaul KEP
// PRs start landing.

reflectorSubsystem = "reflector"

listsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: reflectorSubsystem,
Name: "lists_total",
Subsystem: ReflectorSubsystem,
Name: ListsTotalKey,
Help: "Total number of API lists done by the reflectors",
}, []string{"name"})

listsDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Subsystem: reflectorSubsystem,
Name: "list_duration_seconds",
Subsystem: ReflectorSubsystem,
Name: ListsDurationKey,
Help: "How long an API list takes to return and decode for the reflectors",
}, []string{"name"})

itemsPerList = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Subsystem: reflectorSubsystem,
Name: "items_per_list",
Subsystem: ReflectorSubsystem,
Name: ItemsPerListKey,
Help: "How many items an API list returns to the reflectors",
}, []string{"name"})

watchesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: reflectorSubsystem,
Name: "watches_total",
Subsystem: ReflectorSubsystem,
Name: WatchesTotalKey,
Help: "Total number of API watches done by the reflectors",
}, []string{"name"})

shortWatchesTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: reflectorSubsystem,
Name: "short_watches_total",
Subsystem: ReflectorSubsystem,
Name: ShortWatchesTotalKey,
Help: "Total number of short API watches done by the reflectors",
}, []string{"name"})

watchDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Subsystem: reflectorSubsystem,
Name: "watch_duration_seconds",
Subsystem: ReflectorSubsystem,
Name: WatchDurationKey,
Help: "How long an API watch takes to return and decode for the reflectors",
}, []string{"name"})

itemsPerWatch = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Subsystem: reflectorSubsystem,
Name: "items_per_watch",
Subsystem: ReflectorSubsystem,
Name: ItemsPerWatchKey,
Help: "How many items an API watch returns to the reflectors",
}, []string{"name"})

lastResourceVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: reflectorSubsystem,
Name: "last_resource_version",
Subsystem: ReflectorSubsystem,
Name: LastResourceVersionKey,
Help: "Last resource version seen for the reflectors",
}, []string{"name"})
)
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/listener.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ package metrics
import (
"fmt"
"net"

logf "sigs.k8s.io/controller-runtime/pkg/internal/log"
)

// log is the logger for this package: the controller-runtime RuntimeLog
// scoped with the name "metrics".
var log = logf.RuntimeLog.WithName("metrics")

// DefaultBindAddress is the default bind address for the metrics listener.
// The metrics listener is enabled by default.
var DefaultBindAddress = ":8080"
Expand Down
162 changes: 84 additions & 78 deletions pkg/metrics/workqueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,106 +19,112 @@ package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/util/workqueue"
logf "sigs.k8s.io/controller-runtime/pkg/internal/log"
)

var log = logf.RuntimeLog.WithName("metrics")

// This file is copied and adapted from k8s.io/kubernetes/pkg/util/workqueue/prometheus
// which registers metrics to the default prometheus Registry. We require very
// similar functionality, but must register metrics to a different Registry.

// WorkQueueSubsystem is the Prometheus subsystem under which the workqueue
// metrics are grouped.
const WorkQueueSubsystem = "workqueue"

// Metric name keys used by the workqueue, relative to WorkQueueSubsystem.
const (
	// DepthKey names the current queue depth gauge.
	DepthKey = "depth"
	// AddsKey names the counter of adds handled by the queue.
	AddsKey = "adds_total"
	// QueueLatencyKey names the histogram of time items wait in the queue.
	QueueLatencyKey = "queue_duration_seconds"
	// WorkDurationKey names the histogram of item processing time.
	WorkDurationKey = "work_duration_seconds"
	// UnfinishedWorkKey names the gauge of in-progress work seconds.
	UnfinishedWorkKey = "unfinished_work_seconds"
	// LongestRunningProcessorKey names the gauge of the longest running
	// processor's run time.
	LongestRunningProcessorKey = "longest_running_processor_seconds"
	// RetriesKey names the counter of retries handled by the queue.
	RetriesKey = "retries_total"
)

// Collectors backing the workqueue metrics. Every collector is a vector
// partitioned by a single "name" label and lives under WorkQueueSubsystem.

// depth reports the current depth of a workqueue.
var depth = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      DepthKey,
		Help:      "Current depth of workqueue",
	},
	[]string{"name"},
)

// adds counts the adds handled by a workqueue.
var adds = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      AddsKey,
		Help:      "Total number of adds handled by workqueue",
	},
	[]string{"name"},
)

// latency observes how long an item waits in a workqueue before it is
// requested. Buckets are exponential: 10 buckets starting at 10e-9 seconds,
// each 10x the previous one.
var latency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      QueueLatencyKey,
		Help:      "How long in seconds an item stays in workqueue before being requested",
		Buckets:   prometheus.ExponentialBuckets(10e-9, 10, 10),
	},
	[]string{"name"},
)

// workDuration observes how long processing an item takes. It uses the same
// exponential bucket layout as latency.
var workDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      WorkDurationKey,
		Help:      "How long in seconds processing an item from workqueue takes.",
		Buckets:   prometheus.ExponentialBuckets(10e-9, 10, 10),
	},
	[]string{"name"},
)

// unfinished reports the seconds of in-progress work not yet observed by
// workDuration.
var unfinished = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      UnfinishedWorkKey,
		Help: "How many seconds of work has been done that " +
			"is in progress and hasn't been observed by work_duration. Large " +
			"values indicate stuck threads. One can deduce the number of stuck " +
			"threads by observing the rate at which this increases.",
	},
	[]string{"name"},
)

// longestRunningProcessor reports how long the longest running processor of
// a workqueue has been running, in seconds.
var longestRunningProcessor = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      LongestRunningProcessorKey,
		Help: "How many seconds has the longest running " +
			"processor for workqueue been running.",
	},
	[]string{"name"},
)

// retries counts the retries handled by a workqueue.
var retries = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      RetriesKey,
		Help:      "Total number of retries handled by workqueue",
	},
	[]string{"name"},
)

func init() {
workqueue.SetProvider(workqueueMetricsProvider{})
}
Registry.MustRegister(depth)
Registry.MustRegister(adds)
Registry.MustRegister(latency)
Registry.MustRegister(workDuration)
Registry.MustRegister(unfinished)
Registry.MustRegister(longestRunningProcessor)
Registry.MustRegister(retries)

func registerWorkqueueMetric(c prometheus.Collector, name, queue string) {
if err := Registry.Register(c); err != nil {
log.Error(err, "failed to register metric", "name", name, "queue", queue)
}
workqueue.SetProvider(workqueueMetricsProvider{})
}

type workqueueMetricsProvider struct{}

func (workqueueMetricsProvider) NewDepthMetric(queue string) workqueue.GaugeMetric {
const name = "workqueue_depth"
m := prometheus.NewGauge(prometheus.GaugeOpts{
Name: name,
Help: "Current depth of workqueue",
ConstLabels: prometheus.Labels{"name": queue},
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
return depth.WithLabelValues(name)
}

func (workqueueMetricsProvider) NewAddsMetric(queue string) workqueue.CounterMetric {
const name = "workqueue_adds_total"
m := prometheus.NewCounter(prometheus.CounterOpts{
Name: name,
Help: "Total number of adds handled by workqueue",
ConstLabels: prometheus.Labels{"name": queue},
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric {
return adds.WithLabelValues(name)
}

func (workqueueMetricsProvider) NewLatencyMetric(queue string) workqueue.HistogramMetric {
const name = "workqueue_queue_duration_seconds"
m := prometheus.NewHistogram(prometheus.HistogramOpts{
Name: name,
Help: "How long in seconds an item stays in workqueue before being requested.",
ConstLabels: prometheus.Labels{"name": queue},
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
return latency.WithLabelValues(name)
}

func (workqueueMetricsProvider) NewWorkDurationMetric(queue string) workqueue.HistogramMetric {
const name = "workqueue_work_duration_seconds"
m := prometheus.NewHistogram(prometheus.HistogramOpts{
Name: name,
Help: "How long in seconds processing an item from workqueue takes.",
ConstLabels: prometheus.Labels{"name": queue},
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 10),
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
return workDuration.WithLabelValues(name)
}

func (workqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(queue string) workqueue.SettableGaugeMetric {
const name = "workqueue_unfinished_work_seconds"
m := prometheus.NewGauge(prometheus.GaugeOpts{
Name: name,
Help: "How many seconds of work has done that " +
"is in progress and hasn't been observed by work_duration. Large " +
"values indicate stuck threads. One can deduce the number of stuck " +
"threads by observing the rate at which this increases.",
ConstLabels: prometheus.Labels{"name": queue},
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
return unfinished.WithLabelValues(name)
}

func (workqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(queue string) workqueue.SettableGaugeMetric {
const name = "workqueue_longest_running_processor_seconds"
m := prometheus.NewGauge(prometheus.GaugeOpts{
Name: name,
Help: "How many seconds has the longest running " +
"processor for workqueue been running.",
ConstLabels: prometheus.Labels{"name": queue},
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
return longestRunningProcessor.WithLabelValues(name)
}

func (workqueueMetricsProvider) NewRetriesMetric(queue string) workqueue.CounterMetric {
const name = "workqueue_retries_total"
m := prometheus.NewCounter(prometheus.CounterOpts{
Name: name,
Help: "Total number of retries handled by workqueue",
ConstLabels: prometheus.Labels{"name": queue},
})
registerWorkqueueMetric(m, name, queue)
return m
func (workqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
return retries.WithLabelValues(name)
}