Skip to content

Commit 2676ea6

Browse files
committed
Auto merge of #3829 - pietroalbini:pa-db-pool-histogram, r=Turbo87
Track used database conns with an histogram While looking at the metrics, me and `@jtgeibel` noticed that the way we tracked used database connections before (sampling the current state every time metrics are scraped) is not really accurate, as the connections could be all used up until the scrape happens. This PR adds another metric recording the used database connections with an histogram. This PR also refactors how histograms are defined in the source code, to account for different histograms needing different buckets.
2 parents 5a76a33 + 5ad915f commit 2676ea6

File tree

6 files changed

+127
-48
lines changed

6 files changed

+127
-48
lines changed

src/app.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ impl App {
128128
instance_metrics
129129
.database_time_to_obtain_connection
130130
.with_label_values(&["primary"]),
131+
instance_metrics
132+
.database_used_conns_histogram
133+
.with_label_values(&["primary"]),
131134
)
132135
.unwrap()
133136
};
@@ -155,6 +158,9 @@ impl App {
155158
instance_metrics
156159
.database_time_to_obtain_connection
157160
.with_label_values(&["follower"]),
161+
instance_metrics
162+
.database_used_conns_histogram
163+
.with_label_values(&["follower"]),
158164
)
159165
.unwrap(),
160166
)

src/db.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use crate::middleware::app::RequestApp;
1313
pub enum DieselPool {
1414
Pool {
1515
pool: r2d2::Pool<ConnectionManager<PgConnection>>,
16+
used_conns_metric: Histogram,
1617
time_to_obtain_connection_metric: Histogram,
1718
},
1819
Test(Arc<ReentrantMutex<PgConnection>>),
@@ -22,6 +23,7 @@ impl DieselPool {
2223
pub(crate) fn new(
2324
url: &str,
2425
config: r2d2::Builder<ConnectionManager<PgConnection>>,
26+
used_conns_metric: Histogram,
2527
time_to_obtain_connection_metric: Histogram,
2628
) -> Result<DieselPool, PoolError> {
2729
let manager = ConnectionManager::new(connection_url(url));
@@ -39,6 +41,7 @@ impl DieselPool {
3941
// automatically be marked as unhealthy and the rest of the application will adapt.
4042
let pool = DieselPool::Pool {
4143
pool: config.build_unchecked(manager),
44+
used_conns_metric,
4245
time_to_obtain_connection_metric,
4346
};
4447
match pool.wait_until_healthy(Duration::from_secs(5)) {
@@ -62,8 +65,13 @@ impl DieselPool {
6265
match self {
6366
DieselPool::Pool {
6467
pool,
68+
used_conns_metric,
6569
time_to_obtain_connection_metric,
6670
} => time_to_obtain_connection_metric.observe_closure_duration(|| {
71+
// Record the number of used connections before obtaining the current one.
72+
let state = pool.state();
73+
used_conns_metric.observe((state.connections - state.idle_connections) as f64);
74+
6775
if let Some(conn) = pool.try_get() {
6876
Ok(DieselPooledConn::Pool(conn))
6977
} else if !self.is_healthy() {

src/metrics/histogram.rs

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
use crate::metrics::macros::MetricFromOpts;
2+
use std::marker::PhantomData;
3+
use std::ops::{Deref, DerefMut};
4+
5+
/// Prometheus's histograms work by dividing datapoints in buckets, with each bucket containing the
6+
/// count of datapoints equal or greater to the bucket value.
7+
///
8+
/// Histogram buckets are not an exact science, so feel free to tweak the buckets or create new
9+
/// ones if you see that the histograms are not really accurate. Just avoid adding too many buckets
10+
/// for a single type as that increases the number of exported metric series.
11+
pub trait HistogramBuckets {
12+
const BUCKETS: &'static [f64];
13+
}
14+
15+
/// Buckets geared towards measuring timings, such as the response time of our requests, going from
16+
/// 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly lower
17+
/// resolution. This allows us to properly measure download requests (which take around 1ms) and
18+
/// other requests (our 95h is around 10-20ms).
19+
pub struct TimingBuckets;
20+
21+
impl HistogramBuckets for TimingBuckets {
22+
const BUCKETS: &'static [f64] = &[
23+
0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
24+
];
25+
}
26+
27+
/// Buckets geared towards measuring how many database connections are used in the database pool.
28+
pub struct DatabasePoolBuckets;
29+
30+
impl HistogramBuckets for DatabasePoolBuckets {
31+
const BUCKETS: &'static [f64] = &[0.0, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, 100.0];
32+
}
33+
34+
/// Wrapper type over [`prometheus::Histogram`] to support defining buckets.
35+
pub struct Histogram<B: HistogramBuckets> {
36+
inner: prometheus::Histogram,
37+
_phantom: PhantomData<B>,
38+
}
39+
40+
impl<B: HistogramBuckets> MetricFromOpts for Histogram<B> {
41+
fn from_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
42+
Ok(Histogram {
43+
inner: prometheus::Histogram::with_opts(prometheus::HistogramOpts {
44+
common_opts: opts,
45+
buckets: B::BUCKETS.to_vec(),
46+
})?,
47+
_phantom: PhantomData,
48+
})
49+
}
50+
}
51+
52+
impl<B: HistogramBuckets> Deref for Histogram<B> {
53+
type Target = prometheus::Histogram;
54+
55+
fn deref(&self) -> &Self::Target {
56+
&self.inner
57+
}
58+
}
59+
60+
impl<B: HistogramBuckets> DerefMut for Histogram<B> {
61+
fn deref_mut(&mut self) -> &mut Self::Target {
62+
&mut self.inner
63+
}
64+
}
65+
66+
/// Wrapper type over [`prometheus::HistogramVec`] to support defining buckets.
67+
pub struct HistogramVec<B: HistogramBuckets> {
68+
inner: prometheus::HistogramVec,
69+
_phantom: PhantomData<B>,
70+
}
71+
72+
impl<B: HistogramBuckets> MetricFromOpts for HistogramVec<B> {
73+
fn from_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
74+
Ok(HistogramVec {
75+
inner: prometheus::HistogramVec::new(
76+
prometheus::HistogramOpts {
77+
common_opts: opts.clone(),
78+
buckets: B::BUCKETS.to_vec(),
79+
},
80+
opts.variable_labels
81+
.iter()
82+
.map(|s| s.as_str())
83+
.collect::<Vec<_>>()
84+
.as_slice(),
85+
)?,
86+
_phantom: PhantomData,
87+
})
88+
}
89+
}
90+
91+
impl<B: HistogramBuckets> Deref for HistogramVec<B> {
92+
type Target = prometheus::HistogramVec;
93+
94+
fn deref(&self) -> &Self::Target {
95+
&self.inner
96+
}
97+
}
98+
99+
impl<B: HistogramBuckets> DerefMut for HistogramVec<B> {
100+
fn deref_mut(&mut self) -> &mut Self::Target {
101+
&mut self.inner
102+
}
103+
}

src/metrics/instance.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,10 @@
1717
//! As a rule of thumb, if the metric requires a database query to be updated it's probably a
1818
//! service-level metric, and you should add it to `src/metrics/service.rs` instead.
1919
20+
use crate::metrics::histogram::{DatabasePoolBuckets, Histogram, HistogramVec, TimingBuckets};
2021
use crate::util::errors::AppResult;
2122
use crate::{app::App, db::DieselPool};
22-
use prometheus::{
23-
proto::MetricFamily, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
24-
};
23+
use prometheus::{proto::MetricFamily, IntCounter, IntCounterVec, IntGauge, IntGaugeVec};
2524

2625
metrics! {
2726
pub struct InstanceMetrics {
@@ -30,15 +29,17 @@ metrics! {
3029
/// Number of used database connections in the pool
3130
database_used_conns: IntGaugeVec["pool"],
3231
/// Amount of time required to obtain a database connection
33-
pub database_time_to_obtain_connection: HistogramVec["pool"],
32+
pub database_time_to_obtain_connection: HistogramVec<TimingBuckets>["pool"],
33+
/// Number of used database connections in the pool, as histogram
34+
pub database_used_conns_histogram: HistogramVec<DatabasePoolBuckets>["pool"],
3435

3536
/// Number of requests processed by this instance
3637
pub requests_total: IntCounter,
3738
/// Number of requests currently being processed
3839
pub requests_in_flight: IntGauge,
3940

4041
/// Response times of our endpoints
41-
pub response_times: HistogramVec["endpoint"],
42+
pub response_times: HistogramVec<TimingBuckets>["endpoint"],
4243
/// Nmber of responses per status code
4344
pub responses_by_status_code_total: IntCounterVec["status"],
4445

@@ -47,7 +48,7 @@ metrics! {
4748
/// Number of download requests with a non-canonical crate name.
4849
pub downloads_non_canonical_crate_name_total: IntCounter,
4950
/// How long it takes to execute the SELECT query in the download endpoint.
50-
pub downloads_select_query_execution_time: Histogram,
51+
pub downloads_select_query_execution_time: Histogram<TimingBuckets>,
5152
/// Number of download requests that are not counted yet.
5253
downloads_not_counted_total: IntGauge,
5354
}

src/metrics/macros.rs

Lines changed: 2 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,4 @@
1-
use prometheus::{Histogram, HistogramOpts, HistogramVec, Opts};
2-
3-
/// Prometheus's histograms work by dividing datapoints in buckets, with each bucket containing
4-
/// the count of datapoints equal or greater to the bucket value.
5-
///
6-
/// The buckets used by crates.io are geared towards measuring the response time of our requests,
7-
/// going from 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly
8-
/// lower resolution. This allows us to properly measure download requests (which take around 1ms)
9-
/// and other requests (our 95h is around 10-20ms).
10-
///
11-
/// Histogram buckets are not an exact science, so feel free to tweak the buckets if you see that
12-
/// the histograms are not really accurate. Just avoid adding too many buckets as that increases
13-
/// the number of exported metric series.
14-
const HISTOGRAM_BUCKETS: &[f64] = &[
15-
0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
16-
];
1+
use prometheus::Opts;
172

183
pub(super) trait MetricFromOpts: Sized {
194
fn from_opts(opts: Opts) -> Result<Self, prometheus::Error>;
@@ -105,29 +90,4 @@ load_metric_type!(GaugeVec as vec);
10590
load_metric_type!(IntGauge as single);
10691
load_metric_type!(IntGaugeVec as vec);
10792

108-
// Use a custom implementation for histograms to customize the buckets.
109-
110-
impl MetricFromOpts for Histogram {
111-
fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
112-
Histogram::with_opts(HistogramOpts {
113-
common_opts: opts,
114-
buckets: HISTOGRAM_BUCKETS.to_vec(),
115-
})
116-
}
117-
}
118-
119-
impl MetricFromOpts for HistogramVec {
120-
fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
121-
HistogramVec::new(
122-
HistogramOpts {
123-
common_opts: opts.clone(),
124-
buckets: HISTOGRAM_BUCKETS.to_vec(),
125-
},
126-
opts.variable_labels
127-
.iter()
128-
.map(|s| s.as_str())
129-
.collect::<Vec<_>>()
130-
.as_slice(),
131-
)
132-
}
133-
}
93+
// Histograms are defined in histogram.rs

src/metrics/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ pub use self::service::ServiceMetrics;
55
#[macro_use]
66
mod macros;
77

8+
mod histogram;
89
mod instance;
910
mod log_encoder;
1011
mod service;

0 commit comments

Comments
 (0)