Auto merge of #3829 - pietroalbini:pa-db-pool-histogram, r=Turbo87

bors · bors · commit 2676ea604a98 · 2021-08-19T04:27:55.000Z
Track used database conns with an histogram

While looking at the metrics, me and `@jtgeibel` noticed that the way we tracked used database connections before (sampling the current state every time metrics are scraped) is not really accurate, as the connections could be all used up until the scrape happens. This PR adds another metric recording the used database connections with an histogram.

This PR also refactors how histograms are defined in the source code, to account for different histograms needing different buckets.
diff --git a/src/app.rs b/src/app.rs
@@ -128,6 +128,9 @@ impl App {
                 instance_metrics
                     .database_time_to_obtain_connection
                     .with_label_values(&["primary"]),
+                instance_metrics
+                    .database_used_conns_histogram
+                    .with_label_values(&["primary"]),
             )
             .unwrap()
         };
@@ -155,6 +158,9 @@ impl App {
                         instance_metrics
                             .database_time_to_obtain_connection
                             .with_label_values(&["follower"]),
+                        instance_metrics
+                            .database_used_conns_histogram
+                            .with_label_values(&["follower"]),
                     )
                     .unwrap(),
                 )
diff --git a/src/db.rs b/src/db.rs
@@ -13,6 +13,7 @@ use crate::middleware::app::RequestApp;
 pub enum DieselPool {
     Pool {
         pool: r2d2::Pool<ConnectionManager<PgConnection>>,
+        used_conns_metric: Histogram,
         time_to_obtain_connection_metric: Histogram,
     },
     Test(Arc<ReentrantMutex<PgConnection>>),
@@ -22,6 +23,7 @@ impl DieselPool {
     pub(crate) fn new(
         url: &str,
         config: r2d2::Builder<ConnectionManager<PgConnection>>,
+        used_conns_metric: Histogram,
         time_to_obtain_connection_metric: Histogram,
     ) -> Result<DieselPool, PoolError> {
         let manager = ConnectionManager::new(connection_url(url));
@@ -39,6 +41,7 @@ impl DieselPool {
         // automatically be marked as unhealthy and the rest of the application will adapt.
         let pool = DieselPool::Pool {
             pool: config.build_unchecked(manager),
+            used_conns_metric,
             time_to_obtain_connection_metric,
         };
         match pool.wait_until_healthy(Duration::from_secs(5)) {
@@ -62,8 +65,13 @@ impl DieselPool {
         match self {
             DieselPool::Pool {
                 pool,
+                used_conns_metric,
                 time_to_obtain_connection_metric,
             } => time_to_obtain_connection_metric.observe_closure_duration(|| {
+                // Record the number of used connections before obtaining the current one.
+                let state = pool.state();
+                used_conns_metric.observe((state.connections - state.idle_connections) as f64);
+
                 if let Some(conn) = pool.try_get() {
                     Ok(DieselPooledConn::Pool(conn))
                 } else if !self.is_healthy() {
diff --git a/src/metrics/histogram.rs b/src/metrics/histogram.rs
@@ -0,0 +1,103 @@
+use crate::metrics::macros::MetricFromOpts;
+use std::marker::PhantomData;
+use std::ops::{Deref, DerefMut};
+
+/// Prometheus's histograms work by dividing datapoints in buckets, with each bucket containing the
+/// count of datapoints equal or greater to the bucket value.
+///
+/// Histogram buckets are not an exact science, so feel free to tweak the buckets or create new
+/// ones if you see that the histograms are not really accurate. Just avoid adding too many buckets
+/// for a single type as that increases the number of exported metric series.
+pub trait HistogramBuckets {
+    const BUCKETS: &'static [f64];
+}
+
+/// Buckets geared towards measuring timings, such as the response time of our requests, going from
+/// 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly lower
+/// resolution. This allows us to properly measure download requests (which take around 1ms) and
+/// other requests (our 95h is around 10-20ms).
+pub struct TimingBuckets;
+
+impl HistogramBuckets for TimingBuckets {
+    const BUCKETS: &'static [f64] = &[
+        0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
+    ];
+}
+
+/// Buckets geared towards measuring how many database connections are used in the database pool.
+pub struct DatabasePoolBuckets;
+
+impl HistogramBuckets for DatabasePoolBuckets {
+    const BUCKETS: &'static [f64] = &[0.0, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, 100.0];
+}
+
+/// Wrapper type over [`prometheus::Histogram`] to support defining buckets.
+pub struct Histogram<B: HistogramBuckets> {
+    inner: prometheus::Histogram,
+    _phantom: PhantomData<B>,
+}
+
+impl<B: HistogramBuckets> MetricFromOpts for Histogram<B> {
+    fn from_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
+        Ok(Histogram {
+            inner: prometheus::Histogram::with_opts(prometheus::HistogramOpts {
+                common_opts: opts,
+                buckets: B::BUCKETS.to_vec(),
+            })?,
+            _phantom: PhantomData,
+        })
+    }
+}
+
+impl<B: HistogramBuckets> Deref for Histogram<B> {
+    type Target = prometheus::Histogram;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
+impl<B: HistogramBuckets> DerefMut for Histogram<B> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.inner
+    }
+}
+
+/// Wrapper type over [`prometheus::HistogramVec`] to support defining buckets.
+pub struct HistogramVec<B: HistogramBuckets> {
+    inner: prometheus::HistogramVec,
+    _phantom: PhantomData<B>,
+}
+
+impl<B: HistogramBuckets> MetricFromOpts for HistogramVec<B> {
+    fn from_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
+        Ok(HistogramVec {
+            inner: prometheus::HistogramVec::new(
+                prometheus::HistogramOpts {
+                    common_opts: opts.clone(),
+                    buckets: B::BUCKETS.to_vec(),
+                },
+                opts.variable_labels
+                    .iter()
+                    .map(|s| s.as_str())
+                    .collect::<Vec<_>>()
+                    .as_slice(),
+            )?,
+            _phantom: PhantomData,
+        })
+    }
+}
+
+impl<B: HistogramBuckets> Deref for HistogramVec<B> {
+    type Target = prometheus::HistogramVec;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
+impl<B: HistogramBuckets> DerefMut for HistogramVec<B> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.inner
+    }
+}
diff --git a/src/metrics/instance.rs b/src/metrics/instance.rs
@@ -17,11 +17,10 @@
 //! As a rule of thumb, if the metric requires a database query to be updated it's probably a
 //! service-level metric, and you should add it to `src/metrics/service.rs` instead.
 
+use crate::metrics::histogram::{DatabasePoolBuckets, Histogram, HistogramVec, TimingBuckets};
 use crate::util::errors::AppResult;
 use crate::{app::App, db::DieselPool};
-use prometheus::{
-    proto::MetricFamily, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
-};
+use prometheus::{proto::MetricFamily, IntCounter, IntCounterVec, IntGauge, IntGaugeVec};
 
 metrics! {
     pub struct InstanceMetrics {
@@ -30,15 +29,17 @@ metrics! {
         /// Number of used database connections in the pool
         database_used_conns: IntGaugeVec["pool"],
         /// Amount of time required to obtain a database connection
-        pub database_time_to_obtain_connection: HistogramVec["pool"],
+        pub database_time_to_obtain_connection: HistogramVec<TimingBuckets>["pool"],
+        /// Number of used database connections in the pool, as histogram
+        pub database_used_conns_histogram: HistogramVec<DatabasePoolBuckets>["pool"],
 
         /// Number of requests processed by this instance
         pub requests_total: IntCounter,
         /// Number of requests currently being processed
         pub requests_in_flight: IntGauge,
 
         /// Response times of our endpoints
-        pub response_times: HistogramVec["endpoint"],
+        pub response_times: HistogramVec<TimingBuckets>["endpoint"],
         /// Nmber of responses per status code
         pub responses_by_status_code_total: IntCounterVec["status"],
 
@@ -47,7 +48,7 @@ metrics! {
         /// Number of download requests with a non-canonical crate name.
         pub downloads_non_canonical_crate_name_total: IntCounter,
         /// How long it takes to execute the SELECT query in the download endpoint.
-        pub downloads_select_query_execution_time: Histogram,
+        pub downloads_select_query_execution_time: Histogram<TimingBuckets>,
         /// Number of download requests that are not counted yet.
         downloads_not_counted_total: IntGauge,
     }
diff --git a/src/metrics/macros.rs b/src/metrics/macros.rs
@@ -1,19 +1,4 @@
-use prometheus::{Histogram, HistogramOpts, HistogramVec, Opts};
-
-/// Prometheus's histograms work by dividing datapoints in buckets, with each bucket containing
-/// the count of datapoints equal or greater to the bucket value.
-///
-/// The buckets used by crates.io are geared towards measuring the response time of our requests,
-/// going from 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly
-/// lower resolution. This allows us to properly measure download requests (which take around 1ms)
-/// and other requests (our 95h is around 10-20ms).
-///
-/// Histogram buckets are not an exact science, so feel free to tweak the buckets if you see that
-/// the histograms are not really accurate. Just avoid adding too many buckets as that increases
-/// the number of exported metric series.
-const HISTOGRAM_BUCKETS: &[f64] = &[
-    0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
-];
+use prometheus::Opts;
 
 pub(super) trait MetricFromOpts: Sized {
     fn from_opts(opts: Opts) -> Result<Self, prometheus::Error>;
@@ -105,29 +90,4 @@ load_metric_type!(GaugeVec as vec);
 load_metric_type!(IntGauge as single);
 load_metric_type!(IntGaugeVec as vec);
 
-// Use a custom implementation for histograms to customize the buckets.
-
-impl MetricFromOpts for Histogram {
-    fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
-        Histogram::with_opts(HistogramOpts {
-            common_opts: opts,
-            buckets: HISTOGRAM_BUCKETS.to_vec(),
-        })
-    }
-}
-
-impl MetricFromOpts for HistogramVec {
-    fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
-        HistogramVec::new(
-            HistogramOpts {
-                common_opts: opts.clone(),
-                buckets: HISTOGRAM_BUCKETS.to_vec(),
-            },
-            opts.variable_labels
-                .iter()
-                .map(|s| s.as_str())
-                .collect::<Vec<_>>()
-                .as_slice(),
-        )
-    }
-}
+// Histograms are defined in histogram.rs
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
@@ -5,6 +5,7 @@ pub use self::service::ServiceMetrics;
 #[macro_use]
 mod macros;
 
+mod histogram;
 mod instance;
 mod log_encoder;
 mod service;

Original file line number	Diff line number	Diff line change
`@@ -128,6 +128,9 @@ impl App {`
`128`	`128`	`instance_metrics`
`129`	`129`	`.database_time_to_obtain_connection`
`130`	`130`	`.with_label_values(&["primary"]),`
	`131`	`+ instance_metrics`
	`132`	`+ .database_used_conns_histogram`
	`133`	`+ .with_label_values(&["primary"]),`
`131`	`134`	`)`
`132`	`135`	`.unwrap()`
`133`	`136`	`};`
`@@ -155,6 +158,9 @@ impl App {`
`155`	`158`	`instance_metrics`
`156`	`159`	`.database_time_to_obtain_connection`
`157`	`160`	`.with_label_values(&["follower"]),`
	`161`	`+ instance_metrics`
	`162`	`+ .database_used_conns_histogram`
	`163`	`+ .with_label_values(&["follower"]),`
`158`	`164`	`)`
`159`	`165`	`.unwrap(),`
`160`	`166`	`)`