Skip to content

Commit 73b4cea

Browse files
authored
feat(system): Log service back pressure as metric (#1583)
This adds central back pressure monitoring via statsd metrics to all services that use the new tokio-based service framework. Channels track the number of pending messages and emit a gauge metric with that number. The metric is `service.back_pressure` and tagged with a `service` tag to identify the specific service. The metric is debounced once per second. Between these intervals, the back pressure number is not checked, so this metric is not suitable to detect very short-lived spikes. Additionally, the current implementation requires that the service makes progress by polling `recv`. The purpose of this metric is to detect lasting increases in backlogs that indicate bottlenecks or degraded system behavior.
1 parent 155556f commit 73b4cea

File tree

10 files changed

+227
-11
lines changed

10 files changed

+227
-11
lines changed

.github/workflows/deploy.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,11 @@ jobs:
131131
uses: actions-rs/cargo@v1
132132
with:
133133
command: run
134-
args: -p document-metrics -- -o relay_metrics.json relay-server/src/statsd.rs relay-metrics/src/statsd.rs relay-kafka/src/statsd.rs
134+
args: -p document-metrics -- -o relay_metrics.json
135+
relay-kafka/src/statsd.rs
136+
relay-metrics/src/statsd.rs
137+
relay-server/src/statsd.rs
138+
relay-system/src/statsd.rs
135139

136140
- name: Deploy
137141
if: github.ref == 'refs/heads/master'

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## Unreleased
4+
5+
**Internal**:
6+
7+
- Emit a `service.back_pressure` metric that measures internal back pressure by service. ([#1583](https://github.com/getsentry/relay/pull/1583))
8+
39
## 22.11.0
410

511
**Features**:

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

relay-metrics/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ tokio = { version = "1.0", features = ["macros", "time"] }
2727
criterion = "0.3"
2828
futures01 = { version = "0.1.28", package = "futures" }
2929
insta = "1.19.0"
30+
relay-statsd = { path = "../relay-statsd", features = ["test"] }
3031
relay-test = { path = "../relay-test" }
3132

3233
[[bench]]

relay-statsd/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@ cadence = "0.26.0"
1414
parking_lot = "0.12.1"
1515
rand = "0.7.3"
1616
relay-log = { path = "../relay-log" }
17+
18+
[features]
19+
default = []
20+
test = []

relay-statsd/src/lib.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,7 @@ use std::ops::{Deref, DerefMut};
6262
use std::sync::Arc;
6363

6464
use cadence::{
65-
BufferedUdpMetricSink, Metric, MetricBuilder, QueuingMetricSink, SpyMetricSink, StatsdClient,
66-
UdpMetricSink,
65+
BufferedUdpMetricSink, Metric, MetricBuilder, QueuingMetricSink, StatsdClient, UdpMetricSink,
6766
};
6867
use parking_lot::RwLock;
6968
use rand::distributions::{Distribution, Uniform};
@@ -164,8 +163,9 @@ pub fn set_client(client: MetricsClient) {
164163
}
165164

166165
/// Set a test client for the period of the called function (only affects the current thread).
166+
#[cfg(feature = "test")]
167167
pub fn with_capturing_test_client(f: impl FnOnce()) -> Vec<String> {
168-
let (rx, sink) = SpyMetricSink::new();
168+
let (rx, sink) = cadence::SpyMetricSink::new();
169169
let test_client = MetricsClient {
170170
statsd_client: StatsdClient::from_sink("", sink),
171171
default_tags: Default::default(),
@@ -573,7 +573,6 @@ macro_rules! metric {
573573

574574
#[cfg(test)]
575575
mod tests {
576-
577576
use cadence::{NopMetricSink, StatsdClient};
578577

579578
use crate::{set_client, with_capturing_test_client, with_client, GaugeMetric, MetricsClient};
@@ -608,7 +607,7 @@ mod tests {
608607
});
609608

610609
assert_eq!(
611-
captures.as_slice(),
610+
captures,
612611
[
613612
"foo:123|g|#server:server1,host:host1",
614613
"bar:456|g|#server:server2,host:host2"

relay-system/Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,10 @@ futures01 = { version = "0.1.28", package = "futures" }
1616
futures = { version = "0.3", package = "futures", features = ["compat"] }
1717
once_cell = "1.13.1"
1818
relay-log = { path = "../relay-log" }
19-
tokio = { version = "1.0", features = ["rt-multi-thread", "sync"] }
19+
relay-statsd = { path = "../relay-statsd" }
20+
tokio = { version = "1.0", features = ["rt-multi-thread", "sync", "macros"] }
2021
tokio01 = { version = "0.1", package = "tokio" }
22+
23+
[dev-dependencies]
24+
relay-statsd = { path = "../relay-statsd", features = ["test"] }
25+
tokio = { version = "1.0", features = ["test-util"] }

relay-system/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
pub mod compat;
1818
mod controller;
1919
mod service;
20+
mod statsd;
2021

2122
pub use self::controller::*;
2223
pub use self::service::*;

relay-system/src/service.rs

Lines changed: 177 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,18 @@ use std::fmt;
22
use std::future::Future;
33
use std::marker::PhantomData;
44
use std::pin::Pin;
5+
use std::sync::atomic::{AtomicU64, Ordering};
6+
use std::sync::Arc;
57
use std::task::Context;
8+
use std::time::Duration;
69

710
use tokio::sync::{mpsc, oneshot};
11+
use tokio::time::MissedTickBehavior;
12+
13+
use crate::statsd::SystemGauges;
14+
15+
/// Interval for recording backlog metrics on service channels.
16+
const BACKLOG_INTERVAL: Duration = Duration::from_secs(1);
817

918
/// A message interface for [services](Service).
1019
///
@@ -305,12 +314,14 @@ pub trait FromMessage<M>: Interface {
305314
/// long as the service is running. It can be freely cloned.
306315
pub struct Addr<I: Interface> {
307316
tx: mpsc::UnboundedSender<I>,
317+
queue_size: Arc<AtomicU64>,
308318
}
309319

310320
impl<I: Interface> fmt::Debug for Addr<I> {
311321
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
312322
f.debug_struct("Addr")
313323
.field("open", &!self.tx.is_closed())
324+
.field("queue_size", &self.queue_size.load(Ordering::Relaxed))
314325
.finish()
315326
}
316327
}
@@ -321,6 +332,7 @@ impl<I: Interface> Clone for Addr<I> {
321332
fn clone(&self) -> Self {
322333
Self {
323334
tx: self.tx.clone(),
335+
queue_size: self.queue_size.clone(),
324336
}
325337
}
326338
}
@@ -341,6 +353,7 @@ impl<I: Interface> Addr<I> {
341353
I: FromMessage<M>,
342354
{
343355
let (tx, rx) = I::Response::channel();
356+
self.queue_size.fetch_add(1, Ordering::SeqCst);
344357
self.tx.send(I::from_message(message, tx)).ok(); // it's ok to drop, the response will fail
345358
rx
346359
}
@@ -352,15 +365,78 @@ impl<I: Interface> Addr<I> {
352365
///
353366
/// Instances are created automatically when [spawning](Service::spawn_handler) a service, or can be
354367
/// created through [`channel`]. The channel closes when all associated [`Addr`]s are dropped.
355-
pub type Receiver<I> = mpsc::UnboundedReceiver<I>;
368+
pub struct Receiver<I: Interface> {
369+
rx: mpsc::UnboundedReceiver<I>,
370+
name: &'static str,
371+
interval: tokio::time::Interval,
372+
queue_size: Arc<AtomicU64>,
373+
}
374+
375+
impl<I: Interface> Receiver<I> {
376+
/// Receives the next value for this receiver.
377+
///
378+
/// This method returns `None` if the channel has been closed and there are
379+
/// no remaining messages in the channel's buffer. This indicates that no
380+
/// further values can ever be received from this `Receiver`. The channel is
381+
/// closed when all senders have been dropped.
382+
///
383+
/// If there are no messages in the channel's buffer, but the channel has
384+
/// not yet been closed, this method will sleep until a message is sent or
385+
/// the channel is closed.
386+
pub async fn recv(&mut self) -> Option<I> {
387+
loop {
388+
tokio::select! {
389+
biased;
390+
391+
_ = self.interval.tick() => {
392+
let backlog = self.queue_size.load(Ordering::Relaxed);
393+
relay_statsd::metric!(
394+
gauge(SystemGauges::ServiceBackPressure) = backlog,
395+
service = self.name
396+
);
397+
},
398+
message = self.rx.recv() => {
399+
self.queue_size.fetch_sub(1, Ordering::SeqCst);
400+
return message;
401+
},
402+
}
403+
}
404+
}
405+
}
406+
407+
impl<I: Interface> fmt::Debug for Receiver<I> {
408+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
409+
f.debug_struct("Receiver")
410+
.field("name", &self.name)
411+
.field("queue_size", &self.queue_size.load(Ordering::Relaxed))
412+
.finish()
413+
}
414+
}
356415

357416
/// Creates an unbounded channel for communicating with a [`Service`].
358417
///
359418
/// The `Addr` as the sending part provides public access to the service, while the `Receiver`
360419
/// should remain internal to the service.
361-
pub fn channel<I: Interface>() -> (Addr<I>, Receiver<I>) {
420+
pub fn channel<I: Interface>(name: &'static str) -> (Addr<I>, Receiver<I>) {
421+
let queue_size = Arc::new(AtomicU64::new(0));
362422
let (tx, rx) = mpsc::unbounded_channel();
363-
(Addr { tx }, rx)
423+
424+
let addr = Addr {
425+
tx,
426+
queue_size: queue_size.clone(),
427+
};
428+
429+
let mut interval = tokio::time::interval(BACKLOG_INTERVAL);
430+
interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
431+
432+
let receiver = Receiver {
433+
rx,
434+
name,
435+
interval,
436+
queue_size,
437+
};
438+
439+
(addr, receiver)
364440
}
365441

366442
/// An asynchronous unit responding to messages.
@@ -426,8 +502,105 @@ pub trait Service: Sized {
426502

427503
/// Starts the service in the current runtime and returns an address for it.
428504
fn start(self) -> Addr<Self::Interface> {
429-
let (addr, rx) = channel();
505+
let (addr, rx) = channel(Self::name());
430506
self.spawn_handler(rx);
431507
addr
432508
}
509+
510+
/// Returns a unique name for this service implementation.
511+
///
512+
/// This is used for internal diagnostics and uses the fully qualified type name of the service
513+
/// implementor by default.
514+
fn name() -> &'static str {
515+
std::any::type_name::<Self>()
516+
}
517+
}
518+
519+
#[cfg(test)]
520+
mod tests {
521+
use super::*;
522+
523+
struct MockMessage;
524+
525+
impl Interface for MockMessage {}
526+
527+
impl FromMessage<Self> for MockMessage {
528+
type Response = NoResponse;
529+
530+
fn from_message(message: Self, _: ()) -> Self {
531+
message
532+
}
533+
}
534+
535+
struct MockService;
536+
537+
impl Service for MockService {
538+
type Interface = MockMessage;
539+
540+
fn spawn_handler(self, mut rx: Receiver<Self::Interface>) {
541+
tokio::spawn(async move {
542+
while rx.recv().await.is_some() {
543+
tokio::time::sleep(BACKLOG_INTERVAL * 2).await;
544+
}
545+
});
546+
}
547+
548+
fn name() -> &'static str {
549+
"mock"
550+
}
551+
}
552+
553+
#[test]
554+
fn test_backpressure_metrics() {
555+
let rt = tokio::runtime::Builder::new_current_thread()
556+
.enable_time()
557+
.build()
558+
.unwrap();
559+
560+
let _guard = rt.enter();
561+
tokio::time::pause();
562+
563+
// Mock service takes 2 * BACKLOG_INTERVAL for every message
564+
let addr = MockService.start();
565+
566+
// Advance the timer by a tiny offset to trigger the first metric emission.
567+
let captures = relay_statsd::with_capturing_test_client(|| {
568+
rt.block_on(async {
569+
tokio::time::sleep(Duration::from_millis(10)).await;
570+
})
571+
});
572+
573+
assert_eq!(captures, ["service.back_pressure:0|g|#service:mock"]);
574+
575+
// Send messages and advance to 0.5 * INTERVAL. No metrics expected at this point.
576+
let captures = relay_statsd::with_capturing_test_client(|| {
577+
rt.block_on(async {
578+
addr.send(MockMessage); // will be pulled immediately
579+
addr.send(MockMessage);
580+
addr.send(MockMessage);
581+
582+
tokio::time::sleep(BACKLOG_INTERVAL / 2).await;
583+
})
584+
});
585+
586+
assert!(captures.is_empty());
587+
588+
// Advance to 6.5 * INTERVAL. The service should pull the first message immediately, another
589+
// message every 2 INTERVALS. The messages are fully handled after 6 INTERVALS, but we
590+
// cannot observe that since the last message exits the queue at 4.
591+
let captures = relay_statsd::with_capturing_test_client(|| {
592+
rt.block_on(async {
593+
tokio::time::sleep(BACKLOG_INTERVAL * 6).await;
594+
})
595+
});
596+
597+
assert_eq!(
598+
captures,
599+
[
600+
"service.back_pressure:2|g|#service:mock", // 2 * INTERVAL
601+
"service.back_pressure:1|g|#service:mock", // 4 * INTERVAL
602+
"service.back_pressure:0|g|#service:mock", // 6 * INTERVAL
603+
]
604+
);
605+
}
433606
}

relay-system/src/statsd.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
use relay_statsd::GaugeMetric;
2+
3+
/// Gauge metrics for Relay system components.
4+
pub enum SystemGauges {
5+
/// A number of messages queued in a services inbound message channel.
6+
///
7+
/// This metric is emitted once per second for every running service. Without backlogs, this
8+
/// number should be close to `0`. If this number is monotonically increasing, the service is
9+
/// not able to process the inbound message volume.
10+
///
11+
/// This metric is tagged with:
12+
/// - `service`: The fully qualified type name of the service implementation.
13+
ServiceBackPressure,
14+
}
15+
16+
impl GaugeMetric for SystemGauges {
17+
fn name(&self) -> &'static str {
18+
match *self {
19+
SystemGauges::ServiceBackPressure => "service.back_pressure",
20+
}
21+
}
22+
}

0 commit comments

Comments
 (0)