Skip to content

Add a report-only mode to the BalanceCapacity middleware #2495

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 11, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 43 additions & 29 deletions src/middleware/balance_capacity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
//! we should avoid dropping download requests even if that means rejecting some legitimate
//! requests to other endpoints.

use std::env;
use std::sync::atomic::{AtomicUsize, Ordering};

use super::prelude::*;
Expand All @@ -17,7 +18,8 @@ use conduit::{RequestExt, StatusCode};
pub(super) struct BalanceCapacity {
handler: Option<Box<dyn Handler>>,
capacity: usize,
in_flight_requests: AtomicUsize,
in_flight_non_dl_requests: AtomicUsize,
report_only: bool,
log_at_percentage: usize,
throttle_at_percentage: usize,
dl_only_at_percentage: usize,
Expand All @@ -28,12 +30,39 @@ impl BalanceCapacity {
Self {
handler: None,
capacity,
in_flight_requests: AtomicUsize::new(0),
log_at_percentage: read_env_percentage("WEB_CAPACITY_LOG_PCT", 20),
in_flight_non_dl_requests: AtomicUsize::new(0),
report_only: env::var("WEB_CAPACITY_REPORT_ONLY").ok().is_some(),
log_at_percentage: read_env_percentage("WEB_CAPACITY_LOG_PCT", 50),
throttle_at_percentage: read_env_percentage("WEB_CAPACITY_THROTTLE_PCT", 70),
dl_only_at_percentage: read_env_percentage("WEB_CAPACITY_DL_ONLY_PCT", 80),
}
}

/// Handle a request normally
fn handle(&self, request: &mut dyn RequestExt) -> AfterResult {
self.handler.as_ref().unwrap().call(request)
}

/// Handle a request when load exceeds a threshold
///
/// In report-only mode, log metadata is added but the request is still served. Otherwise,
/// the request is rejected with a service unavailable response.
fn handle_high_load(&self, request: &mut dyn RequestExt, note: &str) -> AfterResult {
if self.report_only {
// In report-only mode we serve all requests but add log metadata
super::log_request::add_custom_metadata(request, "would_reject", note);
self.handle(request)
} else {
// Reject the request
super::log_request::add_custom_metadata(request, "cause", note);
let body = "Service temporarily unavailable";
Response::builder()
.status(StatusCode::SERVICE_UNAVAILABLE)
.header(header::CONTENT_LENGTH, body.len())
.body(Body::from_static(body.as_bytes()))
.map_err(box_error)
}
}
}

impl AroundMiddleware for BalanceCapacity {
Expand All @@ -44,58 +73,43 @@ impl AroundMiddleware for BalanceCapacity {

impl Handler for BalanceCapacity {
fn call(&self, request: &mut dyn RequestExt) -> AfterResult {
// Download requests are always accepted and do not affect the request count
if request.path().starts_with("/api/v1/crates/") && request.path().ends_with("/download") {
return self.handle(request);
}

// The _drop_on_exit ensures the counter is decremented for all exit paths (including panics)
let (_drop_on_exit, count) = RequestCounter::add_one(&self.in_flight_requests);
let handler = self.handler.as_ref().unwrap();
let (_drop_on_exit, count) = RequestCounter::add_one(&self.in_flight_non_dl_requests);
let load = 100 * count / self.capacity;

// Begin logging request count so early stages of load increase can be located
if load >= self.log_at_percentage {
super::log_request::add_custom_metadata(request, "in_flight_requests", count);
}

// Download requests are always accepted
if request.path().starts_with("/api/v1/crates/") && request.path().ends_with("/download") {
return handler.call(request);
super::log_request::add_custom_metadata(request, "in_flight_non_dl_requests", count);
}

// Reject read-only requests as load nears capacity. Bots are likely to send only safe
// requests and this helps prioritize requests that users may be reluctant to retry.
if load >= self.throttle_at_percentage && request.method().is_safe() {
return over_capacity_response(request);
return self.handle_high_load(request, "over capacity (throttle)");
}

// As load reaches capacity, all non-download requests are rejected
if load >= self.dl_only_at_percentage {
return over_capacity_response(request);
return self.handle_high_load(request, "over capacity (download only)");
}

handler.call(request)
self.handle(request)
}
}

fn over_capacity_response(request: &mut dyn RequestExt) -> AfterResult {
// TODO: Generate an alert so we can investigate
super::log_request::add_custom_metadata(request, "cause", "over capacity");
let body = "Service temporarily unavailable";
Response::builder()
.status(StatusCode::SERVICE_UNAVAILABLE)
.header(header::CONTENT_LENGTH, body.len())
.body(Body::from_static(body.as_bytes()))
.map_err(box_error)
}

fn read_env_percentage(name: &str, default: usize) -> usize {
if let Ok(value) = std::env::var(name) {
if let Ok(value) = env::var(name) {
value.parse().unwrap_or(default)
} else {
default
}
}

// FIXME(JTG): I've copied the following from my `conduit-hyper` crate. Once we transition from
// `civet`, we could pass the in_flight_request count from `condut-hyper` via a request extension.

/// A struct that stores a reference to an atomic counter so it can be decremented when dropped
struct RequestCounter<'a> {
counter: &'a AtomicUsize,
Expand Down