feat: new http retry logic (#53)

amunra · web-flow · commit 3c326fd954be · 2024-02-09T20:27:10.000Z
diff --git a/ci/run_all_tests.py b/ci/run_all_tests.py
@@ -38,7 +38,7 @@ def main():
     test_line_sender_path = next(iter(
         build_dir.glob(f'**/test_line_sender{exe_suffix}')))
     system_test_path = pathlib.Path('system_test') / 'test.py'
-    qdb_v = '7.3.7'  # The version of QuestDB we'll test against.
+    qdb_v = '7.3.9'  # The version of QuestDB we'll test against.
 
     run_cmd('cargo', 'test', '--', '--nocapture', cwd='questdb-rs')
     run_cmd('cargo', 'test', '--all-features', '--', '--nocapture', cwd='questdb-rs')
diff --git a/cpp_test/test_line_sender.cpp b/cpp_test/test_line_sender.cpp
@@ -720,8 +720,17 @@ TEST_CASE("Empty Buffer") {
 TEST_CASE("HTTP basics") {
     questdb::ingress::opts opts1{"localhost", 1};
     questdb::ingress::opts opts2{"localhost", 1};
-    opts1.http().transactional().max_retries(5).retry_interval(10).basic_auth("user", "pass");
-    opts2.http().token_auth("token").min_throughput(1000);
+    opts1
+        .http()
+        .transactional()
+        .grace_timeout(5000)
+        .retry_timeout(5)
+        .basic_auth("user", "pass");
+    opts2
+        .http()
+        .token_auth("token")
+        .min_throughput(1000)
+        .retry_timeout(0);
     questdb::ingress::line_sender sender1{opts1};
     questdb::ingress::line_sender sender2{opts2};
 
diff --git a/include/questdb/ingress/line_sender.h b/include/questdb/ingress/line_sender.h
@@ -600,25 +600,13 @@ LINESENDER_API
 void line_sender_opts_http(line_sender_opts* opts);
 
 /**
- * Maxmimum number of HTTP request retries.
- * Defaults to 3.
+ * Cumulative duration spent in retries (specified in milliseconds).
+ * The default is 10 seconds.
  */
 LINESENDER_API
-void line_sender_opts_max_retries(
+void line_sender_opts_retry_timeout(
     line_sender_opts* opts,
-    uint32_t max_retries);
-
-/**
- * The initial retry interval (specified in milliseconds).
- * This the default is 100 milliseconds.
- * The retry interval is doubled after each failed attempt,
- * up to the maximum number of retries.
- * Also see `max_retries`.
- */
-LINESENDER_API
-void line_sender_opts_retry_interval(
-    line_sender_opts* opts,
-    uint64_t retry_interval_millis);
+    uint64_t millis);
 
 /**
  * Minimum expected throughput in bytes per second for HTTP requests.
@@ -631,6 +619,15 @@ void line_sender_opts_min_throughput(
     line_sender_opts* opts,
     uint64_t bytes_per_sec);
 
+/**
+ * Grace request timeout (specified in milliseconds) before relying on the
+ * minimum throughput logic. The default is 5 seconds.
+ */
+LINESENDER_API
+void line_sender_opts_grace_timeout(
+    line_sender_opts* opts,
+    uint64_t millis);
+
 /**
  * Enable transactional flushes.
  * This is only relevant for HTTP.
diff --git a/include/questdb/ingress/line_sender.hpp b/include/questdb/ingress/line_sender.hpp
@@ -824,27 +824,12 @@ namespace questdb::ingress
             }
 
             /**
-             * Maxmimum number of HTTP request retries.
-             * Defaults to 3.
+             * Cumulative duration spent in retries (specified in milliseconds).
+             * The default is 10 seconds.
              */
-            opts& max_retries(uint32_t max_retries) noexcept
+            opts& retry_timeout(uint64_t millis) noexcept
             {
-                ::line_sender_opts_max_retries(_impl, max_retries);
-                return *this;
-            }
-
-            /**
-             * The initial retry interval (specified in milliseconds).
-             * This the default is 100 milliseconds.
-             * The retry interval is doubled after each failed attempt,
-             * up to the maximum number of retries.
-             * Also see `max_retries`.
-             */
-            opts& retry_interval(uint64_t retry_interval_millis) noexcept
-            {
-                ::line_sender_opts_retry_interval(
-                    _impl,
-                    retry_interval_millis);
+                ::line_sender_opts_retry_timeout(_impl, millis);
                 return *this;
             }
 
@@ -860,6 +845,16 @@ namespace questdb::ingress
                 return *this;
             }
 
+            /**
+             * Grace request timeout (specified in milliseconds) before relying
+             * on the minimum throughput logic. The default is 5 seconds.
+             */
+            opts& grace_timeout(uint64_t millis) noexcept
+            {
+                ::line_sender_opts_grace_timeout(_impl, millis);
+                return *this;
+            }
+
             /**
              * Enable transactional flushes.
              * This is only relevant for HTTP.
diff --git a/questdb-rs-ffi/Cargo.lock b/questdb-rs-ffi/Cargo.lock
diff --git a/questdb-rs-ffi/src/lib.rs b/questdb-rs-ffi/src/lib.rs
@@ -514,28 +514,12 @@ pub unsafe extern "C" fn line_sender_opts_http(opts: *mut line_sender_opts) {
     upd_opts!(opts, http);
 }
 
-/// Maxmimum number of HTTP request retries.
-/// Defaults to 3.
+/// Cumulative duration spent in retries (specified in milliseconds).
+/// The default is 10 seconds.
 #[no_mangle]
-pub unsafe extern "C" fn line_sender_opts_max_retries(
-    opts: *mut line_sender_opts,
-    max_retries: u32,
-) {
-    upd_opts!(opts, max_retries, max_retries);
-}
-
-/// The initial retry interval (specified in milliseconds).
-/// This the default is 100 milliseconds.
-/// The retry interval is doubled after each failed attempt,
-/// up to the maximum number of retries.
-/// Also see `max_retries`.
-#[no_mangle]
-pub unsafe extern "C" fn line_sender_opts_retry_interval(
-    opts: *mut line_sender_opts,
-    retry_interval_millis: u64,
-) {
-    let retry_interval = std::time::Duration::from_millis(retry_interval_millis);
-    upd_opts!(opts, retry_interval, retry_interval);
+pub unsafe extern "C" fn line_sender_opts_retry_timeout(opts: *mut line_sender_opts, millis: u64) {
+    let retry_timeout = std::time::Duration::from_millis(millis);
+    upd_opts!(opts, retry_timeout, retry_timeout);
 }
 
 /// Minimum expected throughput in bytes per second for HTTP requests.
@@ -550,6 +534,14 @@ pub unsafe extern "C" fn line_sender_opts_min_throughput(
     upd_opts!(opts, min_throughput, bytes_per_sec);
 }
 
+/// Grace request timeout (specified in milliseconds) before relying on the
+/// minimum throughput logic. The default is 5 seconds.
+#[no_mangle]
+pub unsafe extern "C" fn line_sender_opts_grace_timeout(opts: *mut line_sender_opts, millis: u64) {
+    let grace_timeout = std::time::Duration::from_millis(millis);
+    upd_opts!(opts, grace_timeout, grace_timeout);
+}
+
 /// Enable transactional flushes.
 /// This is only relevant for HTTP.
 /// This works by ensuring that the buffer contains lines for a single table.
diff --git a/questdb-rs/Cargo.toml b/questdb-rs/Cargo.toml
@@ -30,6 +30,7 @@ webpki-roots = { version = "0.26.0", optional = true }
 chrono = { version = "0.4.30", optional = true }
 ureq = { version = "2.9.4", optional = true }
 serde_json = { version = "1.0.108", optional = true }
+rand = { version = "0.8.5", optional = true }
 
 [target.'cfg(windows)'.dependencies]
 winapi = { version = "0.3.9", features = ["ws2def"] }
@@ -48,7 +49,7 @@ chrono = "0.4.31"
 default = ["tls-webpki-certs"]
 
 # Include support for ILP over HTTP.
-ilp-over-http = ["dep:ureq", "dep:serde_json"]
+ilp-over-http = ["dep:ureq", "dep:serde_json", "dep:rand"]
 
 # Allow use OS-provided root TLS certificates
 tls-native-certs = ["dep:rustls-native-certs"]
diff --git a/questdb-rs/src/ingress/http.rs b/questdb-rs/src/ingress/http.rs
@@ -1,7 +1,9 @@
 use crate::{error, Error};
 use base64ct::Base64;
 use base64ct::Encoding;
+use rand::Rng;
 use std::fmt::Write;
+use std::thread::sleep;
 use std::time::Duration;
 
 #[derive(Debug, Clone)]
@@ -39,8 +41,8 @@ impl TokenAuthParams {
 pub(super) struct HttpConfig {
     pub(super) min_throughput: u64,
     pub(super) user_agent: Option<String>,
-    pub(super) max_retries: u32,
-    pub(super) retry_interval: Duration,
+    pub(super) retry_timeout: Duration,
+    pub(super) grace_timeout: Duration,
     pub(super) transactional: bool,
 }
 
@@ -55,7 +57,7 @@ pub(super) struct HttpHandlerState {
     pub(super) auth: Option<String>,
 
     /// Additional grace period added to the timeout as calculated via `min_throughput`.
-    pub(super) timeout_grace_period: Duration,
+    pub(super) grace_timeout: Duration,
 
     /// HTTP params configured via the `SenderBuilder`.
     pub(super) config: HttpConfig,
@@ -175,33 +177,50 @@ pub(super) fn is_retriable_error(err: &ureq::Error) -> bool {
 }
 
 #[allow(clippy::result_large_err)] // `ureq::Error` is large enough to cause this warning.
-pub(super) fn retry_http_send(
+fn retry_http_send(
     request: ureq::Request,
     buf: &[u8],
-    max_retries: u32,
-    mut retry_interval: Duration,
+    retry_timeout: Duration,
+    mut last_err: ureq::Error,
 ) -> Result<ureq::Response, ureq::Error> {
-    let mut counter = 0;
-
+    let mut rng = rand::thread_rng();
+    let retry_end = std::time::Instant::now() + retry_timeout;
+    let mut retry_interval_ms = 10;
     loop {
-        let response_or_err = request.clone().send_bytes(buf);
-        let last_err = match response_or_err {
+        let jitter_ms = rng.gen_range(-5i32..5);
+        let to_sleep_ms = retry_interval_ms + jitter_ms;
+        let to_sleep = Duration::from_millis(to_sleep_ms as u64);
+        if (std::time::Instant::now() + to_sleep) > retry_end {
+            return Err(last_err);
+        }
+        sleep(to_sleep);
+        last_err = match request.clone().send_bytes(buf) {
             Ok(res) => return Ok(res),
             Err(err) => {
-                if is_retriable_error(&err) {
-                    err
-                } else {
+                if !is_retriable_error(&err) {
                     return Err(err);
                 }
+                err
             }
         };
+        retry_interval_ms = (retry_interval_ms * 2).min(1000);
+    }
+}
 
-        counter += 1;
-        if counter > max_retries {
-            return Err(last_err);
-        }
+#[allow(clippy::result_large_err)] // `ureq::Error` is large enough to cause this warning.
+pub(super) fn http_send_with_retries(
+    request: ureq::Request,
+    buf: &[u8],
+    retry_timeout: Duration,
+) -> Result<ureq::Response, ureq::Error> {
+    let last_err = match request.clone().send_bytes(buf) {
+        Ok(res) => return Ok(res),
+        Err(err) => err,
+    };
 
-        std::thread::sleep(retry_interval);
-        retry_interval *= 2;
+    if retry_timeout.is_zero() || !is_retriable_error(&last_err) {
+        return Err(last_err);
     }
+
+    retry_http_send(request, buf, retry_timeout, last_err)
 }
diff --git a/questdb-rs/src/ingress/mod.rs b/questdb-rs/src/ingress/mod.rs
diff --git a/questdb-rs/src/tests/http.rs b/questdb-rs/src/tests/http.rs