Skip to content

Commit f672258

Browse files
yuchungcheng authored and davem330 committed
tcp: track min RTT using windowed min-filter
Implement Kathleen Nichols' algorithm for tracking the minimum RTT of a data stream over some measurement window. It uses constant space and constant time per update, yet it almost always delivers the same minimum as an implementation that has to keep all the data in the window. The measurement window is tunable via the sysctl net.ipv4.tcp_min_rtt_wlen, with a default value of 5 minutes. The algorithm keeps track of the best, 2nd best & 3rd best min values, maintaining an invariant that the measurement time of the n'th best >= n-1'th best. It also makes sure that the three values are widely separated in the time window, since that bounds the worst-case error when the data is monotonically increasing over the window. Upon getting a new min, we can forget everything earlier because it has no value - the new min is less than everything else in the window by definition and it's the most recent. So we restart fresh on every new min and overwrite the 2nd & 3rd choices. The same property holds for the 2nd & 3rd best. Therefore we have to maintain two invariants to maximize the information in the samples, one on values (1st.v <= 2nd.v <= 3rd.v) and the other on times (now-win <= 1st.t <= 2nd.t <= 3rd.t <= now). These invariants determine the structure of the code. The RTT input to the windowed filter is the minimum RTT measured from ACK or SACK, or as the last resort from TCP timestamps. The accessor tcp_min_rtt() returns the minimum RTT seen in the window. ~0U indicates it is not available. The minimum is 1usec even if the true RTT is below that. Signed-off-by: Yuchung Cheng <[email protected]> Signed-off-by: Neal Cardwell <[email protected]> Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 9e45a3e commit f672258

File tree

7 files changed

+100
-5
lines changed

7 files changed

+100
-5
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
384384
Defaults are calculated at boot time from amount of available
385385
memory.
386386

387+
tcp_min_rtt_wlen - INTEGER
388+
The window length of the windowed min filter to track the minimum RTT.
389+
A shorter window lets a flow more quickly pick up a new (higher)
390+
minimum RTT when it is moved to a longer path (e.g., due to traffic
391+
engineering). A longer window makes the filter more resistant to RTT
392+
inflations such as transient congestion. The unit is seconds.
393+
Default: 300
394+
387395
tcp_moderate_rcvbuf - BOOLEAN
388396
If set, TCP performs receive buffer auto-tuning, attempting to
389397
automatically size the buffer (no greater than tcp_rmem[2]) to

include/linux/tcp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,9 @@ struct tcp_sock {
217217
u32 mdev_max_us; /* maximal mdev for the last rtt period */
218218
u32 rttvar_us; /* smoothed mdev_max */
219219
u32 rtt_seq; /* sequence number to update rttvar */
220+
struct rtt_meas {
221+
u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
222+
} rtt_min[3];
220223

221224
u32 packets_out; /* Packets which are "in flight" */
222225
u32 retrans_out; /* Retransmitted packets out */

include/net/tcp.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
279279
extern int sysctl_tcp_challenge_ack_limit;
280280
extern unsigned int sysctl_tcp_notsent_lowat;
281281
extern int sysctl_tcp_min_tso_segs;
282+
extern int sysctl_tcp_min_rtt_wlen;
282283
extern int sysctl_tcp_autocorking;
283284
extern int sysctl_tcp_invalid_ratelimit;
284285
extern int sysctl_tcp_pacing_ss_ratio;
@@ -671,6 +672,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
671672
return dst_metric_locked(dst, RTAX_CC_ALGO);
672673
}
673674

675+
/* Minimum RTT in usec: returns the 1st (best) entry of tp->rtt_min[].
 * ~0 means not available.
 */
676+
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
677+
{
678+
return tp->rtt_min[0].rtt;
679+
}
680+
674681
/* Compute the actual receive window we are currently advertising.
675682
* Rcv_nxt can be after the window if our peer push more data
676683
* than the offered window.

net/ipv4/sysctl_net_ipv4.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -576,6 +576,13 @@ static struct ctl_table ipv4_table[] = {
576576
.mode = 0644,
577577
.proc_handler = proc_dointvec
578578
},
579+
{
580+
.procname = "tcp_min_rtt_wlen",
581+
.data = &sysctl_tcp_min_rtt_wlen,
582+
.maxlen = sizeof(int),
583+
.mode = 0644,
584+
.proc_handler = proc_dointvec
585+
},
579586
{
580587
.procname = "tcp_low_latency",
581588
.data = &sysctl_tcp_low_latency,

net/ipv4/tcp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
388388

389389
icsk->icsk_rto = TCP_TIMEOUT_INIT;
390390
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
391+
tp->rtt_min[0].rtt = ~0U;
391392

392393
/* So many TCP implementations out there (incorrectly) count the
393394
* initial SYN frame in their delayed-ACK and congestion control

net/ipv4/tcp_input.c

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
9595
int sysctl_tcp_rfc1337 __read_mostly;
9696
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
9797
int sysctl_tcp_frto __read_mostly = 2;
98+
int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
9899

99100
int sysctl_tcp_thin_dupack __read_mostly;
100101

@@ -2915,8 +2916,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
29152916
tcp_xmit_retransmit_queue(sk);
29162917
}
29172918

2919+
/* Kathleen Nichols' algorithm for tracking the minimum value of
2920+
* a data stream over some fixed time interval. (E.g., the minimum
2921+
* RTT over the past five minutes.) It uses constant space and constant
2922+
* time per update yet almost always delivers the same minimum as an
2923+
* implementation that has to keep all the data in the window.
2924+
*
2925+
* The algorithm keeps track of the best, 2nd best & 3rd best min
2926+
* values, maintaining an invariant that the measurement time of the
2927+
* n'th best >= n-1'th best. It also makes sure that the three values
2928+
* are widely separated in the time window since that bounds the worst
2929+
* case error when that data is monotonically increasing over the window.
2930+
*
2931+
* Upon getting a new min, we can forget everything earlier because it
2932+
* has no value - the new min is <= everything else in the window by
2933+
* definition and it's the most recent. So we restart fresh on every new min
2934+
* and overwrite the 2nd & 3rd choices. The same property holds for 2nd & 3rd
2935+
* best.
2936+
*/
2937+
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
2938+
{
2939+
/* now is the sample time; wlen is the sysctl window length scaled by HZ. */
/* NOTE(review): sysctl_tcp_min_rtt_wlen is a signed int and no range check is visible in this function; a negative or very large setting would make the product wrap - confirm it is bounded elsewhere. */
const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
2940+
/* m[0], m[1], m[2] are the 1st, 2nd and 3rd choices kept in the socket. */
struct rtt_meas *m = tcp_sk(sk)->rtt_min;
2941+
/* A sample of 0 is stored as 1, so a recorded RTT is never 0. */
struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
2942+
u32 elapsed;
2943+
2944+
/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
/* A sample that takes over one slot also overwrites every slot after it. */
2945+
if (unlikely(rttm.rtt <= m[0].rtt))
2946+
m[0] = m[1] = m[2] = rttm;
2947+
else if (rttm.rtt <= m[1].rtt)
2948+
m[1] = m[2] = rttm;
2949+
else if (rttm.rtt <= m[2].rtt)
2950+
m[2] = rttm;
2951+
2952+
/* Age of the current 1st choice; all operands are u32, so the subtraction is done modulo 2^32. */
elapsed = now - m[0].ts;
2953+
if (unlikely(elapsed > wlen)) {
2954+
/* Passed entire window without a new min so make 2nd choice
2955+
* the new min & 3rd choice the new 2nd. So forth and so on.
2956+
*/
2957+
m[0] = m[1];
2958+
m[1] = m[2];
2959+
m[2] = rttm;
/* Keep promoting while the new 1st choice is itself older than the window; if the innermost test fires, all three slots hold the current sample. */
2960+
if (now - m[0].ts > wlen) {
2961+
m[0] = m[1];
2962+
m[1] = rttm;
2963+
if (now - m[0].ts > wlen)
2964+
m[0] = rttm;
2965+
}
2966+
} else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
2967+
/* Passed a quarter of the window without a new min so
2968+
* take 2nd choice from the 2nd quarter of the window.
2969+
*/
2970+
m[2] = m[1] = rttm;
2971+
} else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
2972+
/* Passed half the window without a new min so take the 3rd
2973+
* choice from the last half of the window.
2974+
*/
2975+
m[2] = rttm;
2976+
}
2977+
}
2978+
29182979
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2919-
long seq_rtt_us, long sack_rtt_us)
2980+
long seq_rtt_us, long sack_rtt_us,
2981+
long ca_rtt_us)
29202982
{
29212983
const struct tcp_sock *tp = tcp_sk(sk);
29222984

@@ -2936,11 +2998,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
29362998
*/
29372999
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
29383000
flag & FLAG_ACKED)
2939-
seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2940-
3001+
seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
3002+
tp->rx_opt.rcv_tsecr);
29413003
if (seq_rtt_us < 0)
29423004
return false;
29433005

3006+
/* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
3007+
* always taken together with ACK, SACK, or TS-opts. Any negative
3008+
* values will be skipped with the seq_rtt_us < 0 check above.
3009+
*/
3010+
tcp_update_rtt_min(sk, ca_rtt_us);
29443011
tcp_rtt_estimator(sk, seq_rtt_us);
29453012
tcp_set_rto(sk);
29463013

@@ -2961,7 +3028,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
29613028
rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
29623029
}
29633030

2964-
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L);
3031+
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
29653032
}
29663033

29673034

@@ -3175,7 +3242,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
31753242
ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
31763243
}
31773244

3178-
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
3245+
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3246+
ca_rtt_us);
31793247

31803248
if (flag & FLAG_ACKED) {
31813249
tcp_rearm_rto(sk);

net/ipv4/tcp_minisocks.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
470470

471471
newtp->srtt_us = 0;
472472
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
473+
newtp->rtt_min[0].rtt = ~0U;
473474
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
474475

475476
newtp->packets_out = 0;

0 commit comments

Comments
 (0)