Skip to content

Commit eb9fae3

Browse files
committed
Merge branch 'tcp-rack'
Yuchung Cheng says: ==================== RACK loss detection RACK (Recent ACK) loss recovery uses the notion of time instead of packet sequence (FACK) or counts (dupthresh). It's inspired by the FACK heuristic in tcp_mark_lost_retrans(): when a limited transmit (new data packet) is sacked in recovery, then any retransmission sent before that newly sacked packet was sent must have been lost, since at least one round trip time has elapsed. But that existing heuristic from tcp_mark_lost_retrans() has several limitations: 1) it can't detect tail drops since it depends on limited transmit 2) it's disabled upon reordering (assumes no reordering) 3) it's only enabled in fast recovery but not timeout recovery RACK addresses these limitations with a core idea: an unacknowledged packet P1 is deemed lost if a packet P2 that was sent later is s/acked, since at least one round trip has passed. Since RACK cares about the time sequence instead of the data sequence of packets, it can detect tail drops when a later retransmission is s/acked, while FACK or dupthresh can't. For reordering RACK uses a dynamically adjusted reordering window ("reo_wnd") to reduce false positives on every (small) degree of reordering, similar to the delayed Early Retransmit. In the current patch set RACK is only a supplemental loss detection and does not trigger fast recovery. However we are developing RACK to replace or consolidate FACK/dupthresh, early retransmit, and thin-dupack. These heuristics all implicitly bear the time notion. For example, the delayed Early Retransmit is simply applying RACK to trigger the fast recovery with small inflight. RACK requires measuring the minimum RTT. Tracking a global min is less robust due to traffic engineering pathing changes. Therefore it uses a windowed filter by Kathleen Nichols. The min RTT can also be useful for various other purposes like congestion control or stat monitoring. This patch has been used on Google servers for well over 1 year. 
RACK has also been implemented in the QUIC protocol. We are submitting an IETF draft as well. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents c8fdc32 + 4f41b1c commit eb9fae3

File tree

11 files changed

+286
-86
lines changed

11 files changed

+286
-86
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
384384
Defaults are calculated at boot time from amount of available
385385
memory.
386386

387+
tcp_min_rtt_wlen - INTEGER
388+
The window length of the windowed min filter to track the minimum RTT.
389+
A shorter window lets a flow more quickly pick up new (higher)
390+
minimum RTT when it is moved to a longer path (e.g., due to traffic
391+
engineering). A longer window makes the filter more resistant to RTT
392+
inflations such as transient congestion. The unit is seconds.
393+
Default: 300
394+
387395
tcp_moderate_rcvbuf - BOOLEAN
388396
If set, TCP performs receive buffer auto-tuning, attempting to
389397
automatically size the buffer (no greater than tcp_rmem[2]) to
@@ -425,6 +433,15 @@ tcp_orphan_retries - INTEGER
425433
you should think about lowering this value, such sockets
426434
may consume significant resources. Cf. tcp_max_orphans.
427435

436+
tcp_recovery - INTEGER
437+
This value is a bitmap to enable various experimental loss recovery
438+
features.
439+
440+
RACK: 0x1 enables the RACK loss detection for fast detection of lost
441+
retransmissions and tail drops.
442+
443+
Default: 0x1
444+
428445
tcp_reordering - INTEGER
429446
Initial reordering level of packets in a TCP stream.
430447
TCP stack can then dynamically adjust flow reordering level

include/linux/skbuff.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
463463
return delta_us;
464464
}
465465

466+
static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
467+
const struct skb_mstamp *t0)
468+
{
469+
s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
470+
471+
if (!diff)
472+
diff = t1->stamp_us - t0->stamp_us;
473+
return diff > 0;
474+
}
466475

467476
/**
468477
* struct sk_buff - socket buffer

include/linux/tcp.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,12 @@ struct tcp_sock {
194194
u32 window_clamp; /* Maximal window to advertise */
195195
u32 rcv_ssthresh; /* Current window clamp */
196196

197+
/* Information of the most recently (s)acked skb */
198+
struct tcp_rack {
199+
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
200+
u8 advanced; /* mstamp advanced since last lost marking */
201+
u8 reord; /* reordering detected */
202+
} rack;
197203
u16 advmss; /* Advertised MSS */
198204
u8 unused;
199205
u8 nonagle : 4,/* Disable Nagle algorithm? */
@@ -217,6 +223,9 @@ struct tcp_sock {
217223
u32 mdev_max_us; /* maximal mdev for the last rtt period */
218224
u32 rttvar_us; /* smoothed mdev_max */
219225
u32 rtt_seq; /* sequence number to update rttvar */
226+
struct rtt_meas {
227+
u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
228+
} rtt_min[3];
220229

221230
u32 packets_out; /* Packets which are "in flight" */
222231
u32 retrans_out; /* Retransmitted packets out */
@@ -280,8 +289,6 @@ struct tcp_sock {
280289
int lost_cnt_hint;
281290
u32 retransmit_high; /* L-bits may be on up to this seqno */
282291

283-
u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */
284-
285292
u32 prior_ssthresh; /* ssthresh saved at recovery start */
286293
u32 high_seq; /* snd_nxt at onset of congestion */
287294

include/net/tcp.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
279279
extern int sysctl_tcp_challenge_ack_limit;
280280
extern unsigned int sysctl_tcp_notsent_lowat;
281281
extern int sysctl_tcp_min_tso_segs;
282+
extern int sysctl_tcp_min_rtt_wlen;
282283
extern int sysctl_tcp_autocorking;
283284
extern int sysctl_tcp_invalid_ratelimit;
284285
extern int sysctl_tcp_pacing_ss_ratio;
@@ -566,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
566567
void tcp_rearm_rto(struct sock *sk);
567568
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
568569
void tcp_reset(struct sock *sk);
570+
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
569571

570572
/* tcp_timer.c */
571573
void tcp_init_xmit_timers(struct sock *);
@@ -671,6 +673,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
671673
return dst_metric_locked(dst, RTAX_CC_ALGO);
672674
}
673675

676+
/* Minimum RTT in usec. ~0 means not available. */
677+
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
678+
{
679+
return tp->rtt_min[0].rtt;
680+
}
681+
674682
/* Compute the actual receive window we are currently advertising.
675683
* Rcv_nxt can be after the window if our peer push more data
676684
* than the offered window.
@@ -1743,6 +1751,19 @@ int tcpv4_offload_init(void);
17431751
void tcp_v4_init(void);
17441752
void tcp_init(void);
17451753

1754+
/* tcp_recovery.c */
1755+
1756+
/* Flags to enable various loss recovery features. See below */
1757+
extern int sysctl_tcp_recovery;
1758+
1759+
/* Use TCP RACK to detect (some) tail and retransmit losses */
1760+
#define TCP_RACK_LOST_RETRANS 0x1
1761+
1762+
extern int tcp_rack_mark_lost(struct sock *sk);
1763+
1764+
extern void tcp_rack_advance(struct tcp_sock *tp,
1765+
const struct skb_mstamp *xmit_time, u8 sacked);
1766+
17461767
/*
17471768
* Save and compile IPv4 options, return a pointer to it
17481769
*/

net/ipv4/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
88
inet_timewait_sock.o inet_connection_sock.o \
99
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
1010
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
11+
tcp_recovery.o \
1112
tcp_offload.o datagram.o raw.o udp.o udplite.o \
1213
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
1314
fib_frontend.o fib_semantics.o fib_trie.o \

net/ipv4/sysctl_net_ipv4.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = {
495495
.mode = 0644,
496496
.proc_handler = proc_dointvec
497497
},
498+
{
499+
.procname = "tcp_recovery",
500+
.data = &sysctl_tcp_recovery,
501+
.maxlen = sizeof(int),
502+
.mode = 0644,
503+
.proc_handler = proc_dointvec,
504+
},
498505
{
499506
.procname = "tcp_reordering",
500507
.data = &sysctl_tcp_reordering,
@@ -576,6 +583,13 @@ static struct ctl_table ipv4_table[] = {
576583
.mode = 0644,
577584
.proc_handler = proc_dointvec
578585
},
586+
{
587+
.procname = "tcp_min_rtt_wlen",
588+
.data = &sysctl_tcp_min_rtt_wlen,
589+
.maxlen = sizeof(int),
590+
.mode = 0644,
591+
.proc_handler = proc_dointvec
592+
},
579593
{
580594
.procname = "tcp_low_latency",
581595
.data = &sysctl_tcp_low_latency,

net/ipv4/tcp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
388388

389389
icsk->icsk_rto = TCP_TIMEOUT_INIT;
390390
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
391+
tp->rtt_min[0].rtt = ~0U;
391392

392393
/* So many TCP implementations out there (incorrectly) count the
393394
* initial SYN frame in their delayed-ACK and congestion control

0 commit comments

Comments
 (0)