
Commit 10e361e

Merge branch 'tcp-default-RACK-loss-recovery'
Yuchung Cheng says:

====================
tcp: default RACK loss recovery

This patch set implements the features corresponding to the
draft-ietf-tcpm-rack-03 version of the RACK draft.
https://datatracker.ietf.org/meeting/101/materials/slides-101-tcpm-update-on-tcp-rack-00

1. SACK: implement an equivalent DUPACK threshold heuristic in RACK to
   replace the existing RFC6675 recovery (tcp_mark_head_lost).

2. Non-SACK: simplify the RFC6582 NewReno implementation.

3. RTO: apply RACK's time-based approach to avoid spuriously marking
   very recently sent packets lost.

4. With (1)(2)(3), make RACK the exclusive fast recovery mechanism to
   mark losses based on time on S/ACK. Tail loss probe and F-RTO remain
   enabled by default as complementary mechanisms to send probes in
   CA_Open and CA_Loss states. The probes would solicit S/ACKs to
   trigger RACK time-based loss detection.

All Google web and internal servers have been running RACK-only mode
(4) for a while now. A/B experiments indicate that RACK/TLP on average
reduces recovery latency by 10% compared to RFC6675. RFC6675 is
default-off now but can be re-enabled by disabling RACK (sysctl
net.ipv4.tcp_recovery=0) in case of unseen issues.
====================

Signed-off-by: David S. Miller <[email protected]>
2 parents 9611d6d + 56f8c5d commit 10e361e
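The core idea behind items (3) and (4) above is that a packet is declared lost only once it has gone un-(s)acked for roughly one RTT plus a reordering window, rather than after a fixed DUPACK count. Below is a minimal user-space sketch of that timing test (not kernel code; the helper name, the microsecond clock, and the example values are illustrative assumptions that loosely mirror tcp_rack_skb_timeout() in the diffs below):

#include <stdint.h>
#include <stdio.h>

/* Packet state needed for the decision: when it was last (re)transmitted. */
struct pkt {
	uint64_t xmit_time_us;
};

/* Time (in us) the packet may still wait before being marked lost;
 * a value <= 0 means "lost now" (cf. tcp_rack_skb_timeout below). */
static int64_t rack_skb_timeout_us(uint64_t now_us, uint64_t rack_rtt_us,
				   uint64_t reo_wnd_us, const struct pkt *p)
{
	return (int64_t)(rack_rtt_us + reo_wnd_us) -
	       (int64_t)(now_us - p->xmit_time_us);
}

int main(void)
{
	uint64_t now = 1000000;			/* arbitrary "current" timestamp */
	uint64_t rtt = 20000, reo_wnd = 5000;	/* 20 ms RTT, 5 ms (= rtt/4) window */
	struct pkt old_pkt = { .xmit_time_us = now - 30000 };	/* sent 30 ms ago */
	struct pkt new_pkt = { .xmit_time_us = now - 10000 };	/* sent 10 ms ago */

	/* -5000 us: already past rtt + reo_wnd, so mark it lost. */
	printf("old pkt: %lld us remaining\n",
	       (long long)rack_skb_timeout_us(now, rtt, reo_wnd, &old_pkt));
	/* 15000 us: still within the window; keep waiting (arm a timer instead). */
	printf("new pkt: %lld us remaining\n",
	       (long long)rack_skb_timeout_us(now, rtt, reo_wnd, &new_pkt));
	return 0;
}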

4 files changed: 124 additions & 64 deletions


Documentation/networking/ip-sysctl.txt

Lines changed: 3 additions & 1 deletion
@@ -449,8 +449,10 @@ tcp_recovery - INTEGER
 	features.
 
 	RACK: 0x1 enables the RACK loss detection for fast detection of lost
-	      retransmissions and tail drops.
+	      retransmissions and tail drops. It also subsumes and disables
+	      RFC6675 recovery for SACK connections.
 	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
+	RACK: 0x4 disables RACK's DUPACK threshold heuristic
 
 	Default: 0x1
 
include/net/tcp.h

Lines changed: 5 additions & 0 deletions
@@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3];
 
 #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
 #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
+#define TCP_RACK_NO_DUPTHRESH	0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1876,6 +1877,10 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
+extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
+				u32 reo_wnd);
 extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
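The TCP_RACK_* values just added are bits in the net.ipv4.tcp_recovery mask described in the ip-sysctl.txt hunk earlier, so they can be combined. The constants below match the patch; the decoding code is only an illustrative user-space sketch of how a setting such as 5 (0x1 | 0x4) reads:

#include <stdio.h>

/* Bit values from the patch (include/net/tcp.h). */
#define TCP_RACK_LOSS_DETECTION	0x1	/* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND	0x2	/* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH	0x4	/* Do not use DUPACK threshold in RACK */

int main(void)
{
	/* e.g. net.ipv4.tcp_recovery = 5 sets 0x1 | 0x4 */
	unsigned int tcp_recovery = TCP_RACK_LOSS_DETECTION | TCP_RACK_NO_DUPTHRESH;

	printf("RACK loss detection : %s\n",
	       tcp_recovery & TCP_RACK_LOSS_DETECTION ? "on (RFC6675 subsumed)" : "off");
	printf("static reo window   : %s\n",
	       tcp_recovery & TCP_RACK_STATIC_REO_WND ? "on (min_rtt/4)" : "off (adaptive)");
	printf("DUPACK threshold    : %s\n",
	       tcp_recovery & TCP_RACK_NO_DUPTHRESH ? "disabled" : "enabled");
	return 0;
}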

net/ipv4/tcp_input.c

Lines changed: 53 additions & 46 deletions
@@ -1917,19 +1917,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
 	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *head;
+	bool is_reneg; /* is receiver reneging on SACKs? */
+
+	head = tcp_rtx_queue_head(sk);
+	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
+	} else if (tcp_is_reno(tp)) {
+		tcp_reset_reno_sack(tp);
+	}
+
+	skb = head;
+	skb_rbtree_walk_from(skb) {
+		if (is_reneg)
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		else if (tcp_is_rack(sk) && skb != head &&
+			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
+			continue; /* Don't mark recently sent ones lost yet */
+		tcp_mark_skb_lost(sk, skb);
+	}
+	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
 void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
-	bool is_reneg; /* is receiver reneging on SACKs? */
-	bool mark_lost;
+
+	tcp_timeout_mark_lost(sk);
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1941,40 +1976,10 @@ void tcp_enter_loss(struct sock *sk)
 		tcp_ca_event(sk, CA_EVENT_LOSS);
 		tcp_init_undo(tp);
 	}
-	tp->snd_cwnd = 1;
+	tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	tp->retrans_out = 0;
-	tp->lost_out = 0;
-
-	if (tcp_is_reno(tp))
-		tcp_reset_reno_sack(tp);
-
-	skb = tcp_rtx_queue_head(sk);
-	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
-	if (is_reneg) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
-		tp->sacked_out = 0;
-		/* Mark SACK reneging until we recover from this loss event. */
-		tp->is_sack_reneg = 1;
-	}
-	tcp_clear_all_retrans_hints(tp);
-
-	skb_rbtree_walk_from(skb) {
-		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			     is_reneg);
-		if (mark_lost)
-			tcp_sum_lost(tp, skb);
-		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (mark_lost) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-		}
-	}
-	tcp_verify_left_out(tp);
-
 	/* Timeout in disordered state after receiving substantial DUPACKs
 	 * suggests that the degree of reordering is over-estimated.
 	 */
@@ -2141,7 +2146,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 		return true;
 
 	/* Not-A-Trick#2 : Classic rule... */
-	if (tcp_dupack_heuristics(tp) > tp->reordering)
+	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
 	return false;
@@ -2218,9 +2223,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1, 1);
-	} else {
+	if (tcp_is_sack(tp)) {
 		int sacked_upto = tp->sacked_out - tp->reordering;
 		if (sacked_upto >= 0)
 			tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2718,12 +2721,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
 	return false;
 }
 
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Use RACK to detect loss */
-	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+	if (tcp_rtx_queue_empty(sk))
+		return;
+
+	if (unlikely(tcp_is_reno(tp))) {
+		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+	} else if (tcp_is_rack(sk)) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk);
@@ -2819,11 +2826,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			tcp_try_keep_open(sk);
 			return;
 		}
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
@@ -2840,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 		tcp_try_undo_dsack(sk);
 
-	tcp_rack_identify_loss(sk, ack_flag);
+	tcp_identify_packet_loss(sk, ack_flag);
 	if (!tcp_time_to_recover(sk, flag)) {
 		tcp_try_to_open(sk, flag);
 		return;
@@ -2862,7 +2869,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		fast_rexmit = 1;
 	}
 
-	if (do_lost)
+	if (!tcp_is_rack(sk) && do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
 	*rexmit = REXMIT_LOST;
 }

net/ipv4/tcp_recovery.c

Lines changed: 63 additions & 17 deletions
@@ -2,7 +2,7 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
-static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
+u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rack.reord) {
+		/* If reordering has not been observed, be aggressive during
+		 * the recovery or starting the recovery by DUPACK threshold.
+		 */
+		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+			return 0;
+
+		if (tp->sacked_out >= tp->reordering &&
+		    !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+			return 0;
+	}
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay.
+	 * Use min_rtt instead of the smoothed RTT because reordering is
+	 * often a path property and less related to queuing or delayed ACKs.
+	 * Upon receiving DSACKs, linearly increase the window up to the
+	 * smoothed RTT.
	 */
+	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+		   tp->srtt_us >> 3);
+}
+
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+	return tp->rack.rtt_us + reo_wnd -
+	       tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+}
+
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
  *
  * Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 min_rtt = tcp_min_rtt(tp);
 	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
-	/* To be more reordering resilient, allow min_rtt/4 settling delay
-	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
-	 * RTT because reordering is often a path property and less related
-	 * to queuing or delayed ACKs.
-	 */
-	reo_wnd = 1000;
-	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
-	    min_rtt != ~0U) {
-		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
-		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
-	}
-
+	reo_wnd = tcp_rack_reo_wnd(sk);
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		/* A packet is lost if it has not been s/acked beyond
 		 * the recent RTT plus the reordering window.
 		 */
-		remaining = tp->rack.rtt_us + reo_wnd -
-			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
 		if (remaining <= 0) {
-			tcp_rack_mark_skb_lost(sk, skb);
+			tcp_mark_skb_lost(sk, skb);
 			list_del_init(&skb->tcp_tsorted_anchor);
 		} else {
 			/* Record maximum wait time */
@@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
 		tp->rack.reo_wnd_steps = 1;
 	}
 }
+
+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+	const u8 state = inet_csk(sk)->icsk_ca_state;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+	    (state == TCP_CA_Recovery && snd_una_advanced)) {
+		struct sk_buff *skb = tcp_rtx_queue_head(sk);
+		u32 mss;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+			return;
+
+		mss = tcp_skb_mss(skb);
+		if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+			tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				     mss, mss, GFP_ATOMIC);
+
+		tcp_skb_mark_lost_uncond_verify(tp, skb);
+	}
+}
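For a feel of the adaptive reordering window computed by tcp_rack_reo_wnd() above: it starts at min_rtt/4, grows linearly with rack.reo_wnd_steps (bumped on DSACKs by the existing tcp_rack_update_reo_wnd()), and is capped at the smoothed RTT. The following user-space sketch shows only that arithmetic, with made-up RTT values; note the kernel stores srtt left-shifted by 3 (hence the >> 3 in the patch), while the sketch uses plain microseconds:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: min_rtt/4 per step, capped at the smoothed RTT. */
static uint32_t rack_reo_wnd_us(uint32_t min_rtt_us, uint32_t srtt_us,
				uint32_t reo_wnd_steps)
{
	uint32_t wnd = (min_rtt_us >> 2) * reo_wnd_steps;

	return wnd < srtt_us ? wnd : srtt_us;
}

int main(void)
{
	/* min_rtt = 20 ms, srtt = 25 ms */
	printf("1 step : %u us\n", rack_reo_wnd_us(20000, 25000, 1)); /* 5000 */
	printf("4 steps: %u us\n", rack_reo_wnd_us(20000, 25000, 4)); /* 20000 */
	printf("8 steps: %u us\n", rack_reo_wnd_us(20000, 25000, 8)); /* capped at 25000 */
	return 0;
}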
