Skip to content

Commit 3e59cb0

Browse files
yuchungchengdavem330
authored andcommitted
tcp: remove bad timeout logic in fast recovery
tcp_timeout_skb() was intended to trigger fast recovery on timeout, unfortunately in reality it often causes spurious retransmission storms during fast recovery. The particular sign is a fast retransmit over the highest sacked sequence (SND.FACK). Currently the RTO timer re-arming (as in RFC6298) offers a nice cushion to avoid spurious timeout: when SND.UNA advances the sender re-arms RTO and extends the timeout by icsk_rto. The sender does not offset the time elapsed since the packet at SND.UNA was sent. But if the next (DUP)ACK arrives later than ~RTTVAR and triggers tcp_fastretrans_alert(), then tcp_timeout_skb() will mark any packet sent before the icsk_rto interval lost, including one that's above the highest sacked sequence. Most likely a large part of scorebard will be marked. If most packets are not lost then the subsequent DUPACKs with new SACK blocks will cause the sender to continue to retransmit packets beyond SND.FACK spuriously. Even if only one packet is lost the sender may falsely retransmit almost the entire window. The situation becomes common in the world of bufferbloat: the RTT continues to grow as the queue builds up but RTTVAR remains small and close to the minimum 200ms. If a data packet is lost and the DUPACK triggered by the next data packet is slightly delayed, then a spurious retransmission storm forms. As the original comment on tcp_timeout_skb() suggests: the usefulness of this feature is questionable. It also wastes cycles walking the sack scoreboard and is actually harmful because of false recovery. It's time to remove this. Signed-off-by: Yuchung Cheng <[email protected]> Acked-by: Eric Dumazet <[email protected]> Acked-by: Neal Cardwell <[email protected]> Acked-by: Nandita Dukkipati <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 3cc7587 commit 3e59cb0

File tree

3 files changed

+1
-66
lines changed

3 files changed

+1
-66
lines changed

include/linux/tcp.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,6 @@ struct tcp_sock {
246246

247247
/* from STCP, retrans queue hinting */
248248
struct sk_buff* lost_skb_hint;
249-
struct sk_buff *scoreboard_skb_hint;
250249
struct sk_buff *retransmit_skb_hint;
251250

252251
struct sk_buff_head out_of_order_queue; /* Out of order segments go here */

include/net/tcp.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1193,7 +1193,6 @@ static inline void tcp_mib_init(struct net *net)
11931193
static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
11941194
{
11951195
tp->lost_skb_hint = NULL;
1196-
tp->scoreboard_skb_hint = NULL;
11971196
}
11981197

11991198
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)

net/ipv4/tcp_input.c

Lines changed: 1 addition & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,8 +1255,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
12551255

12561256
if (skb == tp->retransmit_skb_hint)
12571257
tp->retransmit_skb_hint = prev;
1258-
if (skb == tp->scoreboard_skb_hint)
1259-
tp->scoreboard_skb_hint = prev;
12601258
if (skb == tp->lost_skb_hint) {
12611259
tp->lost_skb_hint = prev;
12621260
tp->lost_cnt_hint -= tcp_skb_pcount(prev);
@@ -1964,20 +1962,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
19641962
return true;
19651963
}
19661964

1967-
static inline int tcp_skb_timedout(const struct sock *sk,
1968-
const struct sk_buff *skb)
1969-
{
1970-
return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
1971-
}
1972-
1973-
static inline int tcp_head_timedout(const struct sock *sk)
1974-
{
1975-
const struct tcp_sock *tp = tcp_sk(sk);
1976-
1977-
return tp->packets_out &&
1978-
tcp_skb_timedout(sk, tcp_write_queue_head(sk));
1979-
}
1980-
19811965
/* Linux NewReno/SACK/FACK/ECN state machine.
19821966
* --------------------------------------
19831967
*
@@ -2084,12 +2068,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
20842068
if (tcp_dupack_heuristics(tp) > tp->reordering)
20852069
return true;
20862070

2087-
/* Trick#3 : when we use RFC2988 timer restart, fast
2088-
* retransmit can be triggered by timeout of queue head.
2089-
*/
2090-
if (tcp_is_fack(tp) && tcp_head_timedout(sk))
2091-
return true;
2092-
20932071
/* Trick#4: It is still not OK... But will it be useful to delay
20942072
* recovery more?
20952073
*/
@@ -2126,44 +2104,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
21262104
return false;
21272105
}
21282106

2129-
/* New heuristics: it is possible only after we switched to restart timer
2130-
* each time when something is ACKed. Hence, we can detect timed out packets
2131-
* during fast retransmit without falling to slow start.
2132-
*
2133-
* Usefulness of this as is very questionable, since we should know which of
2134-
* the segments is the next to timeout which is relatively expensive to find
2135-
* in general case unless we add some data structure just for that. The
2136-
* current approach certainly won't find the right one too often and when it
2137-
* finally does find _something_ it usually marks large part of the window
2138-
* right away (because a retransmission with a larger timestamp blocks the
2139-
* loop from advancing). -ij
2140-
*/
2141-
static void tcp_timeout_skbs(struct sock *sk)
2142-
{
2143-
struct tcp_sock *tp = tcp_sk(sk);
2144-
struct sk_buff *skb;
2145-
2146-
if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
2147-
return;
2148-
2149-
skb = tp->scoreboard_skb_hint;
2150-
if (tp->scoreboard_skb_hint == NULL)
2151-
skb = tcp_write_queue_head(sk);
2152-
2153-
tcp_for_write_queue_from(skb, sk) {
2154-
if (skb == tcp_send_head(sk))
2155-
break;
2156-
if (!tcp_skb_timedout(sk, skb))
2157-
break;
2158-
2159-
tcp_skb_mark_lost(tp, skb);
2160-
}
2161-
2162-
tp->scoreboard_skb_hint = skb;
2163-
2164-
tcp_verify_left_out(tp);
2165-
}
2166-
21672107
/* Detect loss in event "A" above by marking head of queue up as lost.
21682108
* For FACK or non-SACK(Reno) senders, the first "packets" number of segments
21692109
* are considered lost. For RFC3517 SACK, a segment is considered lost if it
@@ -2249,8 +2189,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
22492189
else if (fast_rexmit)
22502190
tcp_mark_head_lost(sk, 1, 1);
22512191
}
2252-
2253-
tcp_timeout_skbs(sk);
22542192
}
22552193

22562194
/* CWND moderation, preventing bursts due to too big ACKs
@@ -2842,7 +2780,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
28422780
fast_rexmit = 1;
28432781
}
28442782

2845-
if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
2783+
if (do_lost)
28462784
tcp_update_scoreboard(sk, fast_rexmit);
28472785
tcp_cwnd_reduction(sk, newly_acked_sacked, fast_rexmit);
28482786
tcp_xmit_retransmit_queue(sk);
@@ -3075,7 +3013,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
30753013

30763014
tcp_unlink_write_queue(skb, sk);
30773015
sk_wmem_free_skb(sk, skb);
3078-
tp->scoreboard_skb_hint = NULL;
30793016
if (skb == tp->retransmit_skb_hint)
30803017
tp->retransmit_skb_hint = NULL;
30813018
if (skb == tp->lost_skb_hint)

0 commit comments

Comments
 (0)