
Commit 1f25569

Prj2223 authored and davem330 committed
tcp: higher throughput under reordering with adaptive RACK reordering wnd
Currently TCP RACK loss detection does not work well if packets are
reordered beyond its static reordering window (min_rtt/4). Under such
reordering it may falsely trigger loss recoveries and reduce TCP
throughput significantly. This patch improves that by making RACK's
reo_wnd adaptive, based on DSACK (which is now supported in major TCP
implementations) and the number of recoveries:

- If a DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
  by srtt), since there is a possibility that the spurious
  retransmission was due to a reordering delay longer than reo_wnd.

- Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
  successful recoveries (this accounts for full DSACK-based loss
  recovery undo). After that, reset it to the default (min_rtt/4).

- reo_wnd is incremented at most once per RTT, so that the new DSACK
  being reacted to is (approximately) due to a spurious retransmission
  sent after the last reo_wnd update.

- reo_wnd is tracked in steps (of min_rtt/4) rather than as an
  absolute value, to account for changes in RTT.

In our internal testing we observed a significant throughput increase
in scenarios where reordering exceeds min_rtt/4 (the previous static
value).

Signed-off-by: Priyaranjan Jha <[email protected]>
Signed-off-by: Yuchung Cheng <[email protected]>
Signed-off-by: Neal Cardwell <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 6c49b5e commit 1f25569
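
To make the window arithmetic in the message above concrete, here is a
minimal userspace C sketch (hypothetical helper, not kernel code) of
how the effective reordering window follows from the step count. Note
that the kernel stores the smoothed RTT left-shifted by 3, which is why
the net/ipv4/tcp_recovery.c hunk below compares against
tp->srtt_us >> 3; here srtt_us is the plain value.

#include <stdint.h>

#define RACK_MIN_REO_WND_US 1000U /* 1 msec floor, as in tcp_rack_detect_loss() */

static uint32_t rack_reo_wnd_us(uint32_t min_rtt_us, uint32_t srtt_us,
				uint8_t reo_wnd_steps)
{
	/* The window grows in steps of min_rtt/4 ... */
	uint32_t wnd = (min_rtt_us >> 2) * reo_wnd_steps;

	if (wnd < RACK_MIN_REO_WND_US)
		wnd = RACK_MIN_REO_WND_US;
	/* ... but is capped at srtt, so repeated DSACKs cannot grow it
	 * without bound. */
	if (wnd > srtt_us)
		wnd = srtt_us;
	return wnd;
}

For example, with min_rtt = 40 ms (and srtt well above it), one
DSACK-triggered bump moves the window from 10 ms (one step) to 20 ms
(two steps), doubling the reordering delay tolerated before a packet is
marked lost.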

File tree

7 files changed: +68 −4 lines changed


Documentation/networking/ip-sysctl.txt

Lines changed: 1 addition & 0 deletions
@@ -454,6 +454,7 @@ tcp_recovery - INTEGER
 
 	RACK: 0x1 enables the RACK loss detection for fast detection of lost
 	      retransmissions and tail drops.
+	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
 
 	Default: 0x1
 
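Usage note (values per the documentation above): the tcp_recovery bits
combine, so "sysctl -w net.ipv4.tcp_recovery=1" keeps RACK with the
adaptive window introduced by this patch, while
"sysctl -w net.ipv4.tcp_recovery=3" (0x1 | 0x2) keeps RACK loss
detection but pins the reordering window to the static min_rtt/4.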

include/linux/tcp.h

Lines changed: 7 additions & 2 deletions
@@ -210,8 +210,13 @@ struct tcp_sock {
 		u64 mstamp;	/* (Re)sent time of the skb */
 		u32 rtt_us;	/* Associated RTT */
 		u32 end_seq;	/* Ending TCP sequence of the skb */
-		u8 advanced;	/* mstamp advanced since last lost marking */
-		u8 reord;	/* reordering detected */
+		u32 last_delivered; /* tp->delivered at last reo_wnd adj */
+		u8 reo_wnd_steps;   /* Allowed reordering window */
+#define TCP_RACK_RECOVERY_THRESH 16
+		u8 reo_wnd_persist:5, /* No. of recovery since last adj */
+		   dsack_seen:1, /* Whether DSACK seen after last adj */
+		   advanced:1,	 /* mstamp advanced since last lost marking */
+		   reord:1;	 /* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS */
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */

include/net/tcp.h

Lines changed: 2 additions & 0 deletions
@@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 
 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
+#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
+extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
 
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)

net/ipv4/tcp.c

Lines changed: 1 addition & 0 deletions
@@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk)
 	tcp_assign_congestion_control(sk);
 
 	tp->tsoffset = 0;
+	tp->rack.reo_wnd_steps = 1;
 
 	sk->sk_state = TCP_CLOSE;
 

net/ipv4/tcp_input.c

Lines changed: 7 additions & 0 deletions
@@ -856,6 +856,7 @@ void tcp_disable_fack(struct tcp_sock *tp)
 static void tcp_dsack_seen(struct tcp_sock *tp)
 {
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
+	tp->rack.dsack_seen = 1;
 }
 
 static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -2408,6 +2409,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
 			mib_idx = LINUX_MIB_TCPFULLUNDO;
 
 		NET_INC_STATS(sock_net(sk), mib_idx);
+	} else if (tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_persist--;
 	}
 	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
 		/* Hold old state until something *above* high_seq
@@ -2427,6 +2430,8 @@ static bool tcp_try_undo_dsack(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tp->undo_marker && !tp->undo_retrans) {
+		tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
+					       tp->rack.reo_wnd_persist + 1);
 		DBGUNDO(sk, "D-SACK");
 		tcp_undo_cwnd_reduction(sk, false);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
@@ -3644,6 +3649,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	tcp_rack_update_reo_wnd(sk, &rs);
+
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 	/* If needed, reset TLP/RTO timer; RACK may later override this. */
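
Taken together, the two tcp_try_undo_* hunks above implement a simple
budget: a DSACK-based undo recharges reo_wnd_persist (capped at
TCP_RACK_RECOVERY_THRESH), and each recovery that completes without a
full undo spends one unit. A minimal standalone C sketch of that
bookkeeping (hypothetical helpers, not kernel code):

#include <stdint.h>

#define TCP_RACK_RECOVERY_THRESH 16

/* Called where tcp_try_undo_dsack() bumps the counter: one more
 * spurious recovery was detected, so keep the wider window longer. */
static void on_dsack_undo(uint8_t *reo_wnd_persist)
{
	if (*reo_wnd_persist < TCP_RACK_RECOVERY_THRESH)
		(*reo_wnd_persist)++;
}

/* Called where tcp_try_undo_recovery() decrements: a genuine recovery
 * consumed one unit of the persistence budget. */
static void on_genuine_recovery(uint8_t *reo_wnd_persist)
{
	if (*reo_wnd_persist)
		(*reo_wnd_persist)--;
}

Once the budget reaches zero, tcp_rack_update_reo_wnd() (see
net/ipv4/tcp_recovery.c below) resets reo_wnd_steps to the default.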

net/ipv4/tcp_minisocks.c

Lines changed: 4 additions & 0 deletions
@@ -551,6 +551,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		newtp->syn_data_acked = 0;
 		newtp->rack.mstamp = 0;
 		newtp->rack.advanced = 0;
+		newtp->rack.reo_wnd_steps = 1;
+		newtp->rack.last_delivered = 0;
+		newtp->rack.reo_wnd_persist = 0;
+		newtp->rack.dsack_seen = 0;
 
 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}

net/ipv4/tcp_recovery.c

Lines changed: 46 additions & 2 deletions
@@ -44,6 +44,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 min_rtt = tcp_min_rtt(tp);
 	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
@@ -54,8 +55,10 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	 * to queuing or delayed ACKs.
 	 */
 	reo_wnd = 1000;
-	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
-		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+	if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) {
+		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
+		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
+	}
 
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
@@ -160,3 +163,44 @@ void tcp_rack_reo_timeout(struct sock *sk)
 	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
 		tcp_rearm_rto(sk);
 }
+
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is possibility that spurious retransmission was
+ * due to reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * no. of successful recoveries (accounts for full DSACK-based loss
+ * recovery undo). After that, reset it to default (min_rtt/4).
+ *
+ * At max, reo_wnd is incremented only once per rtt. So that the new
+ * DSACK on which we are reacting, is due to the spurious retx (approx)
+ * after the reo_wnd has been updated last time.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
+	    !rs->prior_delivered)
+		return;
+
+	/* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+	if (before(rs->prior_delivered, tp->rack.last_delivered))
+		tp->rack.dsack_seen = 0;
+
+	/* Adjust the reo_wnd if update is pending */
+	if (tp->rack.dsack_seen) {
+		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+					       tp->rack.reo_wnd_steps + 1);
+		tp->rack.dsack_seen = 0;
+		tp->rack.last_delivered = tp->delivered;
+		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+	} else if (!tp->rack.reo_wnd_persist) {
+		tp->rack.reo_wnd_steps = 1;
+	}
+}
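
The once-per-RTT gate in tcp_rack_update_reo_wnd() above works purely
on cumulative delivered-packet counters rather than timestamps: a rate
sample whose prior_delivered predates rack.last_delivered belongs to
data sent before the last adjustment, i.e. less than one round trip has
elapsed, so its DSACK is disregarded. A standalone C sketch of that
comparison (hypothetical helpers, not kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Wrap-safe "a is strictly before b", like the kernel's before(),
 * applied here to delivered-packet counts. */
static bool delivered_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* True if the just-acked data was sent after the last reo_wnd bump:
 * prior_delivered is tp->delivered when that data was sent,
 * last_delivered is tp->delivered at the last adjustment. */
static bool full_rtt_since_last_adj(uint32_t prior_delivered,
				    uint32_t last_delivered)
{
	return !delivered_before(prior_delivered, last_delivered);
}

Counting in delivered packets rather than wall time means the gate
spans exactly one flight of data, which approximates one RTT without
needing another timestamp in struct tcp_sock.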
