Skip to content

Commit dfa2f04

Browse files
Eric Dumazet authored and kuba-moo committed
tcp: get rid of sysctl_tcp_adv_win_scale
With modern NIC drivers shifting to full page allocations per received frame, we face the following issue: TCP has one per-netns sysctl used to tweak how to translate memory use into an expected payload (RWIN) in the RX path. The tcp_win_from_space() implementation is limited to a few cases. For hosts dealing with various MSS, we either underestimate or overestimate the RWIN we send to the remote peers. For instance, with the default sysctl_tcp_adv_win_scale value, we expect to store 50% of payload per allocated chunk of memory. For the typical use of MTU=1500 traffic, and order-0 page allocations by NIC drivers, we are sending too big an RWIN, leading to potential tcp collapse operations, which are extremely expensive and a source of latency spikes. This patch makes sysctl_tcp_adv_win_scale obsolete, and instead uses a per-socket scaling factor, so that we can precisely adjust the RWIN based on the effective skb->len/skb->truesize ratio. This patch alone can double TCP receive performance when receivers are too slow to drain their receive queue, or by allowing a bigger RWIN when MSS is close to PAGE_SIZE. Signed-off-by: Eric Dumazet <[email protected]> Acked-by: Soheil Hassas Yeganeh <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 63c8778 commit dfa2f04

File tree

6 files changed

+43
-18
lines changed

6 files changed

+43
-18
lines changed

Documentation/networking/ip-sysctl.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN
321321
option can harm clients of your server.
322322

323323
tcp_adv_win_scale - INTEGER
324+
Obsolete since linux-6.6
324325
Count buffering overhead as bytes/2^tcp_adv_win_scale
325326
(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
326327
if it is <= 0.

include/linux/tcp.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
172172
return (struct tcp_request_sock *)req;
173173
}
174174

175+
#define TCP_RMEM_TO_WIN_SCALE 8
176+
175177
struct tcp_sock {
176178
/* inet_connection_sock has to be the first member of tcp_sock */
177179
struct inet_connection_sock inet_conn;
@@ -238,7 +240,7 @@ struct tcp_sock {
238240

239241
u32 window_clamp; /* Maximal window to advertise */
240242
u32 rcv_ssthresh; /* Current window clamp */
241-
243+
u8 scaling_ratio; /* see tcp_win_from_space() */
242244
/* Information of the most recently (s)acked skb */
243245
struct tcp_rack {
244246
u64 mstamp; /* (Re)sent time of the skb */

include/net/netns/ipv4.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ struct netns_ipv4 {
152152
u8 sysctl_tcp_abort_on_overflow;
153153
u8 sysctl_tcp_fack; /* obsolete */
154154
int sysctl_tcp_max_reordering;
155-
int sysctl_tcp_adv_win_scale;
155+
int sysctl_tcp_adv_win_scale; /* obsolete */
156156
u8 sysctl_tcp_dsack;
157157
u8 sysctl_tcp_app_win;
158158
u8 sysctl_tcp_frto;

include/net/tcp.h

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
14341434

14351435
static inline int tcp_win_from_space(const struct sock *sk, int space)
14361436
{
1437-
int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
1437+
s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
14381438

1439-
return tcp_adv_win_scale <= 0 ?
1440-
(space>>(-tcp_adv_win_scale)) :
1441-
space - (space>>tcp_adv_win_scale);
1439+
return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
1440+
}
1441+
1442+
/* inverse of tcp_win_from_space() */
1443+
static inline int tcp_space_from_win(const struct sock *sk, int win)
1444+
{
1445+
u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
1446+
1447+
do_div(val, tcp_sk(sk)->scaling_ratio);
1448+
return val;
1449+
}
1450+
1451+
static inline void tcp_scaling_ratio_init(struct sock *sk)
1452+
{
1453+
/* Assume a conservative default of 1200 bytes of payload per 4K page.
1454+
* This may be adjusted later in tcp_measure_rcv_mss().
1455+
*/
1456+
tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
1457+
SKB_TRUESIZE(4096);
14421458
}
14431459

14441460
/* Note: caller must be prepared to deal with negative returns */

net/ipv4/tcp.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
457457

458458
WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
459459
WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
460+
tcp_scaling_ratio_init(sk);
460461

461462
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
462463
sk_sockets_allocated_inc(sk);
@@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len);
17001701
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
17011702
int tcp_set_rcvlowat(struct sock *sk, int val)
17021703
{
1703-
int cap;
1704+
int space, cap;
17041705

17051706
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
17061707
cap = sk->sk_rcvbuf >> 1;
@@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
17151716
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
17161717
return 0;
17171718

1718-
val <<= 1;
1719-
if (val > sk->sk_rcvbuf) {
1720-
WRITE_ONCE(sk->sk_rcvbuf, val);
1721-
tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1719+
space = tcp_space_from_win(sk, val);
1720+
if (space > sk->sk_rcvbuf) {
1721+
WRITE_ONCE(sk->sk_rcvbuf, space);
1722+
tcp_sk(sk)->window_clamp = val;
17221723
}
17231724
return 0;
17241725
}

net/ipv4/tcp_input.c

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
237237
*/
238238
len = skb_shinfo(skb)->gso_size ? : skb->len;
239239
if (len >= icsk->icsk_ack.rcv_mss) {
240+
/* Note: divides are still a bit expensive.
241+
* For the moment, only adjust scaling_ratio
242+
* when we update icsk_ack.rcv_mss.
243+
*/
244+
if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
245+
u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
246+
247+
do_div(val, skb->truesize);
248+
tcp_sk(sk)->scaling_ratio = val ? val : 1;
249+
}
240250
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
241251
tcp_sk(sk)->advmss);
242252
/* Account for possibly-removed options */
@@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
727737

728738
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
729739
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
730-
int rcvmem, rcvbuf;
731740
u64 rcvwin, grow;
741+
int rcvbuf;
732742

733743
/* minimal window to cope with packet losses, assuming
734744
* steady state. Add some cushion because of small variations.
@@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
740750
do_div(grow, tp->rcvq_space.space);
741751
rcvwin += (grow << 1);
742752

743-
rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
744-
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
745-
rcvmem += 128;
746-
747-
do_div(rcvwin, tp->advmss);
748-
rcvbuf = min_t(u64, rcvwin * rcvmem,
753+
rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
749754
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
750755
if (rcvbuf > sk->sk_rcvbuf) {
751756
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

0 commit comments

Comments
 (0)