
Commit 570d632

Merge branch 'tcp-preempt'
Eric Dumazet says:

====================
net: make TCP preemptible

Most of the TCP stack assumed it was running from a BH handler. This is great for most things, as TCP behavior is very sensitive to scheduling artifacts.

However, the prequeue and backlog processing are problematic, as they need to be flushed with BH blocked.

To cope with modern needs, TCP sockets have big sk_rcvbuf values, on the order of 16 MB, and soon 32 MB. This means the backlog can hold thousands of packets, and things like TCP coalescing or collapsing on that amount of packets can lead to insane latency spikes, since BHs are blocked for too long.

It is time to make the UDP/TCP stacks preemptible.

Note that the fast path still runs from the BH handler.

v2: Added "tcp: make tcp_sendmsg() aware of socket backlog" to reduce latency problems of large sends.

v3: Fixed a typo in tcp_cdg.c
====================

Signed-off-by: David S. Miller <[email protected]>
2 parents 5e59c83 + d41a69f commit 570d632
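For context, the backlog discussed here is filled by the softirq receive path whenever a process owns the socket lock: instead of processing the packet, softirq parks it on sk->sk_backlog, and the lock owner drains it later. A simplified sketch of that producer side (loosely modeled on the tcp_v4_rcv() pattern; not code from this commit):

	bh_lock_sock_nested(sk);
	if (!sock_owned_by_user(sk)) {
		/* nobody owns the socket: process in softirq (fast path) */
		tcp_v4_do_rcv(sk, skb);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf)) {
		/* backlog limit exceeded: drop the packet */
	}
	bh_unlock_sock(sk);

With sk_rcvbuf in the tens of megabytes, that backlog can accumulate thousands of skbs, and before this series they were all drained in one go with BHs disabled. The patches below change the consumer side so the drain can be preempted and can even be triggered from tcp_sendmsg().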

20 files changed: 150 additions, 157 deletions

include/net/sock.h
Lines changed: 11 additions & 0 deletions

@@ -926,6 +926,17 @@ void sk_stream_kill_queues(struct sock *sk);
 void sk_set_memalloc(struct sock *sk);
 void sk_clear_memalloc(struct sock *sk);
 
+void __sk_flush_backlog(struct sock *sk);
+
+static inline bool sk_flush_backlog(struct sock *sk)
+{
+	if (unlikely(READ_ONCE(sk->sk_backlog.tail))) {
+		__sk_flush_backlog(sk);
+		return true;
+	}
+	return false;
+}
+
 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
 
 struct request_sock_ops;
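The fast check above means a caller pays only a READ_ONCE() when the backlog is empty; the spinlock is taken only if something is actually queued. A minimal sketch of the intended calling pattern from process context (the caller is purely illustrative; the real user is the tcp_sendmsg() hunk further down):

	/* Illustrative caller, not part of this commit. */
	static void example_send_chunk(struct sock *sk)
	{
		lock_sock(sk);		/* socket owned: softirq now queues to sk_backlog */

		if (sk_flush_backlog(sk)) {
			/* Backlog was drained under the spinlock; socket state
			 * (memory accounting, mss, write queue) may have changed,
			 * so revalidate before building the next skb.
			 */
		}

		/* ... allocate an skb, copy user data, queue it for transmit ... */

		release_sock(sk);	/* drains whatever accumulated meanwhile */
	}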

net/core/sock.c
Lines changed: 15 additions & 14 deletions

@@ -2019,33 +2019,27 @@ static void __release_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
 {
-	struct sk_buff *skb = sk->sk_backlog.head;
+	struct sk_buff *skb, *next;
 
-	do {
+	while ((skb = sk->sk_backlog.head) != NULL) {
 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
-		bh_unlock_sock(sk);
 
-		do {
-			struct sk_buff *next = skb->next;
+		spin_unlock_bh(&sk->sk_lock.slock);
 
+		do {
+			next = skb->next;
 			prefetch(next);
 			WARN_ON_ONCE(skb_dst_is_noref(skb));
 			skb->next = NULL;
 			sk_backlog_rcv(sk, skb);
 
-			/*
-			 * We are in process context here with softirqs
-			 * disabled, use cond_resched_softirq() to preempt.
-			 * This is safe to do because we've taken the backlog
-			 * queue private:
-			 */
-			cond_resched_softirq();
+			cond_resched();
 
 			skb = next;
 		} while (skb != NULL);
 
-		bh_lock_sock(sk);
-	} while ((skb = sk->sk_backlog.head) != NULL);
+		spin_lock_bh(&sk->sk_lock.slock);
+	}
 
 	/*
 	 * Doing the zeroing here guarantee we can not loop forever
@@ -2054,6 +2048,13 @@ static void __release_sock(struct sock *sk)
 	sk->sk_backlog.len = 0;
 }
 
+void __sk_flush_backlog(struct sock *sk)
+{
+	spin_lock_bh(&sk->sk_lock.slock);
+	__release_sock(sk);
+	spin_unlock_bh(&sk->sk_lock.slock);
+}
+
 /**
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk: sock to wait on
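Two properties of the old loop are preserved: the backlog list is detached (head and tail cleared) while the lock is held, so the drain owns the skbs privately, and the lock is reacquired before checking for more work. What changes is that the drain itself now runs with BH and preemption enabled, because spin_unlock_bh() re-enables softirqs instead of merely dropping the socket spinlock with softirqs still off, which is why cond_resched_softirq() collapses to a plain cond_resched(). Condensed to its control flow (a sketch of the result, not a verbatim copy):

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* take the whole list private while holding the lock */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		spin_unlock_bh(&sk->sk_lock.slock);	/* BH re-enabled here */

		do {
			next = skb->next;
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);	/* runs preemptibly */
			cond_resched();			/* safe: the list is private */
			skb = next;
		} while (skb);

		spin_lock_bh(&sk->sk_lock.slock);
	}

The new __sk_flush_backlog() simply wraps this loop in the spinlock so it can be called from process context without going through release_sock().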

net/dccp/input.c
Lines changed: 1 addition & 1 deletion

@@ -359,7 +359,7 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
 		goto discard;
 	}
 
-	__DCCP_INC_STATS(DCCP_MIB_INERRS);
+	DCCP_INC_STATS(DCCP_MIB_INERRS);
 discard:
 	__kfree_skb(skb);
 	return 0;
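This hunk and the remaining DCCP/TCP statistics hunks are the same mechanical change: paths that can now execute in process context (BH and preemption enabled) switch from the leading-underscore SNMP macros to the plain ones. The distinction mirrors the per-cpu primitives underneath; roughly (a hypothetical sketch of the convention, not the literal net/snmp.h definitions):

	/* Illustration of the naming convention only: */
	#define EXAMPLE_INC_STATS(mib, field)	this_cpu_inc((mib)->mibs[field])	/* safe from any context */
	#define __EXAMPLE_INC_STATS(mib, field)	__this_cpu_inc((mib)->mibs[field])	/* caller must have preemption/BH disabled */

So the __DCCP_INC_STATS()/__NET_INC_STATS()/__TCP_INC_STATS() forms stay only where the code is still guaranteed to run with BH disabled, and everything reachable from the now-preemptible paths moves to the plain variants.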

net/dccp/ipv4.c
Lines changed: 2 additions & 2 deletions

@@ -533,8 +533,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
 	bh_unlock_sock(ctl_sk);
 
 	if (net_xmit_eval(err) == 0) {
-		__DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-		__DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
+		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+		DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
 	}
 out:
 	dst_release(dst);

net/dccp/ipv6.c
Lines changed: 2 additions & 2 deletions

@@ -277,8 +277,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
 	if (!IS_ERR(dst)) {
 		skb_dst_set(skb, dst);
 		ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
-		__DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-		__DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
+		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+		DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
 		return;
 	}
 

net/dccp/options.c
Lines changed: 1 addition & 1 deletion

@@ -253,7 +253,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	return 0;
 
 out_invalid_option:
-	__DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
+	DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
 	rc = DCCP_RESET_CODE_OPTION_ERROR;
 out_featneg_failed:
 	DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);

net/ipv4/tcp.c
Lines changed: 7 additions & 7 deletions

@@ -1136,11 +1136,12 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	/* This should be in poll */
 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
-	mss_now = tcp_send_mss(sk, &size_goal, flags);
-
 	/* Ok commence sending. */
 	copied = 0;
 
+restart:
+	mss_now = tcp_send_mss(sk, &size_goal, flags);
+
 	err = -EPIPE;
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		goto out_err;
@@ -1166,6 +1167,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
+			if (sk_flush_backlog(sk))
+				goto restart;
+
 			skb = sk_stream_alloc_skb(sk,
 						  select_size(sk, sg),
 						  sk->sk_allocation,
@@ -1449,12 +1453,8 @@ static void tcp_prequeue_process(struct sock *sk)
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
 
-	/* RX process wants to run with disabled BHs, though it is not
-	 * necessary */
-	local_bh_disable();
 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
 		sk_backlog_rcv(sk, skb);
-	local_bh_enable();
 
 	/* Clear memory counter. */
 	tp->ucopy.memory = 0;
@@ -3095,7 +3095,7 @@ void tcp_done(struct sock *sk)
 	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
 
 	if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
-		__TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
 	tcp_set_state(sk, TCP_CLOSE);
 	tcp_clear_xmit_timers(sk);
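This is the sendmsg-side half of the series (the v2 addition): rather than letting the backlog grow for the duration of a large send, tcp_sendmsg() drains it between skb allocations. Because draining the backlog processes incoming ACKs and can change the connection's send MSS and TSO size_goal, the tcp_send_mss() computation moves under a restart: label and the copy loop jumps back there whenever sk_flush_backlog() reports that work was done. In outline (heavily abridged; not the full function):

	restart:
		mss_now = tcp_send_mss(sk, &size_goal, flags);	/* may differ after a flush */

		while (msg_data_left(msg)) {
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			if (sk_flush_backlog(sk))	/* drained the softirq backlog, */
				goto restart;		/* so revalidate mss/size_goal  */

			/* ... sk_stream_alloc_skb(), copy from msg, tcp_push() ... */
		}

The prequeue hunk is the same theme: tcp_prequeue_process() already runs in process context, so it no longer needs to wrap the drain in local_bh_disable()/local_bh_enable().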

net/ipv4/tcp_cdg.c
Lines changed: 10 additions & 10 deletions

@@ -155,11 +155,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 
 		ca->last_ack = now_us;
 		if (after(now_us, ca->round_start + base_owd)) {
-			__NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTTRAINDETECT);
-			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTTRAINCWND,
-					tp->snd_cwnd);
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTTRAINDETECT);
+			NET_ADD_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTTRAINCWND,
+				      tp->snd_cwnd);
 			tp->snd_ssthresh = tp->snd_cwnd;
 			return;
 		}
@@ -174,11 +174,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
 			       125U);
 
 		if (ca->rtt.min > thresh) {
-			__NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTDELAYDETECT);
-			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTDELAYCWND,
-					tp->snd_cwnd);
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTDELAYDETECT);
+			NET_ADD_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTDELAYCWND,
+				      tp->snd_cwnd);
 			tp->snd_ssthresh = tp->snd_cwnd;
 		}
 	}

net/ipv4/tcp_cubic.c
Lines changed: 10 additions & 10 deletions

@@ -402,11 +402,11 @@ static void hystart_update(struct sock *sk, u32 delay)
 		ca->last_ack = now;
 		if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
 			ca->found |= HYSTART_ACK_TRAIN;
-			__NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTTRAINDETECT);
-			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTTRAINCWND,
-					tp->snd_cwnd);
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTTRAINDETECT);
+			NET_ADD_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTTRAINCWND,
+				      tp->snd_cwnd);
 			tp->snd_ssthresh = tp->snd_cwnd;
 		}
 	}
@@ -423,11 +423,11 @@ static void hystart_update(struct sock *sk, u32 delay)
 		if (ca->curr_rtt > ca->delay_min +
 		    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
 			ca->found |= HYSTART_DELAY;
-			__NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTDELAYDETECT);
-			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_TCPHYSTARTDELAYCWND,
-					tp->snd_cwnd);
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTDELAYDETECT);
+			NET_ADD_STATS(sock_net(sk),
+				      LINUX_MIB_TCPHYSTARTDELAYCWND,
+				      tp->snd_cwnd);
 			tp->snd_ssthresh = tp->snd_cwnd;
 		}
 	}

net/ipv4/tcp_fastopen.c
Lines changed: 6 additions & 6 deletions

@@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
 		spin_lock(&fastopenq->lock);
 		req1 = fastopenq->rskq_rst_head;
 		if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
-			spin_unlock(&fastopenq->lock);
 			__NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+			spin_unlock(&fastopenq->lock);
 			return false;
 		}
 		fastopenq->rskq_rst_head = req1->dl_next;
@@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	struct sock *child;
 
 	if (foc->len == 0) /* Client requests a cookie */
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
 
 	if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
 	      (syn_data || foc->len >= 0) &&
@@ -311,13 +311,13 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 		child = tcp_fastopen_create_child(sk, skb, dst, req);
 		if (child) {
 			foc->len = -1;
-			__NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPFASTOPENPASSIVE);
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPFASTOPENPASSIVE);
 			return child;
 		}
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 	} else if (foc->len > 0) /* Client presents an invalid cookie */
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
 
 	valid_foc.exp = foc->exp;
 	*foc = valid_foc;
