Skip to content

Commit 19757ce

Browse files
edumazetdavem330
authored andcommitted
tcp: switch orphan_count to bare per-cpu counters
Use of percpu_counter structure to track count of orphaned sockets is causing problems on modern hosts with 256 cpus or more. Stefan Bach reported a serious spinlock contention in real workloads, that I was able to reproduce with a netfilter rule dropping incoming FIN packets. 53.56% server [kernel.kallsyms] [k] queued_spin_lock_slowpath | ---queued_spin_lock_slowpath | --53.51%--_raw_spin_lock_irqsave | --53.51%--__percpu_counter_sum tcp_check_oom | |--39.03%--__tcp_close | tcp_close | inet_release | inet6_release | sock_close | __fput | ____fput | task_work_run | exit_to_usermode_loop | do_syscall_64 | entry_SYSCALL_64_after_hwframe | __GI___libc_close | --14.48%--tcp_out_of_resources tcp_write_timeout tcp_retransmit_timer tcp_write_timer_handler tcp_write_timer call_timer_fn expire_timers __run_timers run_timer_softirq __softirqentry_text_start As explained in commit cf86a08 ("net/dst: use a smaller percpu_counter batch for dst entries accounting"), default batch size is too big for the default value of tcp_max_orphans (262144). But even if we reduce batch sizes, there would still be cases where the estimated count of orphans is beyond the limit, and where tcp_too_many_orphans() has to call the expensive percpu_counter_sum_positive(). One solution is to use plain per-cpu counters, and have a timer to periodically refresh this cache. Updating this cache every 100ms seems about right, tcp pressure state is not radically changing over shorter periods. percpu_counter was nice 15 years ago while hosts had less than 16 cpus, not anymore by current standards. v2: Fix the build issue for CONFIG_CRYPTO_DEV_CHELSIO_TLS=m, reported by kernel test robot <[email protected]> Remove unused socket argument from tcp_too_many_orphans() Fixes: dd24c00 ("net: Use a percpu_counter for orphan_count") Signed-off-by: Eric Dumazet <[email protected]> Reported-by: Stefan Bach <[email protected]> Cc: Neal Cardwell <[email protected]> Acked-by: Neal Cardwell <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 0b93aed commit 19757ce

File tree

11 files changed

+49
-38
lines changed

11 files changed

+49
-38
lines changed

drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent)
870870
* created only after 3 way handshake is done.
871871
*/
872872
sock_orphan(child);
873-
percpu_counter_inc((child)->sk_prot->orphan_count);
873+
INC_ORPHAN_COUNT(child);
874874
chtls_release_resources(child);
875875
chtls_conn_done(child);
876876
} else {

drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ struct deferred_skb_cb {
9595
#define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok)
9696
#define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok)
9797
#define SACK_OK(tp) ((tp)->rx_opt.sack_ok)
98-
#define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
98+
#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count)
9999

100100
/* TLS SKB */
101101
#define skb_ulp_tls_inline(skb) (ULP_SKB_CB(skb)->ulp.tls.ofld)

include/net/inet_connection_sock.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
289289
{
290290
/* The below has to be done to allow calling inet_csk_destroy_sock */
291291
sock_set_flag(sk, SOCK_DEAD);
292-
percpu_counter_inc(sk->sk_prot->orphan_count);
292+
this_cpu_inc(*sk->sk_prot->orphan_count);
293293
}
294294

295295
void inet_csk_destroy_sock(struct sock *sk);

include/net/sock.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,7 @@ struct proto {
12351235
unsigned int useroffset; /* Usercopy region offset */
12361236
unsigned int usersize; /* Usercopy region size */
12371237

1238-
struct percpu_counter *orphan_count;
1238+
unsigned int __percpu *orphan_count;
12391239

12401240
struct request_sock_ops *rsk_prot;
12411241
struct timewait_sock_ops *twsk_prot;

include/net/tcp.h

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@
4848

4949
extern struct inet_hashinfo tcp_hashinfo;
5050

51-
extern struct percpu_counter tcp_orphan_count;
51+
DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
52+
int tcp_orphan_count_sum(void);
53+
5254
void tcp_time_wait(struct sock *sk, int state, int timeo);
5355

5456
#define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER)
@@ -290,19 +292,6 @@ static inline bool tcp_out_of_memory(struct sock *sk)
290292

291293
void sk_forced_mem_schedule(struct sock *sk, int size);
292294

293-
static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
294-
{
295-
struct percpu_counter *ocp = sk->sk_prot->orphan_count;
296-
int orphans = percpu_counter_read_positive(ocp);
297-
298-
if (orphans << shift > sysctl_tcp_max_orphans) {
299-
orphans = percpu_counter_sum_positive(ocp);
300-
if (orphans << shift > sysctl_tcp_max_orphans)
301-
return true;
302-
}
303-
return false;
304-
}
305-
306295
bool tcp_check_oom(struct sock *sk, int shift);
307296

308297

net/dccp/dccp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ extern bool dccp_debug;
4848

4949
extern struct inet_hashinfo dccp_hashinfo;
5050

51-
extern struct percpu_counter dccp_orphan_count;
51+
DECLARE_PER_CPU(unsigned int, dccp_orphan_count);
5252

5353
void dccp_time_wait(struct sock *sk, int state, int timeo);
5454

net/dccp/proto.c

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
4242

4343
EXPORT_SYMBOL_GPL(dccp_statistics);
4444

45-
struct percpu_counter dccp_orphan_count;
46-
EXPORT_SYMBOL_GPL(dccp_orphan_count);
45+
DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
46+
EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);
4747

4848
struct inet_hashinfo dccp_hashinfo;
4949
EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -1055,7 +1055,7 @@ void dccp_close(struct sock *sk, long timeout)
10551055
bh_lock_sock(sk);
10561056
WARN_ON(sock_owned_by_user(sk));
10571057

1058-
percpu_counter_inc(sk->sk_prot->orphan_count);
1058+
this_cpu_inc(dccp_orphan_count);
10591059

10601060
/* Have we already been destroyed by a softirq or backlog? */
10611061
if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
@@ -1115,13 +1115,10 @@ static int __init dccp_init(void)
11151115

11161116
BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
11171117
sizeof_field(struct sk_buff, cb));
1118-
rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
1119-
if (rc)
1120-
goto out_fail;
11211118
inet_hashinfo_init(&dccp_hashinfo);
11221119
rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
11231120
if (rc)
1124-
goto out_free_percpu;
1121+
goto out_fail;
11251122
rc = -ENOBUFS;
11261123
dccp_hashinfo.bind_bucket_cachep =
11271124
kmem_cache_create("dccp_bind_bucket",
@@ -1226,8 +1223,6 @@ static int __init dccp_init(void)
12261223
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
12271224
out_free_hashinfo2:
12281225
inet_hashinfo2_free_mod(&dccp_hashinfo);
1229-
out_free_percpu:
1230-
percpu_counter_destroy(&dccp_orphan_count);
12311226
out_fail:
12321227
dccp_hashinfo.bhash = NULL;
12331228
dccp_hashinfo.ehash = NULL;
@@ -1250,7 +1245,6 @@ static void __exit dccp_fini(void)
12501245
dccp_ackvec_exit();
12511246
dccp_sysctl_exit();
12521247
inet_hashinfo2_free_mod(&dccp_hashinfo);
1253-
percpu_counter_destroy(&dccp_orphan_count);
12541248
}
12551249

12561250
module_init(dccp_init);

net/ipv4/inet_connection_sock.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,7 +1015,7 @@ void inet_csk_destroy_sock(struct sock *sk)
10151015

10161016
sk_refcnt_debug_release(sk);
10171017

1018-
percpu_counter_dec(sk->sk_prot->orphan_count);
1018+
this_cpu_dec(*sk->sk_prot->orphan_count);
10191019

10201020
sock_put(sk);
10211021
}
@@ -1074,7 +1074,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
10741074

10751075
sock_orphan(child);
10761076

1077-
percpu_counter_inc(sk->sk_prot->orphan_count);
1077+
this_cpu_inc(*sk->sk_prot->orphan_count);
10781078

10791079
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
10801080
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);

net/ipv4/inet_hashtables.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
598598
if (ok) {
599599
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
600600
} else {
601-
percpu_counter_inc(sk->sk_prot->orphan_count);
601+
this_cpu_inc(*sk->sk_prot->orphan_count);
602602
inet_sk_set_state(sk, TCP_CLOSE);
603603
sock_set_flag(sk, SOCK_DEAD);
604604
inet_csk_destroy_sock(sk);

net/ipv4/proc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
5353
struct net *net = seq->private;
5454
int orphans, sockets;
5555

56-
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
56+
orphans = tcp_orphan_count_sum();
5757
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
5858

5959
socket_seq_show(seq);

net/ipv4/tcp.c

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,8 @@ enum {
287287
TCP_CMSG_TS = 2
288288
};
289289

290-
struct percpu_counter tcp_orphan_count;
291-
EXPORT_SYMBOL_GPL(tcp_orphan_count);
290+
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
291+
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
292292

293293
long sysctl_tcp_mem[3] __read_mostly;
294294
EXPORT_SYMBOL(sysctl_tcp_mem);
@@ -2673,11 +2673,36 @@ void tcp_shutdown(struct sock *sk, int how)
26732673
}
26742674
EXPORT_SYMBOL(tcp_shutdown);
26752675

2676+
int tcp_orphan_count_sum(void)
2677+
{
2678+
int i, total = 0;
2679+
2680+
for_each_possible_cpu(i)
2681+
total += per_cpu(tcp_orphan_count, i);
2682+
2683+
return max(total, 0);
2684+
}
2685+
2686+
static int tcp_orphan_cache;
2687+
static struct timer_list tcp_orphan_timer;
2688+
#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
2689+
2690+
static void tcp_orphan_update(struct timer_list *unused)
2691+
{
2692+
WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
2693+
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
2694+
}
2695+
2696+
static bool tcp_too_many_orphans(int shift)
2697+
{
2698+
return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
2699+
}
2700+
26762701
bool tcp_check_oom(struct sock *sk, int shift)
26772702
{
26782703
bool too_many_orphans, out_of_socket_memory;
26792704

2680-
too_many_orphans = tcp_too_many_orphans(sk, shift);
2705+
too_many_orphans = tcp_too_many_orphans(shift);
26812706
out_of_socket_memory = tcp_out_of_memory(sk);
26822707

26832708
if (too_many_orphans)
@@ -2786,7 +2811,7 @@ void __tcp_close(struct sock *sk, long timeout)
27862811
/* remove backlog if any, without releasing ownership. */
27872812
__release_sock(sk);
27882813

2789-
percpu_counter_inc(sk->sk_prot->orphan_count);
2814+
this_cpu_inc(tcp_orphan_count);
27902815

27912816
/* Have we already been destroyed by a softirq or backlog? */
27922817
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -4479,7 +4504,10 @@ void __init tcp_init(void)
44794504
sizeof_field(struct sk_buff, cb));
44804505

44814506
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
4482-
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
4507+
4508+
timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
4509+
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
4510+
44834511
inet_hashinfo_init(&tcp_hashinfo);
44844512
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
44854513
thash_entries, 21, /* one slot per 2 MB*/

0 commit comments

Comments
 (0)