
Commit e10b02e

Merge branch 'net-reduce-tcp_memory_allocated-inflation'
Eric Dumazet says:

====================
net: reduce tcp_memory_allocated inflation

Hosts with a lot of sockets tend to hit so-called TCP memory pressure, leading to very bad TCP performance and/or OOM.

The problem is that some TCP sockets can hold up to 2MB of 'forward allocations' in their per-socket cache (sk->sk_forward_alloc), and there is no mechanism to make them relinquish their share under memory pressure. Their share is reclaimed only on certain, potentially rare, events, one socket at a time.

In this series, I implemented a per-cpu cache instead of a per-socket one. Each CPU has a +1/-1 MB (256 pages on x86) forward alloc cache, in order to not dirty the tcp_memory_allocated shared cache line too often. We keep sk->sk_forward_alloc values as small as possible, to meet the memcg page-granularity constraint.

Note that memcg already has a per-cpu cache, although MEMCG_CHARGE_BATCH is defined to 32 pages, which seems a bit small.

Note that while this cover letter mentions TCP, this work is generic and supports TCP, UDP, DECNET, SCTP.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents: 5c281b4 + 0f2c269

23 files changed: 114 additions and 126 deletions
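Before the per-file diffs, here is a minimal user-space sketch of the batching scheme that the new sk_memory_allocated_add()/sk_memory_allocated_sub() helpers implement: each CPU (modeled here by a thread-local counter) accumulates forward-alloc deltas privately and folds them into the shared atomic only when the local value crosses the +/-1 MB reserve. Every name in the sketch (PCPU_RESERVE, memory_allocated_add(), ...) is an illustrative stand-in, not kernel API; the kernel version must additionally disable preemption around the per-CPU update.

/* Sketch only: a per-thread batch counter in front of a shared atomic. */
#include <stdatomic.h>
#include <stdio.h>

#define PAGE_SHIFT	12
/* 1 MB per thread, in page units: 1 << (20 - 12) = 256 pages */
#define PCPU_RESERVE	(1 << (20 - PAGE_SHIFT))

static atomic_long memory_allocated;		/* shared, rarely dirtied */
static _Thread_local long per_cpu_fw_alloc;	/* private batch counter */

static long memory_allocated_add(long pages)
{
	per_cpu_fw_alloc += pages;
	if (per_cpu_fw_alloc >= PCPU_RESERVE) {
		/* Fold the whole local batch into the shared counter. */
		atomic_fetch_add(&memory_allocated, per_cpu_fw_alloc);
		per_cpu_fw_alloc = 0;
	}
	return atomic_load(&memory_allocated);
}

static void memory_allocated_sub(long pages)
{
	per_cpu_fw_alloc -= pages;
	if (per_cpu_fw_alloc <= -PCPU_RESERVE) {
		atomic_fetch_add(&memory_allocated, per_cpu_fw_alloc);
		per_cpu_fw_alloc = 0;
	}
}

int main(void)
{
	int i;

	/* 1000 one-page charges touch the shared counter only 3 times. */
	for (i = 0; i < 1000; i++)
		memory_allocated_add(1);
	memory_allocated_sub(100);
	printf("shared=%ld local=%ld\n",
	       atomic_load(&memory_allocated), per_cpu_fw_alloc);
	return 0;
}

The payoff is that a burst of small charges dirties the shared cache line on the order of once per megabyte rather than once per call.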

include/net/sock.h

Lines changed: 44 additions & 56 deletions
@@ -1254,6 +1254,7 @@ struct proto {
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
+	int  __percpu		*per_cpu_fw_alloc;
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 
 	/*
@@ -1396,22 +1397,48 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 	return !!*sk->sk_prot->memory_pressure;
 }
 
+static inline long
+proto_memory_allocated(const struct proto *prot)
+{
+	return max(0L, atomic_long_read(prot->memory_allocated));
+}
+
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-	return atomic_long_read(sk->sk_prot->memory_allocated);
+	return proto_memory_allocated(sk->sk_prot);
 }
 
+/* 1 MB per cpu, in page units */
+#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+
 static inline long
 sk_memory_allocated_add(struct sock *sk, int amt)
 {
-	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+	int local_reserve;
+
+	preempt_disable();
+	local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+	if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
+		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+	}
+	preempt_enable();
+	return sk_memory_allocated(sk);
 }
 
 static inline void
 sk_memory_allocated_sub(struct sock *sk, int amt)
 {
-	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+	int local_reserve;
+
+	preempt_disable();
+	local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+	if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
+		__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+		atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+	}
+	preempt_enable();
 }
 
 #define SK_ALLOC_PERCPU_COUNTER_BATCH 16
@@ -1440,12 +1467,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
 	return percpu_counter_sum_positive(prot->sockets_allocated);
 }
 
-static inline long
-proto_memory_allocated(struct proto *prot)
-{
-	return atomic_long_read(prot->memory_allocated);
-}
-
 static inline bool
 proto_memory_pressure(struct proto *prot)
 {
@@ -1532,30 +1553,18 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
 void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-/* We used to have PAGE_SIZE here, but systems with 64KB pages
- * do not necessarily have 16x time more memory than 4KB ones.
- */
-#define SK_MEM_QUANTUM 4096
-#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND	0
 #define SK_MEM_RECV	1
 
-/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+/* sysctl_mem values are in pages */
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long val = sk->sk_prot->sysctl_mem[index];
-
-#if PAGE_SIZE > SK_MEM_QUANTUM
-	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-#elif PAGE_SIZE < SK_MEM_QUANTUM
-	val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
-#endif
-	return val;
+	return sk->sk_prot->sysctl_mem[index];
 }
 
 static inline int sk_mem_pages(int amt)
 {
-	return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
+	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
 static inline bool sk_has_account(struct sock *sk)
@@ -1566,19 +1575,23 @@ static inline bool sk_has_account(struct sock *sk)
 
 static inline bool sk_wmem_schedule(struct sock *sk, int size)
 {
+	int delta;
+
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_SEND);
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
 }
 
 static inline bool
 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
 {
+	int delta;
+
 	if (!sk_has_account(sk))
 		return true;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+	delta = size - sk->sk_forward_alloc;
+	return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
 		skb_pfmemalloc(skb);
 }
 
@@ -1604,7 +1617,7 @@ static inline void sk_mem_reclaim(struct sock *sk)
 
 	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
 
-	if (reclaimable >= SK_MEM_QUANTUM)
+	if (reclaimable >= (int)PAGE_SIZE)
 		__sk_mem_reclaim(sk, reclaimable);
 }
 
@@ -1614,49 +1627,24 @@ static inline void sk_mem_reclaim_final(struct sock *sk)
 	sk_mem_reclaim(sk);
 }
 
-static inline void sk_mem_reclaim_partial(struct sock *sk)
-{
-	int reclaimable;
-
-	if (!sk_has_account(sk))
-		return;
-
-	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
-	if (reclaimable > SK_MEM_QUANTUM)
-		__sk_mem_reclaim(sk, reclaimable - 1);
-}
-
 static inline void sk_mem_charge(struct sock *sk, int size)
 {
 	if (!sk_has_account(sk))
 		return;
 	sk->sk_forward_alloc -= size;
 }
 
-/* the following macros control memory reclaiming in sk_mem_uncharge()
+/* the following macros control memory reclaiming in mptcp_rmem_uncharge()
  */
 #define SK_RECLAIM_THRESHOLD	(1 << 21)
 #define SK_RECLAIM_CHUNK	(1 << 20)
 
 static inline void sk_mem_uncharge(struct sock *sk, int size)
 {
-	int reclaimable;
-
 	if (!sk_has_account(sk))
 		return;
 	sk->sk_forward_alloc += size;
-	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
-	/* Avoid a possible overflow.
-	 * TCP send queues can make this happen, if sk_mem_reclaim()
-	 * is not called and more than 2 GBytes are released at once.
-	 *
-	 * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
-	 * no need to hold that much forward allocation anyway.
-	 */
-	if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
-		__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
+	sk_mem_reclaim(sk);
 }
 
 /*
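One detail of the hunk above worth a worked example: sk_wmem_schedule() and sk_rmem_schedule() now pass only the shortfall (size minus sk->sk_forward_alloc) to __sk_mem_schedule(), where the old code passed the full size, so the per-socket reserve stays within a page of actual need. A user-space sketch with hypothetical stand-ins for the kernel helpers (mem_pages(), mem_schedule(), ...):

/* Sketch only: charge the shortfall, rounded up to whole pages. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1 << PAGE_SHIFT)

static int forward_alloc;	/* stand-in for sk->sk_forward_alloc */

/* Mirrors sk_mem_pages(): round a byte count up to whole pages. */
static int mem_pages(int amt)
{
	return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* Stand-in for __sk_mem_schedule(): grant 'size' bytes in page units. */
static bool mem_schedule(int size)
{
	forward_alloc += mem_pages(size) << PAGE_SHIFT;
	return true;	/* assume the protocol limit is not hit */
}

static bool wmem_schedule(int size)
{
	int delta = size - forward_alloc;

	return delta <= 0 || mem_schedule(delta);
}

int main(void)
{
	forward_alloc = PAGE_SIZE;	/* one page already granted */
	wmem_schedule(64 * 1024);	/* need 64KB for a send */
	/* Shortfall 60KB -> 15 pages; the old full-size path charged 16. */
	printf("forward_alloc=%d bytes (%d pages)\n",
	       forward_alloc, forward_alloc >> PAGE_SHIFT);	/* 65536 (16) */
	return 0;
}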

include/net/tcp.h

Lines changed: 2 additions & 0 deletions
@@ -253,6 +253,8 @@ extern long sysctl_tcp_mem[3];
 #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
+DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+
 extern struct percpu_counter tcp_sockets_allocated;
 extern unsigned long tcp_memory_pressure;

include/net/udp.h

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 extern struct proto udp_prot;
 
 extern atomic_long_t udp_memory_allocated;
+DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];

net/core/datagram.c

Lines changed: 0 additions & 3 deletions
@@ -320,7 +320,6 @@ EXPORT_SYMBOL(skb_recv_datagram);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 {
 	consume_skb(skb);
-	sk_mem_reclaim_partial(sk);
 }
 EXPORT_SYMBOL(skb_free_datagram);
 
@@ -336,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
 	slow = lock_sock_fast(sk);
 	sk_peek_offset_bwd(sk, len);
 	skb_orphan(skb);
-	sk_mem_reclaim_partial(sk);
 	unlock_sock_fast(sk, slow);
 
 	/* skb is now orphaned, can be freed outside of locked section */
@@ -396,7 +394,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 			  NULL);
 
 	kfree_skb(skb);
-	sk_mem_reclaim_partial(sk);
 	return err;
 }
 EXPORT_SYMBOL(skb_kill_datagram);

net/core/sock.c

Lines changed: 12 additions & 10 deletions
@@ -991,7 +991,7 @@ EXPORT_SYMBOL(sock_set_mark);
 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 {
 	/* Round down bytes to multiple of pages */
-	bytes &= ~(SK_MEM_QUANTUM - 1);
+	bytes = round_down(bytes, PAGE_SIZE);
 
 	WARN_ON(bytes > sk->sk_reserved_mem);
 	sk->sk_reserved_mem -= bytes;
@@ -1028,9 +1028,9 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
 		return -ENOMEM;
 	}
-	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc += pages << PAGE_SHIFT;
 
-	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_reserved_mem += pages << PAGE_SHIFT;
 
 	return 0;
 }
@@ -2987,7 +2987,6 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 
 	return 0;
 }
-EXPORT_SYMBOL(__sk_mem_raise_allocated);
 
 /**
  * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -3003,10 +3002,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
 	int ret, amt = sk_mem_pages(size);
 
-	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+	sk->sk_forward_alloc += amt << PAGE_SHIFT;
 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
 	if (!ret)
-		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+		sk->sk_forward_alloc -= amt << PAGE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -3029,17 +3028,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
 		sk_leave_memory_pressure(sk);
 }
-EXPORT_SYMBOL(__sk_mem_reduce_allocated);
 
 /**
  * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
  * @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
  */
 void __sk_mem_reclaim(struct sock *sk, int amount)
 {
-	amount >>= SK_MEM_QUANTUM_SHIFT;
-	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+	amount >>= PAGE_SHIFT;
+	sk->sk_forward_alloc -= amount << PAGE_SHIFT;
 	__sk_mem_reduce_allocated(sk, amount);
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -3798,6 +3796,10 @@ int proto_register(struct proto *prot, int alloc_slab)
 		pr_err("%s: missing sysctl_mem\n", prot->name);
 		return -EINVAL;
 	}
+	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+		return -EINVAL;
+	}
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create_usercopy(prot->name,
 							prot->obj_size, 0,

net/decnet/af_decnet.c

Lines changed: 4 additions & 0 deletions
@@ -149,6 +149,7 @@ static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
 static atomic_long_t decnet_memory_allocated;
+static DEFINE_PER_CPU(int, decnet_memory_per_cpu_fw_alloc);
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname,
 			   sockptr_t optval, unsigned int optlen, int flags);
@@ -454,7 +455,10 @@ static struct proto dn_proto = {
 	.owner			= THIS_MODULE,
 	.enter_memory_pressure	= dn_enter_memory_pressure,
 	.memory_pressure	= &dn_memory_pressure,
+
 	.memory_allocated	= &decnet_memory_allocated,
+	.per_cpu_fw_alloc	= &decnet_memory_per_cpu_fw_alloc,
+
 	.sysctl_mem		= sysctl_decnet_mem,
 	.sysctl_wmem		= sysctl_decnet_wmem,
 	.sysctl_rmem		= sysctl_decnet_rmem,

net/ipv4/tcp.c

Lines changed: 4 additions & 9 deletions
@@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 
 atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
 
 #if IS_ENABLED(CONFIG_SMC)
 DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
@@ -856,9 +858,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 {
 	struct sk_buff *skb;
 
-	if (unlikely(tcp_under_memory_pressure(sk)))
-		sk_mem_reclaim_partial(sk);
-
 	skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
 	if (likely(skb)) {
 		bool mem_scheduled;
@@ -2762,8 +2761,6 @@ void __tcp_close(struct sock *sk, long timeout)
 		__kfree_skb(skb);
 	}
 
-	sk_mem_reclaim(sk);
-
 	/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
 	if (sk->sk_state == TCP_CLOSE)
 		goto adjudge_to_death;
@@ -2871,7 +2868,6 @@ void __tcp_close(struct sock *sk, long timeout)
 		}
 	}
 	if (sk->sk_state != TCP_CLOSE) {
-		sk_mem_reclaim(sk);
 		if (tcp_check_oom(sk, 0)) {
 			tcp_set_state(sk, TCP_CLOSE);
 			tcp_send_active_reset(sk, GFP_ATOMIC);
@@ -2949,7 +2945,6 @@ void tcp_write_queue_purge(struct sock *sk)
 	}
 	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
-	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
 	tcp_sk(sk)->packets_out = 0;
 	inet_csk(sk)->icsk_backoff = 0;
@@ -4661,11 +4656,11 @@ void __init tcp_init(void)
 	max_wshare = min(4UL*1024*1024, limit);
 	max_rshare = min(6UL*1024*1024, limit);
 
-	init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
 	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
-	init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
+	init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
 	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
