Skip to content

Commit 70ae722

Browse files
committed
Merge branch 'inet-frags-bring-rhashtables-to-IP-defrag'
Eric Dumazet says: ==================== inet: frags: bring rhashtables to IP defrag IP defrag processing is one of the remaining problematic layers in Linux. It uses static hash tables of 1024 buckets, and up to 128 items per bucket. A work queue is supposed to garbage collect items when the host is under memory pressure, and to do a hash rebuild, changing the seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if the host is under fire. Then there is the problem of sharing this hash table for all netns. It is time to switch to rhashtables, and allocate one of them per netns to speed up netns dismantle, since this is a critical metric these days. Lookup is now using RCU, and 64-bit hosts can now provision whatever amount of memory is needed to handle the expected workloads. v2: Addressed Herbert's and Kirill's feedback (Use rhashtable_free_and_destroy(), and split the big patch into small units) v3: Removed the extra add_frag_mem_limit(...) from inet_frag_create() Removed the refcount_inc_not_zero() call from inet_frags_free_cb(), as we can exploit del_timer() return value. v4: kbuild robot feedback about one missing static (squashed) Additional patches: inet: frags: do not clone skb in ip_expire() ipv6: frags: rewrite ip6_expire_frag_queue() rhashtable: reorganize struct rhashtable layout inet: frags: reorganize struct netns_frags inet: frags: get rid of ipfrag_skb_cb/FRAG_CB ipv6: frags: get rid of ip6frag_skb_cb/FRAG6_CB inet: frags: get rid of nf_ct_frag6_skb_cb/NFCT_FRAG6_CB ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 5749d6a + f2d1c72 commit 70ae722

File tree

15 files changed

+499
-827
lines changed

15 files changed

+499
-827
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -133,14 +133,11 @@ min_adv_mss - INTEGER
133133

134134
IP Fragmentation:
135135

136-
ipfrag_high_thresh - INTEGER
137-
Maximum memory used to reassemble IP fragments. When
138-
ipfrag_high_thresh bytes of memory is allocated for this purpose,
139-
the fragment handler will toss packets until ipfrag_low_thresh
140-
is reached. This also serves as a maximum limit to namespaces
141-
different from the initial one.
142-
143-
ipfrag_low_thresh - INTEGER
136+
ipfrag_high_thresh - LONG INTEGER
137+
Maximum memory used to reassemble IP fragments.
138+
139+
ipfrag_low_thresh - LONG INTEGER
140+
(Obsolete since linux-4.17)
144141
Maximum memory used to reassemble IP fragments before the kernel
145142
begins to remove incomplete fragment queues to free up resources.
146143
The kernel still accepts new fragments for defragmentation.

include/linux/rhashtable.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -152,25 +152,25 @@ struct rhashtable_params {
152152
/**
153153
* struct rhashtable - Hash table handle
154154
* @tbl: Bucket table
155-
* @nelems: Number of elements in table
156155
* @key_len: Key length for hashfn
157-
* @p: Configuration parameters
158156
* @max_elems: Maximum number of elements in table
157+
* @p: Configuration parameters
159158
* @rhlist: True if this is an rhltable
160159
* @run_work: Deferred worker to expand/shrink asynchronously
161160
* @mutex: Mutex to protect current/future table swapping
162161
* @lock: Spin lock to protect walker list
162+
* @nelems: Number of elements in table
163163
*/
164164
struct rhashtable {
165165
struct bucket_table __rcu *tbl;
166-
atomic_t nelems;
167166
unsigned int key_len;
168-
struct rhashtable_params p;
169167
unsigned int max_elems;
168+
struct rhashtable_params p;
170169
bool rhlist;
171170
struct work_struct run_work;
172171
struct mutex mutex;
173172
spinlock_t lock;
173+
atomic_t nelems;
174174
};
175175

176176
/**

include/linux/skbuff.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,7 @@ struct sk_buff {
672672
* UDP receive path is one user.
673673
*/
674674
unsigned long dev_scratch;
675+
int ip_defrag_offset;
675676
};
676677
};
677678
struct rb_node rbnode; /* used in netem & tcp stack */

include/net/inet_frag.h

Lines changed: 55 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@
22
#ifndef __NET_FRAG_H__
33
#define __NET_FRAG_H__
44

5+
#include <linux/rhashtable.h>
6+
57
struct netns_frags {
6-
/* Keep atomic mem on separate cachelines in structs that include it */
7-
atomic_t mem ____cacheline_aligned_in_smp;
88
/* sysctls */
9+
long high_thresh;
10+
long low_thresh;
911
int timeout;
10-
int high_thresh;
11-
int low_thresh;
1212
int max_dist;
13+
struct inet_frags *f;
14+
15+
struct rhashtable rhashtable ____cacheline_aligned_in_smp;
16+
17+
/* Keep atomic mem on separate cachelines in structs that include it */
18+
atomic_long_t mem ____cacheline_aligned_in_smp;
1319
};
1420

1521
/**
@@ -25,12 +31,30 @@ enum {
2531
INET_FRAG_COMPLETE = BIT(2),
2632
};
2733

34+
struct frag_v4_compare_key {
35+
__be32 saddr;
36+
__be32 daddr;
37+
u32 user;
38+
u32 vif;
39+
__be16 id;
40+
u16 protocol;
41+
};
42+
43+
struct frag_v6_compare_key {
44+
struct in6_addr saddr;
45+
struct in6_addr daddr;
46+
u32 user;
47+
__be32 id;
48+
u32 iif;
49+
};
50+
2851
/**
2952
* struct inet_frag_queue - fragment queue
3053
*
31-
* @lock: spinlock protecting the queue
54+
* @node: rhash node
55+
* @key: keys identifying this frag.
3256
* @timer: queue expiration timer
33-
* @list: hash bucket list
57+
* @lock: spinlock protecting this frag
3458
* @refcnt: reference count of the queue
3559
* @fragments: received fragments head
3660
* @fragments_tail: received fragments tail
@@ -40,12 +64,16 @@ enum {
4064
* @flags: fragment queue flags
4165
* @max_size: maximum received fragment size
4266
* @net: namespace that this frag belongs to
43-
* @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
67+
* @rcu: rcu head for freeing deferral
4468
*/
4569
struct inet_frag_queue {
46-
spinlock_t lock;
70+
struct rhash_head node;
71+
union {
72+
struct frag_v4_compare_key v4;
73+
struct frag_v6_compare_key v6;
74+
} key;
4775
struct timer_list timer;
48-
struct hlist_node list;
76+
spinlock_t lock;
4977
refcount_t refcnt;
5078
struct sk_buff *fragments;
5179
struct sk_buff *fragments_tail;
@@ -54,101 +82,57 @@ struct inet_frag_queue {
5482
int meat;
5583
__u8 flags;
5684
u16 max_size;
57-
struct netns_frags *net;
58-
struct hlist_node list_evictor;
59-
};
60-
61-
#define INETFRAGS_HASHSZ 1024
62-
63-
/* averaged:
64-
* max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
65-
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
66-
* struct frag_queue))
67-
*/
68-
#define INETFRAGS_MAXDEPTH 128
69-
70-
struct inet_frag_bucket {
71-
struct hlist_head chain;
72-
spinlock_t chain_lock;
85+
struct netns_frags *net;
86+
struct rcu_head rcu;
7387
};
7488

7589
struct inet_frags {
76-
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
77-
78-
struct work_struct frags_work;
79-
unsigned int next_bucket;
80-
unsigned long last_rebuild_jiffies;
81-
bool rebuild;
82-
83-
/* The first call to hashfn is responsible to initialize
84-
* rnd. This is best done with net_get_random_once.
85-
*
86-
* rnd_seqlock is used to let hash insertion detect
87-
* when it needs to re-lookup the hash chain to use.
88-
*/
89-
u32 rnd;
90-
seqlock_t rnd_seqlock;
9190
unsigned int qsize;
9291

93-
unsigned int (*hashfn)(const struct inet_frag_queue *);
94-
bool (*match)(const struct inet_frag_queue *q,
95-
const void *arg);
9692
void (*constructor)(struct inet_frag_queue *q,
9793
const void *arg);
9894
void (*destructor)(struct inet_frag_queue *);
9995
void (*frag_expire)(struct timer_list *t);
10096
struct kmem_cache *frags_cachep;
10197
const char *frags_cache_name;
98+
struct rhashtable_params rhash_params;
10299
};
103100

104101
int inet_frags_init(struct inet_frags *);
105102
void inet_frags_fini(struct inet_frags *);
106103

107-
static inline void inet_frags_init_net(struct netns_frags *nf)
104+
static inline int inet_frags_init_net(struct netns_frags *nf)
108105
{
109-
atomic_set(&nf->mem, 0);
106+
atomic_long_set(&nf->mem, 0);
107+
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
110108
}
111-
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
112-
113-
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
114-
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
115-
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
116-
struct inet_frags *f, void *key, unsigned int hash);
109+
void inet_frags_exit_net(struct netns_frags *nf);
117110

118-
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
119-
const char *prefix);
111+
void inet_frag_kill(struct inet_frag_queue *q);
112+
void inet_frag_destroy(struct inet_frag_queue *q);
113+
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
120114

121-
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
115+
static inline void inet_frag_put(struct inet_frag_queue *q)
122116
{
123117
if (refcount_dec_and_test(&q->refcnt))
124-
inet_frag_destroy(q, f);
125-
}
126-
127-
static inline bool inet_frag_evicting(struct inet_frag_queue *q)
128-
{
129-
return !hlist_unhashed(&q->list_evictor);
118+
inet_frag_destroy(q);
130119
}
131120

132121
/* Memory Tracking Functions. */
133122

134-
static inline int frag_mem_limit(struct netns_frags *nf)
135-
{
136-
return atomic_read(&nf->mem);
137-
}
138-
139-
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
123+
static inline long frag_mem_limit(const struct netns_frags *nf)
140124
{
141-
atomic_sub(i, &nf->mem);
125+
return atomic_long_read(&nf->mem);
142126
}
143127

144-
static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
128+
static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
145129
{
146-
atomic_add(i, &nf->mem);
130+
atomic_long_sub(val, &nf->mem);
147131
}
148132

149-
static inline int sum_frag_mem_limit(struct netns_frags *nf)
133+
static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
150134
{
151-
return atomic_read(&nf->mem);
135+
atomic_long_add(val, &nf->mem);
152136
}
153137

154138
/* RFC 3168 support :

include/net/ip.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
588588
return skb;
589589
}
590590
#endif
591-
int ip_frag_mem(struct net *net);
592591

593592
/*
594593
* Functions provided by ip_forward.c

include/net/ipv6.h

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
379379
idev->cnf.accept_ra;
380380
}
381381

382-
#if IS_ENABLED(CONFIG_IPV6)
383-
static inline int ip6_frag_mem(struct net *net)
384-
{
385-
return sum_frag_mem_limit(&net->ipv6.frags);
386-
}
387-
#endif
388-
389382
#define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
390383
#define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */
391384
#define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */
@@ -579,37 +572,21 @@ enum ip6_defrag_users {
579572
__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
580573
};
581574

582-
struct ip6_create_arg {
583-
__be32 id;
584-
u32 user;
585-
const struct in6_addr *src;
586-
const struct in6_addr *dst;
587-
int iif;
588-
u8 ecn;
589-
};
590-
591575
void ip6_frag_init(struct inet_frag_queue *q, const void *a);
592-
bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
576+
extern const struct rhashtable_params ip6_rhash_params;
593577

594578
/*
595579
* Equivalent of ipv4 struct ip
596580
*/
597581
struct frag_queue {
598582
struct inet_frag_queue q;
599583

600-
__be32 id; /* fragment id */
601-
u32 user;
602-
struct in6_addr saddr;
603-
struct in6_addr daddr;
604-
605584
int iif;
606-
unsigned int csum;
607585
__u16 nhoffset;
608586
u8 ecn;
609587
};
610588

611-
void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
612-
struct inet_frags *frags);
589+
void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
613590

614591
static inline bool ipv6_addr_any(const struct in6_addr *a)
615592
{

lib/rhashtable.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
333333
err = rhashtable_rehash_chain(ht, old_hash);
334334
if (err)
335335
return err;
336+
cond_resched();
336337
}
337338

338339
/* Publish the new table pointer. */
@@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
11121113
for (i = 0; i < tbl->size; i++) {
11131114
struct rhash_head *pos, *next;
11141115

1116+
cond_resched();
11151117
for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
11161118
next = !rht_is_a_nulls(pos) ?
11171119
rht_dereference(pos->next, ht) : NULL;

net/ieee802154/6lowpan/6lowpan_i.h

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
1717
#define LOWPAN_DISPATCH_FRAG1 0xc0
1818
#define LOWPAN_DISPATCH_FRAGN 0xe0
1919

20-
struct lowpan_create_arg {
20+
struct frag_lowpan_compare_key {
2121
u16 tag;
2222
u16 d_size;
23-
const struct ieee802154_addr *src;
24-
const struct ieee802154_addr *dst;
23+
const struct ieee802154_addr src;
24+
const struct ieee802154_addr dst;
2525
};
2626

27-
/* Equivalent of ipv4 struct ip
27+
/* Equivalent of ipv4 struct ipq
2828
*/
2929
struct lowpan_frag_queue {
3030
struct inet_frag_queue q;
31-
32-
u16 tag;
33-
u16 d_size;
34-
struct ieee802154_addr saddr;
35-
struct ieee802154_addr daddr;
3631
};
3732

38-
static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
39-
{
40-
switch (a->mode) {
41-
case IEEE802154_ADDR_LONG:
42-
return (((__force u64)a->extended_addr) >> 32) ^
43-
(((__force u64)a->extended_addr) & 0xffffffff);
44-
case IEEE802154_ADDR_SHORT:
45-
return (__force u32)(a->short_addr + (a->pan_id << 16));
46-
default:
47-
return 0;
48-
}
49-
}
50-
5133
int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
5234
void lowpan_net_frag_exit(void);
5335
int lowpan_net_frag_init(void);

0 commit comments

Comments
 (0)