Skip to content

Commit 5895631

Browse files
dsahern authored and davem330 committed
neighbor: Improve garbage collection
The existing garbage collection algorithm has a number of problems: 1. The gc algorithm will not evict PERMANENT entries as those entries are managed by userspace, yet the existing algorithm walks the entire hash table which means it always considers PERMANENT entries when looking for entries to evict. In some use cases (e.g., EVPN) there can be tens of thousands of PERMANENT entries leading to wasted CPU cycles when gc kicks in. As an example, with 32k permanent entries, neigh_alloc has been observed taking more than 4 msec per invocation. 2. Currently, when the number of neighbor entries hits gc_thresh2 and the last flush for the table was more than 5 seconds ago, gc kicks in and walks the entire hash table evicting *all* entries not in PERMANENT or REACHABLE state and not marked as externally learned. There is no discriminator on when the neigh entry was created or if it just moved from REACHABLE to another NUD_VALID state (e.g., NUD_STALE). It is possible for entries to be created or for established neighbor entries to be moved to STALE (e.g., an external node sends an ARP request) right before the 5 second window lapses: -----|---------x|----------|----- t-5 t t+5 If that happens those entries are evicted during gc causing unnecessary thrashing on neighbor entries and userspace caches trying to track them. Further, this contradicts the description of gc_thresh2 which says "Entries older than 5 seconds will be cleared". One workaround is to make gc_thresh2 == gc_thresh3 but that negates the whole point of having separate thresholds. 3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries when gc_thresh2 is exceeded is overkill and contributes to thrashing especially during startup. This patch addresses these problems as follows: 1. Use of a separate list_head to track entries that can be garbage collected along with a separate counter. PERMANENT entries are not added to this list. 
The gc_thresh parameters are only compared to the new counter, not the total entries in the table. The forced_gc function is updated to only walk this new gc_list looking for entries to evict. 2. Entries are added to the list head at the tail and removed from the front. 3. Entries are only evicted if they were last updated more than 5 seconds ago, adhering to the original intent of gc_thresh2. 4. Forced gc is stopped once the number of gc_entries drops below gc_thresh2. 5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped when allocating a new neighbor for a PERMANENT entry. By extension this means there are no explicit limits on the number of PERMANENT entries that can be created, but this is no different than FIB entries or FDB entries. Signed-off-by: David Ahern <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 12edfdf commit 5895631

File tree

3 files changed

+90
-36
lines changed

3 files changed

+90
-36
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ neigh/default/gc_thresh2 - INTEGER
108108
Default: 512
109109

110110
neigh/default/gc_thresh3 - INTEGER
111-
Maximum number of neighbor entries allowed. Increase this
112-
when using large numbers of interfaces and when communicating
111+
Maximum number of non-PERMANENT neighbor entries allowed. Increase
112+
this when using large numbers of interfaces and when communicating
113113
with large numbers of directly-connected peers.
114114
Default: 1024
115115

include/net/neighbour.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ struct neighbour {
154154
struct hh_cache hh;
155155
int (*output)(struct neighbour *, struct sk_buff *);
156156
const struct neigh_ops *ops;
157+
struct list_head gc_list;
157158
struct rcu_head rcu;
158159
struct net_device *dev;
159160
u8 primary_key[0];
@@ -214,6 +215,8 @@ struct neigh_table {
214215
struct timer_list proxy_timer;
215216
struct sk_buff_head proxy_queue;
216217
atomic_t entries;
218+
atomic_t gc_entries;
219+
struct list_head gc_list;
217220
rwlock_t lock;
218221
unsigned long last_rand;
219222
struct neigh_statistics __percpu *stats;

net/core/neighbour.c

Lines changed: 85 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base)
118118
}
119119
EXPORT_SYMBOL(neigh_rand_reach_time);
120120

121+
/* Flag neighbour @n as dead. If @n is currently linked on a gc list
 * (its gc_list node is non-empty), unlink it and decrement the owning
 * table's gc_entries counter so the counter keeps matching the list.
 */
static void neigh_mark_dead(struct neighbour *n)
122+
{
123+
n->dead = 1;
124+
if (!list_empty(&n->gc_list)) {
125+
list_del_init(&n->gc_list);
126+
atomic_dec(&n->tbl->gc_entries);
127+
}
128+
}
129+
130+
/* Set @n->nud_state to @new and keep gc-list membership consistent with
 * it: an entry whose new state includes NUD_PERMANENT is taken off the
 * table's gc_list, any other entry not yet on the list is appended to it.
 * tbl->gc_entries is adjusted together with each list change.
 *
 * NOTE(review): this modifies n->tbl->gc_list; which locks the callers
 * hold is not visible from here - confirm the list is protected.
 */
static void neigh_change_state(struct neighbour *n, u8 new)
131+
{
132+
bool on_gc_list = !list_empty(&n->gc_list);
133+
bool new_is_perm = new & NUD_PERMANENT;
134+
135+
n->nud_state = new;
136+
137+
/* remove from the gc list if new state is permanent;
138+
* add to the gc list if new state is not permanent
139+
*/
140+
if (new_is_perm && on_gc_list) {
141+
list_del_init(&n->gc_list);
142+
atomic_dec(&n->tbl->gc_entries);
143+
} else if (!new_is_perm && !on_gc_list) {
144+
/* add entries to the tail; cleaning removes from the front */
145+
list_add_tail(&n->gc_list, &n->tbl->gc_list);
146+
atomic_inc(&n->tbl->gc_entries);
147+
}
148+
}
121149

122150
static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
123151
struct neighbour __rcu **np, struct neigh_table *tbl)
@@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
132160
neigh = rcu_dereference_protected(n->next,
133161
lockdep_is_held(&tbl->lock));
134162
rcu_assign_pointer(*np, neigh);
135-
n->dead = 1;
163+
neigh_mark_dead(n);
136164
retval = true;
137165
}
138166
write_unlock(&n->lock);
@@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
166194

167195
static int neigh_forced_gc(struct neigh_table *tbl)
168196
{
197+
int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
198+
unsigned long tref = jiffies - 5 * HZ;
199+
u8 flags = NTF_EXT_LEARNED;
200+
struct neighbour *n, *tmp;
201+
u8 state = NUD_PERMANENT;
169202
int shrunk = 0;
170-
int i;
171-
struct neigh_hash_table *nht;
172203

173204
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
174205

175206
write_lock_bh(&tbl->lock);
176-
nht = rcu_dereference_protected(tbl->nht,
177-
lockdep_is_held(&tbl->lock));
178-
for (i = 0; i < (1 << nht->hash_shift); i++) {
179-
struct neighbour *n;
180-
struct neighbour __rcu **np;
181207

182-
np = &nht->hash_buckets[i];
183-
while ((n = rcu_dereference_protected(*np,
184-
lockdep_is_held(&tbl->lock))) != NULL) {
185-
/* Neighbour record may be discarded if:
186-
* - nobody refers to it.
187-
* - it is not permanent
188-
*/
189-
if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np,
190-
tbl)) {
191-
shrunk = 1;
192-
continue;
193-
}
194-
np = &n->next;
208+
list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
209+
if (refcount_read(&n->refcnt) == 1) {
210+
bool remove = false;
211+
212+
write_lock(&n->lock);
213+
if (!(n->nud_state & state) && !(n->flags & flags) &&
214+
time_after(tref, n->updated))
215+
remove = true;
216+
write_unlock(&n->lock);
217+
218+
if (remove && neigh_remove_one(n, tbl))
219+
shrunk++;
220+
if (shrunk >= max_clean)
221+
break;
195222
}
196223
}
197224

@@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
260287
lockdep_is_held(&tbl->lock)));
261288
write_lock(&n->lock);
262289
neigh_del_timer(n);
263-
n->dead = 1;
264-
290+
neigh_mark_dead(n);
265291
if (refcount_read(&n->refcnt) != 1) {
266292
/* The most unpleasant situation.
267293
We must destroy neighbour entry,
@@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
321347
}
322348
EXPORT_SYMBOL(neigh_ifdown);
323349

324-
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
350+
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
351+
struct net_device *dev,
352+
bool permanent)
325353
{
326354
struct neighbour *n = NULL;
327355
unsigned long now = jiffies;
328356
int entries;
329357

330-
entries = atomic_inc_return(&tbl->entries) - 1;
358+
if (permanent)
359+
goto do_alloc;
360+
361+
entries = atomic_inc_return(&tbl->gc_entries) - 1;
331362
if (entries >= tbl->gc_thresh3 ||
332363
(entries >= tbl->gc_thresh2 &&
333364
time_after(now, tbl->last_flush + 5 * HZ))) {
@@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
340371
}
341372
}
342373

374+
do_alloc:
343375
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
344376
if (!n)
345377
goto out_entries;
@@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
358390
n->tbl = tbl;
359391
refcount_set(&n->refcnt, 1);
360392
n->dead = 1;
393+
394+
if (!permanent)
395+
list_add_tail(&n->gc_list, &n->tbl->gc_list);
396+
else
397+
INIT_LIST_HEAD(&n->gc_list);
398+
399+
atomic_inc(&tbl->entries);
361400
out:
362401
return n;
363402

364403
out_entries:
365-
atomic_dec(&tbl->entries);
404+
if (!permanent)
405+
atomic_dec(&tbl->gc_entries);
366406
goto out;
367407
}
368408

@@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
505545
}
506546
EXPORT_SYMBOL(neigh_lookup_nodev);
507547

508-
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
509-
struct net_device *dev, bool want_ref)
548+
static struct neighbour *___neigh_create(struct neigh_table *tbl,
549+
const void *pkey,
550+
struct net_device *dev,
551+
bool permanent, bool want_ref)
510552
{
553+
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent);
511554
u32 hash_val;
512555
unsigned int key_len = tbl->key_len;
513556
int error;
514-
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
515557
struct neigh_hash_table *nht;
516558

517559
if (!n) {
@@ -591,6 +633,12 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
591633
neigh_release(n);
592634
goto out;
593635
}
636+
637+
/* Exported entry point that keeps the pre-existing signature: forwards
 * @tbl, @pkey, @dev and @want_ref to ___neigh_create() with the
 * "permanent" argument fixed to false.
 */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
638+
struct net_device *dev, bool want_ref)
639+
{
640+
return ___neigh_create(tbl, pkey, dev, false, want_ref);
641+
}
594642
EXPORT_SYMBOL(__neigh_create);
595643

596644
static u32 pneigh_hash(const void *pkey, unsigned int key_len)
@@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work)
854902
(state == NUD_FAILED ||
855903
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
856904
*np = n->next;
857-
n->dead = 1;
905+
neigh_mark_dead(n);
858906
write_unlock(&n->lock);
859907
neigh_cleanup_and_release(n);
860908
continue;
@@ -1167,7 +1215,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
11671215
neigh_del_timer(neigh);
11681216
if (old & NUD_CONNECTED)
11691217
neigh_suspect(neigh);
1170-
neigh->nud_state = new;
1218+
neigh_change_state(neigh, new);
11711219
err = 0;
11721220
notify = old & NUD_VALID;
11731221
if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1246,7 +1294,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
12461294
((new & NUD_REACHABLE) ?
12471295
neigh->parms->reachable_time :
12481296
0)));
1249-
neigh->nud_state = new;
1297+
neigh_change_state(neigh, new);
12501298
notify = 1;
12511299
}
12521300

@@ -1582,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
15821630
unsigned long phsize;
15831631

15841632
INIT_LIST_HEAD(&tbl->parms_list);
1633+
INIT_LIST_HEAD(&tbl->gc_list);
15851634
list_add(&tbl->parms.list, &tbl->parms_list);
15861635
write_pnet(&tbl->parms.net, &init_net);
15871636
refcount_set(&tbl->parms.refcnt, 1);
@@ -1813,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
18131862
goto out;
18141863
}
18151864

1816-
neigh = __neigh_lookup_errno(tbl, dst, dev);
1865+
neigh = ___neigh_create(tbl, dst, dev,
1866+
ndm->ndm_state & NUD_PERMANENT,
1867+
true);
18171868
if (IS_ERR(neigh)) {
18181869
err = PTR_ERR(neigh);
18191870
goto out;
@@ -2654,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
26542705
rcu_assign_pointer(*np,
26552706
rcu_dereference_protected(n->next,
26562707
lockdep_is_held(&tbl->lock)));
2657-
n->dead = 1;
2708+
neigh_mark_dead(n);
26582709
} else
26592710
np = &n->next;
26602711
write_unlock(&n->lock);

0 commit comments

Comments
 (0)