
Commit 3d8dc43

Hou Tao authored and Alexei Starovoitov committed
bpf: Switch to bpf mem allocator for LPM trie
Multiple syzbot warnings have been reported. These warnings are mainly
about the lock order between trie->lock and kmalloc()'s internal lock.
See report [1] as an example:

  ======================================================
  WARNING: possible circular locking dependency detected
  6.10.0-rc7-syzkaller-00003-g4376e966ecb7 #0 Not tainted
  ------------------------------------------------------
  syz.3.2069/15008 is trying to acquire lock:
  ffff88801544e6d8 (&n->list_lock){-.-.}-{2:2}, at: get_partial_node ...

  but task is already holding lock:
  ffff88802dcc89f8 (&trie->lock){-.-.}-{2:2}, at: trie_update_elem ...

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #1 (&trie->lock){-.-.}-{2:2}:
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       trie_delete_elem+0xb0/0x820
       ___bpf_prog_run+0x3e51/0xabd0
       __bpf_prog_run32+0xc1/0x100
       bpf_dispatcher_nop_func
       ......
       bpf_trace_run2+0x231/0x590
       __bpf_trace_contention_end+0xca/0x110
       trace_contention_end.constprop.0+0xea/0x170
       __pv_queued_spin_lock_slowpath+0x28e/0xcc0
       pv_queued_spin_lock_slowpath
       queued_spin_lock_slowpath
       queued_spin_lock
       do_raw_spin_lock+0x210/0x2c0
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x42/0x60
       __put_partials+0xc3/0x170
       qlink_free
       qlist_free_all+0x4e/0x140
       kasan_quarantine_reduce+0x192/0x1e0
       __kasan_slab_alloc+0x69/0x90
       kasan_slab_alloc
       slab_post_alloc_hook
       slab_alloc_node
       kmem_cache_alloc_node_noprof+0x153/0x310
       __alloc_skb+0x2b1/0x380
       ......

  -> #0 (&n->list_lock){-.-.}-{2:2}:
       check_prev_add
       check_prevs_add
       validate_chain
       __lock_acquire+0x2478/0x3b30
       lock_acquire
       lock_acquire+0x1b1/0x560
       __raw_spin_lock_irqsave
       _raw_spin_lock_irqsave+0x3a/0x60
       get_partial_node.part.0+0x20/0x350
       get_partial_node
       get_partial
       ___slab_alloc+0x65b/0x1870
       __slab_alloc.constprop.0+0x56/0xb0
       __slab_alloc_node
       slab_alloc_node
       __do_kmalloc_node
       __kmalloc_node_noprof+0x35c/0x440
       kmalloc_node_noprof
       bpf_map_kmalloc_node+0x98/0x4a0
       lpm_trie_node_alloc
       trie_update_elem+0x1ef/0xe00
       bpf_map_update_value+0x2c1/0x6c0
       map_update_elem+0x623/0x910
       __sys_bpf+0x90c/0x49a0
       ...

  other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&trie->lock);
                                lock(&n->list_lock);
                                lock(&trie->lock);
   lock(&n->list_lock);

  *** DEADLOCK ***

[1]: https://syzkaller.appspot.com/bug?extid=9045c0a3d5a7f1b119f7

A bpf program attached to trace_contention_end() triggers after
acquiring &n->list_lock. The program invokes trie_delete_elem(), which
then acquires trie->lock. However, it is possible that another process
is invoking trie_update_elem(). trie_update_elem() will acquire
trie->lock first, then invoke kmalloc_node(). kmalloc_node() may invoke
get_partial_node() and try to acquire &n->list_lock (not necessarily
the same lock object). Therefore, lockdep warns about the circular
locking dependency.

Invoking kmalloc() before acquiring trie->lock could fix the warning.
However, since BPF programs can be invoked from any context (e.g.,
through kprobe/tracepoint/fentry), there may still be lock ordering
problems for internal locks in kmalloc() or for trie->lock itself.

To eliminate these potential lock ordering problems with kmalloc()'s
internal locks, replace kmalloc()/kfree()/kfree_rcu() with equivalent
BPF memory allocator APIs, which can be invoked from any context. The
lock ordering problems with trie->lock (e.g., reentrance) will be
handled separately.

Three aspects of this change require explanation:

1. Intermediate and leaf nodes are allocated from the same allocator.
   Since the value size of an LPM trie is usually small, using a single
   allocator reduces the memory overhead of the BPF memory allocator.

2. Leaf nodes are allocated before disabling IRQs. This handles cases
   where leaf_size is large (e.g., > 4KB - 8) and an update requires an
   intermediate node allocation. If leaf nodes were allocated in the
   IRQ-disabled region, the free objects in the BPF memory allocator
   would not be refilled in time and the intermediate node allocation
   might fail.

3. Node allocation and free are wrapped in paired
   migrate_{disable|enable}() calls. The BPF memory allocator uses
   per-CPU structures internally, so these paired calls are necessary
   to guarantee correctness (see the condensed sketch after the tags
   below).

Reviewed-by: Toke Høiland-Jørgensen <[email protected]>
Signed-off-by: Hou Tao <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
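The three points above determine the new ordering in trie_update_elem(). What follows is a condensed, non-authoritative sketch of that ordering, distilled from the diff below; the function name trie_update_elem_sketch and the elided bodies are illustrative only, the real code is in the diff.

/* Condensed sketch of the new trie_update_elem() ordering; error
 * handling and the actual trie manipulation are elided.
 */
static long trie_update_elem_sketch(struct bpf_map *map, void *_key,
				    void *value, u64 flags)
{
	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
	struct lpm_trie_node *new_node, *free_node = NULL;
	unsigned long irq_flags;
	long ret = 0;

	/* 1. Allocate the (possibly large) leaf node before IRQs are
	 *    disabled so the allocator can refill its per-CPU caches.
	 *    The allocator relies on per-CPU data, hence the paired
	 *    migrate_disable()/migrate_enable().
	 */
	migrate_disable();
	new_node = bpf_mem_cache_alloc(&trie->ma);
	migrate_enable();
	if (!new_node)
		return -ENOMEM;

	spin_lock_irqsave(&trie->lock, irq_flags);
	/* 2. All trie manipulation happens here. If an intermediate node
	 *    is needed, it comes from the same bpf mem cache; no
	 *    kmalloc-internal lock is taken under trie->lock anymore.
	 */
	spin_unlock_irqrestore(&trie->lock, irq_flags);

	/* 3. Free outside the lock: bpf_mem_cache_free_rcu() replaces
	 *    kfree_rcu(), again with migration disabled.
	 */
	migrate_disable();
	if (ret)
		bpf_mem_cache_free(&trie->ma, new_node);
	bpf_mem_cache_free_rcu(&trie->ma, free_node);
	migrate_enable();

	return ret;
}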
1 parent 27abc7b commit 3d8dc43

File tree

1 file changed: +48 −23 lines changed


kernel/bpf/lpm_trie.c

Lines changed: 48 additions & 23 deletions
@@ -15,14 +15,14 @@
 #include <net/ipv6.h>
 #include <uapi/linux/btf.h>
 #include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
 
 /* Intermediate node */
 #define LPM_TREE_NODE_FLAG_IM BIT(0)
 
 struct lpm_trie_node;
 
 struct lpm_trie_node {
-	struct rcu_head rcu;
 	struct lpm_trie_node __rcu *child[2];
 	u32 prefixlen;
 	u32 flags;
@@ -32,6 +32,7 @@ struct lpm_trie_node {
 struct lpm_trie {
 	struct bpf_map map;
 	struct lpm_trie_node __rcu *root;
+	struct bpf_mem_alloc ma;
 	size_t n_entries;
 	size_t max_prefixlen;
 	size_t data_size;
@@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 	return found->data + trie->data_size;
 }
 
-static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-						 const void *value)
+static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
+						 const void *value,
+						 bool disable_migration)
 {
 	struct lpm_trie_node *node;
-	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
 
-	if (value)
-		size += trie->map.value_size;
+	if (disable_migration)
+		migrate_disable();
+	node = bpf_mem_cache_alloc(&trie->ma);
+	if (disable_migration)
+		migrate_enable();
 
-	node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
-				    trie->map.numa_node);
 	if (!node)
 		return NULL;
 
@@ -325,7 +327,7 @@ static long trie_update_elem(struct bpf_map *map,
 			     void *_key, void *value, u64 flags)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
-	struct lpm_trie_node *node, *im_node, *new_node = NULL;
+	struct lpm_trie_node *node, *im_node, *new_node;
 	struct lpm_trie_node *free_node = NULL;
 	struct lpm_trie_node __rcu **slot;
 	struct bpf_lpm_trie_key_u8 *key = _key;
@@ -340,14 +342,14 @@ static long trie_update_elem(struct bpf_map *map,
 	if (key->prefixlen > trie->max_prefixlen)
 		return -EINVAL;
 
-	spin_lock_irqsave(&trie->lock, irq_flags);
+	/* Allocate and fill a new node. Need to disable migration before
+	 * invoking bpf_mem_cache_alloc().
+	 */
+	new_node = lpm_trie_node_alloc(trie, value, true);
+	if (!new_node)
+		return -ENOMEM;
 
-	/* Allocate and fill a new node */
-	new_node = lpm_trie_node_alloc(trie, value);
-	if (!new_node) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	spin_lock_irqsave(&trie->lock, irq_flags);
 
 	new_node->prefixlen = key->prefixlen;
 	RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -423,7 +425,8 @@ static long trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	im_node = lpm_trie_node_alloc(trie, NULL);
+	/* migration is disabled within the locked scope */
+	im_node = lpm_trie_node_alloc(trie, NULL, false);
 	if (!im_node) {
 		trie->n_entries--;
 		ret = -ENOMEM;
@@ -447,10 +450,13 @@ static long trie_update_elem(struct bpf_map *map,
 	rcu_assign_pointer(*slot, im_node);
 
 out:
-	if (ret)
-		kfree(new_node);
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_node, rcu);
+
+	migrate_disable();
+	if (ret)
+		bpf_mem_cache_free(&trie->ma, new_node);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
+	migrate_enable();
 
 	return ret;
 }
@@ -548,8 +554,11 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 
 out:
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
-	kfree_rcu(free_parent, rcu);
-	kfree_rcu(free_node, rcu);
+
+	migrate_disable();
+	bpf_mem_cache_free_rcu(&trie->ma, free_parent);
+	bpf_mem_cache_free_rcu(&trie->ma, free_node);
+	migrate_enable();
 
 	return ret;
 }
@@ -571,6 +580,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
 	struct lpm_trie *trie;
+	size_t leaf_size;
+	int err;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 ||
@@ -595,7 +606,17 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 
 	spin_lock_init(&trie->lock);
 
+	/* Allocate intermediate and leaf nodes from the same allocator */
+	leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
+		    trie->map.value_size;
+	err = bpf_mem_alloc_init(&trie->ma, leaf_size, false);
+	if (err)
+		goto free_out;
 	return &trie->map;
+
+free_out:
+	bpf_map_area_free(trie);
+	return ERR_PTR(err);
 }
 
 static void trie_free(struct bpf_map *map)
@@ -627,13 +648,17 @@ static void trie_free(struct bpf_map *map)
 				continue;
 			}
 
-			kfree(node);
+			/* No bpf program may access the map, so freeing the
+			 * node without waiting for the extra RCU GP.
+			 */
+			bpf_mem_cache_raw_free(node);
 			RCU_INIT_POINTER(*slot, NULL);
 			break;
 		}
 	}
 
 out:
+	bpf_mem_alloc_destroy(&trie->ma);
 	bpf_map_area_free(trie);
 }
 
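For context, the update and delete paths reworked above are the ones reached from bpf_map_update_elem()/bpf_map_delete_elem() on a BPF_MAP_TYPE_LPM_TRIE map. The following is a minimal, hypothetical userspace sketch of exercising those paths with libbpf; the struct lpm_key_v4, the map name "lpm_demo" and the sizes are illustrative assumptions, not part of this commit.

/* Hypothetical userspace sketch: create an LPM trie map and hit the
 * trie_update_elem()/trie_delete_elem() paths reworked above.
 * Build against libbpf.
 */
#include <stdio.h>
#include <arpa/inet.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

struct lpm_key_v4 {
	__u32 prefixlen;	/* mandatory first member for LPM tries */
	__u32 addr;		/* IPv4 address, network byte order */
};

int main(void)
{
	/* LPM trie maps must be created with BPF_F_NO_PREALLOC */
	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
	struct lpm_key_v4 key = { .prefixlen = 24 };
	__u32 value = 1;
	int map_fd;

	map_fd = bpf_map_create(BPF_MAP_TYPE_LPM_TRIE, "lpm_demo",
				sizeof(key), sizeof(value), 256, &opts);
	if (map_fd < 0) {
		perror("bpf_map_create");
		return 1;
	}

	inet_pton(AF_INET, "192.168.0.0", &key.addr);

	/* ends up in trie_update_elem() in the kernel */
	if (bpf_map_update_elem(map_fd, &key, &value, BPF_ANY))
		perror("bpf_map_update_elem");

	/* ... and this in trie_delete_elem() */
	if (bpf_map_delete_elem(map_fd, &key))
		perror("bpf_map_delete_elem");

	return 0;
}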