Skip to content

Commit fbfc504

Browse files
Björn Töpel authored and Alexei Starovoitov committed
bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP
The xskmap is yet another BPF map, very much inspired by dev/cpu/sockmap, and is a holder of AF_XDP sockets. A user application adds AF_XDP sockets to the map, and by using the bpf_redirect_map helper, an XDP program can redirect XDP frames to an AF_XDP socket. Note that a socket that is bound to a certain ifindex/queue index will *only* accept XDP frames from that netdev/queue index. If an XDP program tries to redirect from a netdev/queue index other than the one the socket is bound to, the frame will not be received on the socket. A socket can reside in multiple maps. v3: Fixed a race and simplified the code. v2: Removed one indirection in the map lookup. Signed-off-by: Björn Töpel <[email protected]> Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent c497176 commit fbfc504

File tree

8 files changed

+289
-2
lines changed

8 files changed

+289
-2
lines changed

include/linux/bpf.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,31 @@ static inline int sock_map_prog(struct bpf_map *map,
676676
}
677677
#endif
678678

679+
#if defined(CONFIG_XDP_SOCKETS)
680+
struct xdp_sock;
681+
struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
682+
int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
683+
struct xdp_sock *xs);
684+
void __xsk_map_flush(struct bpf_map *map);
685+
#else
686+
struct xdp_sock;
687+
static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
688+
u32 key)
689+
{
690+
return NULL;
691+
}
692+
693+
static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
694+
struct xdp_sock *xs)
695+
{
696+
return -EOPNOTSUPP;
697+
}
698+
699+
static inline void __xsk_map_flush(struct bpf_map *map)
700+
{
701+
}
702+
#endif
703+
679704
/* verifier prototypes for helper functions called from eBPF programs */
680705
extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
681706
extern const struct bpf_func_proto bpf_map_update_elem_proto;

include/linux/bpf_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
4949
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
5050
#endif
5151
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
52+
#if defined(CONFIG_XDP_SOCKETS)
53+
BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
54+
#endif
5255
#endif

include/net/xdp_sock.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct xdp_sock {
2828
struct xsk_queue *rx;
2929
struct net_device *dev;
3030
struct xdp_umem *umem;
31+
struct list_head flush_node;
3132
u16 queue_id;
3233
/* Protects multiple processes in the control path */
3334
struct mutex mutex;
@@ -39,6 +40,7 @@ struct xdp_buff;
3940
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
4041
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
4142
void xsk_flush(struct xdp_sock *xs);
43+
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
4244
#else
4345
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
4446
{
@@ -53,6 +55,11 @@ static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
5355
static inline void xsk_flush(struct xdp_sock *xs)
5456
{
5557
}
58+
59+
static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
60+
{
61+
return false;
62+
}
5663
#endif /* CONFIG_XDP_SOCKETS */
5764

5865
#endif /* _LINUX_XDP_SOCK_H */

include/uapi/linux/bpf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ enum bpf_map_type {
116116
BPF_MAP_TYPE_DEVMAP,
117117
BPF_MAP_TYPE_SOCKMAP,
118118
BPF_MAP_TYPE_CPUMAP,
119+
BPF_MAP_TYPE_XSKMAP,
119120
};
120121

121122
enum bpf_prog_type {

kernel/bpf/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ obj-$(CONFIG_BPF_SYSCALL) += btf.o
88
ifeq ($(CONFIG_NET),y)
99
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
1010
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
11+
ifeq ($(CONFIG_XDP_SOCKETS),y)
12+
obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
13+
endif
1114
obj-$(CONFIG_BPF_SYSCALL) += offload.o
1215
ifeq ($(CONFIG_STREAM_PARSER),y)
1316
ifeq ($(CONFIG_INET),y)

kernel/bpf/verifier.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2070,8 +2070,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
20702070
if (func_id != BPF_FUNC_redirect_map)
20712071
goto error;
20722072
break;
2073-
/* Restrict bpf side of cpumap, open when use-cases appear */
2073+
/* Restrict bpf side of cpumap and xskmap, open when use-cases
2074+
* appear.
2075+
*/
20742076
case BPF_MAP_TYPE_CPUMAP:
2077+
case BPF_MAP_TYPE_XSKMAP:
20752078
if (func_id != BPF_FUNC_redirect_map)
20762079
goto error;
20772080
break;
@@ -2118,7 +2121,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
21182121
break;
21192122
case BPF_FUNC_redirect_map:
21202123
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
2121-
map->map_type != BPF_MAP_TYPE_CPUMAP)
2124+
map->map_type != BPF_MAP_TYPE_CPUMAP &&
2125+
map->map_type != BPF_MAP_TYPE_XSKMAP)
21222126
goto error;
21232127
break;
21242128
case BPF_FUNC_sk_redirect_map:

kernel/bpf/xskmap.c

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/* XSKMAP used for AF_XDP sockets
3+
* Copyright(c) 2018 Intel Corporation.
4+
*
5+
* This program is free software; you can redistribute it and/or modify it
6+
* under the terms and conditions of the GNU General Public License,
7+
* version 2, as published by the Free Software Foundation.
8+
*
9+
* This program is distributed in the hope it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12+
* more details.
13+
*/
14+
15+
#include <linux/bpf.h>
16+
#include <linux/capability.h>
17+
#include <net/xdp_sock.h>
18+
#include <linux/slab.h>
19+
#include <linux/sched.h>
20+
21+
struct xsk_map {
22+
struct bpf_map map;
23+
struct xdp_sock **xsk_map;
24+
struct list_head __percpu *flush_list;
25+
};
26+
27+
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
28+
{
29+
int cpu, err = -EINVAL;
30+
struct xsk_map *m;
31+
u64 cost;
32+
33+
if (!capable(CAP_NET_ADMIN))
34+
return ERR_PTR(-EPERM);
35+
36+
if (attr->max_entries == 0 || attr->key_size != 4 ||
37+
attr->value_size != 4 ||
38+
attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
39+
return ERR_PTR(-EINVAL);
40+
41+
m = kzalloc(sizeof(*m), GFP_USER);
42+
if (!m)
43+
return ERR_PTR(-ENOMEM);
44+
45+
bpf_map_init_from_attr(&m->map, attr);
46+
47+
cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
48+
cost += sizeof(struct list_head) * num_possible_cpus();
49+
if (cost >= U32_MAX - PAGE_SIZE)
50+
goto free_m;
51+
52+
m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
53+
54+
/* Notice returns -EPERM on if map size is larger than memlock limit */
55+
err = bpf_map_precharge_memlock(m->map.pages);
56+
if (err)
57+
goto free_m;
58+
59+
m->flush_list = alloc_percpu(struct list_head);
60+
if (!m->flush_list)
61+
goto free_m;
62+
63+
for_each_possible_cpu(cpu)
64+
INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
65+
66+
m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
67+
sizeof(struct xdp_sock *),
68+
m->map.numa_node);
69+
if (!m->xsk_map)
70+
goto free_percpu;
71+
return &m->map;
72+
73+
free_percpu:
74+
free_percpu(m->flush_list);
75+
free_m:
76+
kfree(m);
77+
return ERR_PTR(err);
78+
}
79+
80+
static void xsk_map_free(struct bpf_map *map)
81+
{
82+
struct xsk_map *m = container_of(map, struct xsk_map, map);
83+
int i;
84+
85+
synchronize_net();
86+
87+
for (i = 0; i < map->max_entries; i++) {
88+
struct xdp_sock *xs;
89+
90+
xs = m->xsk_map[i];
91+
if (!xs)
92+
continue;
93+
94+
sock_put((struct sock *)xs);
95+
}
96+
97+
free_percpu(m->flush_list);
98+
bpf_map_area_free(m->xsk_map);
99+
kfree(m);
100+
}
101+
102+
static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
103+
{
104+
struct xsk_map *m = container_of(map, struct xsk_map, map);
105+
u32 index = key ? *(u32 *)key : U32_MAX;
106+
u32 *next = next_key;
107+
108+
if (index >= m->map.max_entries) {
109+
*next = 0;
110+
return 0;
111+
}
112+
113+
if (index == m->map.max_entries - 1)
114+
return -ENOENT;
115+
*next = index + 1;
116+
return 0;
117+
}
118+
119+
struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
120+
{
121+
struct xsk_map *m = container_of(map, struct xsk_map, map);
122+
struct xdp_sock *xs;
123+
124+
if (key >= map->max_entries)
125+
return NULL;
126+
127+
xs = READ_ONCE(m->xsk_map[key]);
128+
return xs;
129+
}
130+
131+
int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
132+
struct xdp_sock *xs)
133+
{
134+
struct xsk_map *m = container_of(map, struct xsk_map, map);
135+
struct list_head *flush_list = this_cpu_ptr(m->flush_list);
136+
int err;
137+
138+
err = xsk_rcv(xs, xdp);
139+
if (err)
140+
return err;
141+
142+
if (!xs->flush_node.prev)
143+
list_add(&xs->flush_node, flush_list);
144+
145+
return 0;
146+
}
147+
148+
void __xsk_map_flush(struct bpf_map *map)
149+
{
150+
struct xsk_map *m = container_of(map, struct xsk_map, map);
151+
struct list_head *flush_list = this_cpu_ptr(m->flush_list);
152+
struct xdp_sock *xs, *tmp;
153+
154+
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
155+
xsk_flush(xs);
156+
__list_del(xs->flush_node.prev, xs->flush_node.next);
157+
xs->flush_node.prev = NULL;
158+
}
159+
}
160+
161+
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
162+
{
163+
return NULL;
164+
}
165+
166+
static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
167+
u64 map_flags)
168+
{
169+
struct xsk_map *m = container_of(map, struct xsk_map, map);
170+
u32 i = *(u32 *)key, fd = *(u32 *)value;
171+
struct xdp_sock *xs, *old_xs;
172+
struct socket *sock;
173+
int err;
174+
175+
if (unlikely(map_flags > BPF_EXIST))
176+
return -EINVAL;
177+
if (unlikely(i >= m->map.max_entries))
178+
return -E2BIG;
179+
if (unlikely(map_flags == BPF_NOEXIST))
180+
return -EEXIST;
181+
182+
sock = sockfd_lookup(fd, &err);
183+
if (!sock)
184+
return err;
185+
186+
if (sock->sk->sk_family != PF_XDP) {
187+
sockfd_put(sock);
188+
return -EOPNOTSUPP;
189+
}
190+
191+
xs = (struct xdp_sock *)sock->sk;
192+
193+
if (!xsk_is_setup_for_bpf_map(xs)) {
194+
sockfd_put(sock);
195+
return -EOPNOTSUPP;
196+
}
197+
198+
sock_hold(sock->sk);
199+
200+
old_xs = xchg(&m->xsk_map[i], xs);
201+
if (old_xs) {
202+
/* Make sure we've flushed everything. */
203+
synchronize_net();
204+
sock_put((struct sock *)old_xs);
205+
}
206+
207+
sockfd_put(sock);
208+
return 0;
209+
}
210+
211+
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
212+
{
213+
struct xsk_map *m = container_of(map, struct xsk_map, map);
214+
struct xdp_sock *old_xs;
215+
int k = *(u32 *)key;
216+
217+
if (k >= map->max_entries)
218+
return -EINVAL;
219+
220+
old_xs = xchg(&m->xsk_map[k], NULL);
221+
if (old_xs) {
222+
/* Make sure we've flushed everything. */
223+
synchronize_net();
224+
sock_put((struct sock *)old_xs);
225+
}
226+
227+
return 0;
228+
}
229+
230+
const struct bpf_map_ops xsk_map_ops = {
231+
.map_alloc = xsk_map_alloc,
232+
.map_free = xsk_map_free,
233+
.map_get_next_key = xsk_map_get_next_key,
234+
.map_lookup_elem = xsk_map_lookup_elem,
235+
.map_update_elem = xsk_map_update_elem,
236+
.map_delete_elem = xsk_map_delete_elem,
237+
};
238+
239+

net/xdp/xsk.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ static struct xdp_sock *xdp_sk(struct sock *sk)
4141
return (struct xdp_sock *)sk;
4242
}
4343

44+
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
45+
{
46+
return !!xs->rx;
47+
}
48+
4449
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
4550
{
4651
u32 *id, len = xdp->data_end - xdp->data;

0 commit comments

Comments
 (0)