Commit e624d4e

liuhangbin authored and borkmann committed
xdp: Extend xdp_redirect_map with broadcast support
This patch adds two flags, BPF_F_BROADCAST and BPF_F_EXCLUDE_INGRESS, to extend xdp_redirect_map for broadcast support. With BPF_F_BROADCAST the packet is broadcast to all the interfaces in the map; with BPF_F_EXCLUDE_INGRESS the ingress interface is excluded from the broadcast.

When getting the devices in the dev hash map via dev_map_hash_get_next_key(), there is a possibility that we fall back to the first key when a device was removed. This would duplicate packets on some interfaces, so just walk all the buckets to avoid the issue. For the dev array map, we also walk the whole map to find valid interfaces.

Function bpf_clear_redirect_map() was removed in commit ee75aef ("bpf, xdp: Restructure redirect actions"). Add it back, as we need to use ri->map again.

With test topology:

 +-------------------+             +-------------------+
 | Host A (i40e 10G) |  ---------- | eno1(i40e 10G)    |
 +-------------------+             |                   |
                                   |      Host B       |
 +-------------------+             |                   |
 | Host C (i40e 10G) |  ---------- | eno2(i40e 10G)    |
 +-------------------+             |                   |
                                   |          +------+ |
                                   | veth0 -- | Peer | |
                                   | veth1 -- |      | |
                                   | veth2 -- |  NS  | |
                                   |          +------+ |
                                   +-------------------+

On Host A:

 # pktgen/pktgen_sample03_burst_single_flow.sh -i eno1 -d $dst_ip -m $dst_mac -s 64

On Host B (Intel(R) Xeon(R) CPU E5-2690 v3 @ 2.60GHz, 128G memory), use xdp_redirect_map and xdp_redirect_map_multi in samples/bpf for testing. All the veth peers in the NS have an XDP_DROP program loaded. The forward_map max_entries in xdp_redirect_map_multi is modified to 4.

Testing the performance impact on the regular xdp_redirect path with and without the patch (to check the impact of the additional check for broadcast mode):

 5.12 rc4         | redirect_map        i40e->i40e      | 2.0M |  9.7M
 5.12 rc4         | redirect_map        i40e->veth      | 1.7M | 11.8M
 5.12 rc4 + patch | redirect_map        i40e->i40e      | 2.0M |  9.6M
 5.12 rc4 + patch | redirect_map        i40e->veth      | 1.7M | 11.7M

Testing the performance when cloning packets with the redirect_map_multi test, using a redirect map size of 4, filled with 1-3 devices:

 5.12 rc4 + patch | redirect_map multi  i40e->veth (x1) | 1.7M | 11.4M
 5.12 rc4 + patch | redirect_map multi  i40e->veth (x2) | 1.1M |  4.3M
 5.12 rc4 + patch | redirect_map multi  i40e->veth (x3) | 0.8M |  2.6M

Signed-off-by: Hangbin Liu <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Acked-by: Toke Høiland-Jørgensen <[email protected]>
Acked-by: Martin KaFai Lau <[email protected]>
Acked-by: John Fastabend <[email protected]>
Acked-by: Jesper Dangaard Brouer <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
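For orientation, here is a minimal sketch of an XDP program that requests a broadcast redirect with the new flags. The devmap definition mirrors the forward_map used by the samples/bpf test mentioned above, but the exact layout and names are illustrative rather than dictated by this patch:

/* Illustrative sketch only: a BTF-defined devmap plus an XDP program
 * that redirects every frame to all entries in the map except the
 * ingress device. Map name and max_entries follow the sample setup
 * described in the commit message.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
        __uint(max_entries, 4);
} forward_map SEC(".maps");

SEC("xdp")
int xdp_broadcast_prog(struct xdp_md *ctx)
{
        /* The key (second argument) is effectively ignored in broadcast
         * mode; the frame is cloned to every interface in forward_map,
         * minus the ingress one because of BPF_F_EXCLUDE_INGRESS.
         */
        return bpf_redirect_map(&forward_map, 0,
                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";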
1 parent cb261b5 commit e624d4e

File tree

11 files changed: 313 additions, 15 deletions

include/linux/bpf.h

Lines changed: 20 additions & 0 deletions
@@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                             struct net_device *dev_rx);
 int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
                     struct net_device *dev_rx);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+                          struct bpf_map *map, bool exclude_ingress);
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                              struct bpf_prog *xdp_prog);
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+                           struct bpf_prog *xdp_prog, struct bpf_map *map,
+                           bool exclude_ingress);
 bool dev_map_can_have_prog(struct bpf_map *map);
 
 void __cpu_map_flush(void);
@@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
         return 0;
 }
 
+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+                          struct bpf_map *map, bool exclude_ingress)
+{
+        return 0;
+}
+
 struct sk_buff;
 
 static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
         return 0;
 }
 
+static inline
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+                           struct bpf_prog *xdp_prog, struct bpf_map *map,
+                           bool exclude_ingress)
+{
+        return 0;
+}
+
 static inline void __cpu_map_flush(void)
 {
 }

include/linux/filter.h

Lines changed: 15 additions & 4 deletions
@@ -646,6 +646,7 @@ struct bpf_redirect_info {
         u32 flags;
         u32 tgt_index;
         void *tgt_value;
+        struct bpf_map *map;
         u32 map_id;
         enum bpf_map_type map_type;
         u32 kern_flags;
@@ -1464,31 +1465,41 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
+                                                  u64 flags, const u64 flag_mask,
                                                   void *lookup_elem(struct bpf_map *map, u32 key))
 {
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+        const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
 
         /* Lower bits of the flags are used as return code on lookup failure */
-        if (unlikely(flags > XDP_TX))
+        if (unlikely(flags & ~(action_mask | flag_mask)))
                 return XDP_ABORTED;
 
         ri->tgt_value = lookup_elem(map, ifindex);
-        if (unlikely(!ri->tgt_value)) {
+        if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
                 /* If the lookup fails we want to clear out the state in the
                  * redirect_info struct completely, so that if an eBPF program
                  * performs multiple lookups, the last one always takes
                  * precedence.
                  */
                 ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
                 ri->map_type = BPF_MAP_TYPE_UNSPEC;
-                return flags;
+                return flags & action_mask;
         }
 
         ri->tgt_index = ifindex;
         ri->map_id = map->id;
         ri->map_type = map->map_type;
 
+        if (flags & BPF_F_BROADCAST) {
+                WRITE_ONCE(ri->map, map);
+                ri->flags = flags;
+        } else {
+                WRITE_ONCE(ri->map, NULL);
+                ri->flags = 0;
+        }
+
         return XDP_REDIRECT;
 }
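To make the relaxed flag check above concrete, here is a small standalone sketch (plain userspace C, not kernel code; constants copied from the UAPI) of how the action bits and the per-map-type flag_mask combine:

/* Illustration of the check in __bpf_xdp_redirect_map(): the two lowest
 * bits may carry an XDP action used as the lookup-failure return code,
 * and only the flags listed in flag_mask are accepted on top of that.
 */
#include <stdio.h>
#include <stdint.h>

#define XDP_ABORTED            0
#define XDP_DROP               1
#define XDP_PASS               2
#define XDP_TX                 3
#define XDP_REDIRECT           4
#define BPF_F_BROADCAST        (1ULL << 3)
#define BPF_F_EXCLUDE_INGRESS  (1ULL << 4)

static int check_flags(uint64_t flags, uint64_t flag_mask)
{
        const uint64_t action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;

        if (flags & ~(action_mask | flag_mask))
                return XDP_ABORTED;     /* unknown bits set */
        return XDP_REDIRECT;            /* flags accepted */
}

int main(void)
{
        uint64_t dev_mask = BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS;

        /* devmap/devmap_hash: broadcast flags pass the check */
        printf("%d\n", check_flags(XDP_PASS | BPF_F_BROADCAST, dev_mask));
        /* cpumap passes flag_mask = 0, so broadcast bits are rejected */
        printf("%d\n", check_flags(BPF_F_BROADCAST, 0));
        return 0;
}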

include/net/xdp.h

Lines changed: 1 addition & 0 deletions
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                          struct net_device *dev);
 int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
 
 static inline
 void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)

include/trace/events/xdp.h

Lines changed: 5 additions & 1 deletion
@@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
                 u32 ifindex = 0, map_index = index;
 
                 if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
-                        ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+                        /* Just leave to_ifindex to 0 if do broadcast redirect,
+                         * as tgt will be NULL.
+                         */
+                        if (tgt)
+                                ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
                 } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
                         ifindex = index;
                         map_index = 0;

include/uapi/linux/bpf.h

Lines changed: 12 additions & 2 deletions
@@ -2555,8 +2555,12 @@ union bpf_attr {
  *              The lower two bits of *flags* are used as the return code if
  *              the map lookup fails. This is so that the return value can be
  *              one of the XDP program return codes up to **XDP_TX**, as chosen
- *              by the caller. Any higher bits in the *flags* argument must be
- *              unset.
+ *              by the caller. The higher bits of *flags* can be set to
+ *              BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ *              With BPF_F_BROADCAST the packet will be broadcasted to all the
+ *              interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ *              interface will be excluded when do broadcasting.
  *
  *              See also **bpf_redirect**\ (), which only supports redirecting
  *              to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
         BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
 };
 
+/* Flags for bpf_redirect_map helper */
+enum {
+        BPF_F_BROADCAST         = (1ULL << 3),
+        BPF_F_EXCLUDE_INGRESS   = (1ULL << 4),
+};
+
 #define __bpf_md_ptr(type, name)        \
 union {                                 \
         type name;                      \

kernel/bpf/cpumap.c

Lines changed: 2 additions & 1 deletion
@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-        return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+        return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+                                      __cpu_map_lookup_elem);
 }
 
 static int cpu_map_btf_id;

kernel/bpf/devmap.c

Lines changed: 181 additions & 2 deletions
@@ -198,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
         list_del_rcu(&dtab->list);
         spin_unlock(&dev_map_lock);
 
+        bpf_clear_redirect_map(map);
         synchronize_rcu();
 
         /* Make sure prior __dev_map_entry_free() have completed. */
@@ -515,6 +516,99 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
         return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
 }
 
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
+                         int exclude_ifindex)
+{
+        if (!obj || obj->dev->ifindex == exclude_ifindex ||
+            !obj->dev->netdev_ops->ndo_xdp_xmit)
+                return false;
+
+        if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+                return false;
+
+        return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+                                 struct net_device *dev_rx,
+                                 struct xdp_frame *xdpf)
+{
+        struct xdp_frame *nxdpf;
+
+        nxdpf = xdpf_clone(xdpf);
+        if (!nxdpf)
+                return -ENOMEM;
+
+        bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+        return 0;
+}
+
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+                          struct bpf_map *map, bool exclude_ingress)
+{
+        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+        int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+        struct bpf_dtab_netdev *dst, *last_dst = NULL;
+        struct hlist_head *head;
+        struct xdp_frame *xdpf;
+        unsigned int i;
+        int err;
+
+        xdpf = xdp_convert_buff_to_frame(xdp);
+        if (unlikely(!xdpf))
+                return -EOVERFLOW;
+
+        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+                for (i = 0; i < map->max_entries; i++) {
+                        dst = READ_ONCE(dtab->netdev_map[i]);
+                        if (!is_valid_dst(dst, xdp, exclude_ifindex))
+                                continue;
+
+                        /* we only need n-1 clones; last_dst enqueued below */
+                        if (!last_dst) {
+                                last_dst = dst;
+                                continue;
+                        }
+
+                        err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+                        if (err)
+                                return err;
+
+                        last_dst = dst;
+                }
+        } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+                for (i = 0; i < dtab->n_buckets; i++) {
+                        head = dev_map_index_hash(dtab, i);
+                        hlist_for_each_entry_rcu(dst, head, index_hlist,
+                                                 lockdep_is_held(&dtab->index_lock)) {
+                                if (!is_valid_dst(dst, xdp, exclude_ifindex))
+                                        continue;
+
+                                /* we only need n-1 clones; last_dst enqueued below */
+                                if (!last_dst) {
+                                        last_dst = dst;
+                                        continue;
+                                }
+
+                                err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+                                if (err)
+                                        return err;
+
+                                last_dst = dst;
+                        }
+                }
+        }
+
+        /* consume the last copy of the frame */
+        if (last_dst)
+                bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+        else
+                xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+        return 0;
+}
+
 int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                              struct bpf_prog *xdp_prog)
 {
@@ -529,6 +623,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
         return 0;
 }
 
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+                                  struct sk_buff *skb,
+                                  struct bpf_prog *xdp_prog)
+{
+        struct sk_buff *nskb;
+        int err;
+
+        nskb = skb_clone(skb, GFP_ATOMIC);
+        if (!nskb)
+                return -ENOMEM;
+
+        err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+        if (unlikely(err)) {
+                consume_skb(nskb);
+                return err;
+        }
+
+        return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+                           struct bpf_prog *xdp_prog, struct bpf_map *map,
+                           bool exclude_ingress)
+{
+        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+        int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
+        struct bpf_dtab_netdev *dst, *last_dst = NULL;
+        struct hlist_head *head;
+        struct hlist_node *next;
+        unsigned int i;
+        int err;
+
+        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+                for (i = 0; i < map->max_entries; i++) {
+                        dst = READ_ONCE(dtab->netdev_map[i]);
+                        if (!dst || dst->dev->ifindex == exclude_ifindex)
+                                continue;
+
+                        /* we only need n-1 clones; last_dst enqueued below */
+                        if (!last_dst) {
+                                last_dst = dst;
+                                continue;
+                        }
+
+                        err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+                        if (err)
+                                return err;
+
+                        last_dst = dst;
+                }
+        } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+                for (i = 0; i < dtab->n_buckets; i++) {
+                        head = dev_map_index_hash(dtab, i);
+                        hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+                                if (!dst || dst->dev->ifindex == exclude_ifindex)
+                                        continue;
+
+                                /* we only need n-1 clones; last_dst enqueued below */
+                                if (!last_dst) {
+                                        last_dst = dst;
+                                        continue;
+                                }
+
+                                err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+                                if (err)
+                                        return err;
+
+                                last_dst = dst;
+                        }
+                }
+        }
+
+        /* consume the first skb and return */
+        if (last_dst)
+                return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+        /* dtab is empty */
+        consume_skb(skb);
+        return 0;
+}
+
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
         struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -755,12 +930,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
 
 static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-        return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+        return __bpf_xdp_redirect_map(map, ifindex, flags,
+                                      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+                                      __dev_map_lookup_elem);
 }
 
 static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
 {
-        return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+        return __bpf_xdp_redirect_map(map, ifindex, flags,
+                                      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+                                      __dev_map_hash_lookup_elem);
 }
 
 static int dev_map_btf_id;
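dev_map_enqueue_multi() and dev_map_redirect_multi() above share one traversal pattern: the first valid destination is deferred, every further destination gets a clone, and the original frame or skb goes to the last destination, so a broadcast to a single target needs no copy at all. A self-contained illustration of that pattern (plain C with made-up data, not kernel code):

/* Walks an array of destination slots the way the devmap loops do:
 * slot value 0 stands in for an empty entry or the excluded ingress
 * device; all but the last valid slot get a "clone".
 */
#include <stdio.h>

#define N_SLOTS 4

int main(void)
{
        int slots[N_SLOTS] = { 3, 0, 5, 7 };    /* ifindex-like values */
        int last_dst = 0, clones = 0, i;

        for (i = 0; i < N_SLOTS; i++) {
                if (!slots[i])
                        continue;               /* invalid/excluded slot */
                if (!last_dst) {                /* defer the first hit */
                        last_dst = slots[i];
                        continue;
                }
                printf("clone    -> ifindex %d\n", last_dst);
                clones++;
                last_dst = slots[i];
        }

        if (last_dst)
                printf("original -> ifindex %d\n", last_dst);   /* no clone */
        else
                printf("drop: no valid destinations\n");

        printf("%d clones for %d destinations\n",
               clones, clones + (last_dst ? 1 : 0));
        return 0;
}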
