Commit 401cb7d

Sebastian Andrzej Siewior authored and kuba-moo committed
net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.
The XDP redirect process is two staged:
- bpf_prog_run_xdp() is invoked to run an eBPF program which inspects the
  packet and makes decisions. While doing that, the per-CPU variable
  bpf_redirect_info is used.
- Afterwards xdp_do_redirect() is invoked and accesses bpf_redirect_info
  and it may also access other per-CPU variables like xskmap_flush_list.

At the very end of the NAPI callback, xdp_do_flush() is invoked which does
not access bpf_redirect_info but will touch the individual per-CPU lists.

The per-CPU variables are only used in the NAPI callback hence disabling
bottom halves is the only protection mechanism. Users from preemptible
context (like cpu_map_kthread_run()) explicitly disable bottom halves for
protection reasons. Without locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.

PREEMPT_RT has forced-threaded interrupts enabled and every NAPI-callback
runs in a thread. If each thread has its own data structure then locking
can be avoided.

Create a struct bpf_net_context which contains struct bpf_redirect_info.
Define the variable on stack, use bpf_net_ctx_set() to save a pointer to
it, bpf_net_ctx_clear() removes it again.
The bpf_net_ctx_set() may nest. For instance a function can be used from
within NET_RX_SOFTIRQ/net_rx_action which uses bpf_net_ctx_set() and
NET_TX_SOFTIRQ which does not. Therefore only the first invocation updates
the pointer.
Use bpf_net_ctx_get_ri() as a wrapper to retrieve the current struct
bpf_redirect_info. The returned data structure is zero initialized to
ensure nothing is leaked from stack. This is done on first usage of the
struct. bpf_net_ctx_set() sets bpf_redirect_info::kern_flags to 0 to note
that initialisation is required. First invocation of bpf_net_ctx_get_ri()
will memset() the data structure and update bpf_redirect_info::kern_flags.
bpf_redirect_info::nh is excluded from memset because it is only used once
BPF_F_NEIGH is set, which also sets the nh member. The kern_flags member
is moved past nh to exclude it from memset.

The pointer to bpf_net_context is saved in the task's task_struct. Always
using the bpf_net_context approach has the advantage that there is almost
zero difference between PREEMPT_RT and non-PREEMPT_RT builds.

Cc: Andrii Nakryiko <[email protected]>
Cc: Eduard Zingerman <[email protected]>
Cc: Hao Luo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: John Fastabend <[email protected]>
Cc: KP Singh <[email protected]>
Cc: Martin KaFai Lau <[email protected]>
Cc: Song Liu <[email protected]>
Cc: Stanislav Fomichev <[email protected]>
Cc: Yonghong Song <[email protected]>
Acked-by: Alexei Starovoitov <[email protected]>
Acked-by: Jesper Dangaard Brouer <[email protected]>
Reviewed-by: Toke Høiland-Jørgensen <[email protected]>
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 78f520b commit 401cb7d
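
As a rough sketch of the lifecycle this commit introduces — the wrapper
names come from the patch below, while example_napi_style_callback() is a
hypothetical caller used only for illustration:

/* Hypothetical caller: the context lives on the caller's stack, is
 * published via bpf_net_ctx_set() and withdrawn via bpf_net_ctx_clear()
 * before the stack frame goes away.
 */
static void example_napi_style_callback(void)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;

        local_bh_disable();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        /* bpf_prog_run_xdp(), xdp_do_redirect() etc. reach this context
         * through current->bpf_net_context via bpf_net_ctx_get_ri().
         */

        bpf_net_ctx_clear(bpf_net_ctx); /* no-op if set() nested and returned NULL */
        local_bh_enable();
}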

File tree

9 files changed (+114, -45 lines)


include/linux/filter.h

Lines changed: 46 additions & 10 deletions
@@ -733,21 +733,59 @@ struct bpf_nh_params {
         };
 };
 
+/* flags for bpf_redirect_info kern_flags */
+#define BPF_RI_F_RF_NO_DIRECT   BIT(0)  /* no napi_direct on return_frame */
+#define BPF_RI_F_RI_INIT        BIT(1)
+
 struct bpf_redirect_info {
         u64 tgt_index;
         void *tgt_value;
         struct bpf_map *map;
         u32 flags;
-        u32 kern_flags;
         u32 map_id;
         enum bpf_map_type map_type;
         struct bpf_nh_params nh;
+        u32 kern_flags;
 };
 
-DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+struct bpf_net_context {
+        struct bpf_redirect_info ri;
+};
 
-/* flags for bpf_redirect_info kern_flags */
-#define BPF_RI_F_RF_NO_DIRECT   BIT(0)  /* no napi_direct on return_frame */
+static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
+{
+        struct task_struct *tsk = current;
+
+        if (tsk->bpf_net_context != NULL)
+                return NULL;
+        bpf_net_ctx->ri.kern_flags = 0;
+
+        tsk->bpf_net_context = bpf_net_ctx;
+        return bpf_net_ctx;
+}
+
+static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
+{
+        if (bpf_net_ctx)
+                current->bpf_net_context = NULL;
+}
+
+static inline struct bpf_net_context *bpf_net_ctx_get(void)
+{
+        return current->bpf_net_context;
+}
+
+static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
+{
+        struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+
+        if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) {
+                memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
+                bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;
+        }
+
+        return &bpf_net_ctx->ri;
+}
 
 /* Compute the linear packet data range [data, data_end) which
  * will be accessed by various program types (cls_bpf, act_bpf,
@@ -1018,25 +1056,23 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                        const struct bpf_insn *patch, u32 len);
 int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
 
-void bpf_clear_redirect_map(struct bpf_map *map);
-
 static inline bool xdp_return_frame_no_direct(void)
 {
-        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
 
         return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
 }
 
 static inline void xdp_set_return_frame_no_direct(void)
 {
-        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
 
         ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
 }
 
 static inline void xdp_clear_return_frame_no_direct(void)
 {
-        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
 
         ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
 }
@@ -1592,7 +1628,7 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde
                                                    u64 flags, const u64 flag_mask,
                                                    void *lookup_elem(struct bpf_map *map, u32 key))
 {
-        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
         const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
 
         /* Lower bits of the flags are used as return code on lookup failure */
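
The nesting rule from the commit message falls directly out of
bpf_net_ctx_set() returning NULL when a context is already installed. A
minimal sketch of that behaviour — outer_path() and inner_path() are
made-up names for illustration, not part of the patch:

/* Only the outermost set() installs the context; the inner clear()
 * receives NULL and therefore does not tear down the outer context.
 */
static void inner_path(void)
{
        struct bpf_net_context __ctx, *ctx;

        ctx = bpf_net_ctx_set(&__ctx); /* returns NULL: already set */
        /* bpf_net_ctx_get_ri() still resolves to the outer context */
        bpf_net_ctx_clear(ctx);        /* ctx == NULL, so a no-op */
}

static void outer_path(void)
{
        struct bpf_net_context __ctx, *ctx;

        ctx = bpf_net_ctx_set(&__ctx); /* installs &__ctx in current */
        inner_path();
        bpf_net_ctx_clear(ctx);        /* removes it again */
}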

include/linux/sched.h

Lines changed: 3 additions & 0 deletions
@@ -54,6 +54,7 @@ struct bio_list;
 struct blk_plug;
 struct bpf_local_storage;
 struct bpf_run_ctx;
+struct bpf_net_context;
 struct capture_control;
 struct cfs_rq;
 struct fs_struct;
@@ -1509,6 +1510,8 @@ struct task_struct {
         /* Used for BPF run context */
         struct bpf_run_ctx *bpf_ctx;
 #endif
+        /* Used by BPF for per-TASK xdp storage */
+        struct bpf_net_context *bpf_net_context;
 
 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
         unsigned long lowest_stack;

kernel/bpf/cpumap.c

Lines changed: 3 additions & 0 deletions
@@ -240,12 +240,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
                                 int xdp_n, struct xdp_cpumap_stats *stats,
                                 struct list_head *list)
 {
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         int nframes;
 
         if (!rcpu->prog)
                 return xdp_n;
 
         rcu_read_lock_bh();
+        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
         nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
 
@@ -255,6 +257,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
         if (unlikely(!list_empty(list)))
                 cpu_map_bpf_prog_run_skb(rcpu, list, stats);
 
+        bpf_net_ctx_clear(bpf_net_ctx);
         rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
 
         return nframes;

kernel/bpf/devmap.c

Lines changed: 8 additions & 1 deletion
@@ -196,7 +196,14 @@ static void dev_map_free(struct bpf_map *map)
         list_del_rcu(&dtab->list);
         spin_unlock(&dev_map_lock);
 
-        bpf_clear_redirect_map(map);
+        /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+         * during NAPI callback and cleared after the XDP redirect. There is no
+         * explicit RCU read section which protects bpf_redirect_info->map but
+         * local_bh_disable() also marks the beginning of an RCU section. This
+         * makes the complete softirq callback RCU protected. Thus after the
+         * following synchronize_rcu() there is no bpf_redirect_info->map == map
+         * assignment.
+         */
         synchronize_rcu();
 
         /* Make sure prior __dev_map_entry_free() have completed. */
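
For illustration, the RCU argument in the new comment can be pictured as
the following reader/updater interplay. This is a hedged sketch, not code
from the patch; it assumes (as the comment states) that a
local_bh_disable()/local_bh_enable() section also counts as an RCU
read-side critical section:

/* Reader side (NAPI callback, runs with bottom halves disabled): */
local_bh_disable();             /* implicitly begins an RCU read section */
        /* __bpf_xdp_redirect_map():  ri->map = map;              */
        /* xdp_do_redirect():         consumes and clears ri->map */
local_bh_enable();              /* RCU read section ends */

/* Updater side (dev_map_free()): once synchronize_rcu() returns, every
 * BH-disabled section that could still assign ri->map = map has
 * completed, so the map can be torn down safely without the old
 * bpf_clear_redirect_map() walk.
 */
synchronize_rcu();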

kernel/fork.c

Lines changed: 1 addition & 0 deletions
@@ -2355,6 +2355,7 @@ __latent_entropy struct task_struct *copy_process(
         RCU_INIT_POINTER(p->bpf_storage, NULL);
         p->bpf_ctx = NULL;
 #endif
+        p->bpf_net_context = NULL;
 
         /* Perform scheduler related setup. Assign this task to a CPU. */
         retval = sched_fork(clone_flags, p);

net/bpf/test_run.c

Lines changed: 10 additions & 1 deletion
@@ -283,9 +283,10 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
 static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
                               u32 repeat)
 {
-        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         int err = 0, act, ret, i, nframes = 0, batch_sz;
         struct xdp_frame **frames = xdp->frames;
+        struct bpf_redirect_info *ri;
         struct xdp_page_head *head;
         struct xdp_frame *frm;
         bool redirect = false;
@@ -295,6 +296,8 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
         batch_sz = min_t(u32, repeat, xdp->batch_size);
 
         local_bh_disable();
+        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+        ri = bpf_net_ctx_get_ri();
         xdp_set_return_frame_no_direct();
 
         for (i = 0; i < batch_sz; i++) {
@@ -359,6 +362,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
         }
 
         xdp_clear_return_frame_no_direct();
+        bpf_net_ctx_clear(bpf_net_ctx);
         local_bh_enable();
         return err;
 }
@@ -394,6 +398,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx,
 static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
                         u32 *retval, u32 *time, bool xdp)
 {
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         struct bpf_prog_array_item item = {.prog = prog};
         struct bpf_run_ctx *old_ctx;
         struct bpf_cg_run_ctx run_ctx;
@@ -419,10 +424,14 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
         do {
                 run_ctx.prog_item = &item;
                 local_bh_disable();
+                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
                 if (xdp)
                         *retval = bpf_prog_run_xdp(prog, ctx);
                 else
                         *retval = bpf_prog_run(prog, ctx);
+
+                bpf_net_ctx_clear(bpf_net_ctx);
                 local_bh_enable();
         } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time));
         bpf_reset_run_ctx(old_ctx);

net/core/dev.c

Lines changed: 28 additions & 1 deletion
@@ -4045,10 +4045,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 {
         struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
         enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         int sch_ret;
 
         if (!entry)
                 return skb;
+
+        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
         if (*pt_prev) {
                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
                 *pt_prev = NULL;
@@ -4077,10 +4080,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                         break;
                 }
                 *ret = NET_RX_SUCCESS;
+                bpf_net_ctx_clear(bpf_net_ctx);
                 return NULL;
         case TC_ACT_SHOT:
                 kfree_skb_reason(skb, drop_reason);
                 *ret = NET_RX_DROP;
+                bpf_net_ctx_clear(bpf_net_ctx);
                 return NULL;
         /* used by tc_run */
         case TC_ACT_STOLEN:
@@ -4090,8 +4095,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                 fallthrough;
         case TC_ACT_CONSUMED:
                 *ret = NET_RX_SUCCESS;
+                bpf_net_ctx_clear(bpf_net_ctx);
                 return NULL;
         }
+        bpf_net_ctx_clear(bpf_net_ctx);
 
         return skb;
 }
@@ -4101,11 +4108,14 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 {
         struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
         enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         int sch_ret;
 
         if (!entry)
                 return skb;
 
+        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
         /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
          * already set by the caller.
          */
@@ -4121,10 +4131,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
                 /* No need to push/pop skb's mac_header here on egress! */
                 skb_do_redirect(skb);
                 *ret = NET_XMIT_SUCCESS;
+                bpf_net_ctx_clear(bpf_net_ctx);
                 return NULL;
         case TC_ACT_SHOT:
                 kfree_skb_reason(skb, drop_reason);
                 *ret = NET_XMIT_DROP;
+                bpf_net_ctx_clear(bpf_net_ctx);
                 return NULL;
         /* used by tc_run */
         case TC_ACT_STOLEN:
@@ -4134,8 +4146,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
                 fallthrough;
         case TC_ACT_CONSUMED:
                 *ret = NET_XMIT_SUCCESS;
+                bpf_net_ctx_clear(bpf_net_ctx);
                 return NULL;
         }
+        bpf_net_ctx_clear(bpf_net_ctx);
 
         return skb;
 }
@@ -6325,6 +6339,7 @@ enum {
 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
                            unsigned flags, u16 budget)
 {
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         bool skip_schedule = false;
         unsigned long timeout;
         int rc;
@@ -6342,6 +6357,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
         clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
 
         local_bh_disable();
+        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 
         if (flags & NAPI_F_PREFER_BUSY_POLL) {
                 napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
@@ -6364,6 +6380,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
         netpoll_poll_unlock(have_poll_lock);
         if (rc == budget)
                 __busy_poll_stop(napi, skip_schedule);
+        bpf_net_ctx_clear(bpf_net_ctx);
         local_bh_enable();
 }
 
@@ -6373,6 +6390,7 @@ static void __napi_busy_loop(unsigned int napi_id,
 {
         unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
         int (*napi_poll)(struct napi_struct *napi, int budget);
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         void *have_poll_lock = NULL;
         struct napi_struct *napi;
 
@@ -6391,6 +6409,7 @@ static void __napi_busy_loop(unsigned int napi_id,
                 int work = 0;
 
                 local_bh_disable();
+                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
                 if (!napi_poll) {
                         unsigned long val = READ_ONCE(napi->state);
 
@@ -6421,6 +6440,7 @@ static void __napi_busy_loop(unsigned int napi_id,
                         __NET_ADD_STATS(dev_net(napi->dev),
                                         LINUX_MIB_BUSYPOLLRXPACKETS, work);
                 skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+                bpf_net_ctx_clear(bpf_net_ctx);
                 local_bh_enable();
 
                 if (!loop_end || loop_end(loop_end_arg, start_time))
@@ -6848,6 +6868,7 @@ static int napi_thread_wait(struct napi_struct *napi)
 
 static void napi_threaded_poll_loop(struct napi_struct *napi)
 {
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         struct softnet_data *sd;
         unsigned long last_qs = jiffies;
 
@@ -6856,6 +6877,8 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
                 void *have;
 
                 local_bh_disable();
+                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
                 sd = this_cpu_ptr(&softnet_data);
                 sd->in_napi_threaded_poll = true;
 
@@ -6871,6 +6894,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
                         net_rps_action_and_irq_enable(sd);
                 }
                 skb_defer_free_flush(sd);
+                bpf_net_ctx_clear(bpf_net_ctx);
                 local_bh_enable();
 
                 if (!repoll)
@@ -6896,10 +6920,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
         unsigned long time_limit = jiffies +
                 usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
         int budget = READ_ONCE(net_hotdata.netdev_budget);
         LIST_HEAD(list);
         LIST_HEAD(repoll);
 
+        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
 start:
         sd->in_net_rx_action = true;
         local_irq_disable();
@@ -6952,7 +6978,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
         sd->in_net_rx_action = false;
 
         net_rps_action_and_irq_enable(sd);
-end:;
+end:
+        bpf_net_ctx_clear(bpf_net_ctx);
 }
 
 struct netdev_adjacent {
