@@ -68,11 +68,8 @@ struct bpf_cpu_map_entry {
 	struct bpf_cpumap_val value;
 	struct bpf_prog *prog;
 
-	atomic_t refcnt; /* Control when this struct can be free'ed */
-	struct rcu_head rcu;
-
-	struct work_struct kthread_stop_wq;
 	struct completion kthread_running;
+	struct rcu_work free_work;
 };
 
 struct bpf_cpu_map {
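The hunk above collapses three lifetime-management fields into one embedded struct rcu_work, which bundles a work_struct with an rcu_head (plus the target workqueue pointer) in <linux/workqueue.h>. As a minimal sketch of that embedding pattern, not part of the patch, using a hypothetical struct foo and foo_free():

/* Illustrative only: an object freed the same way the patched
 * bpf_cpu_map_entry is. to_rcu_work() maps the work_struct back to
 * the containing rcu_work, container_of() then recovers the object.
 */
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	int data;
	struct rcu_work free_work;	/* embedded, like free_work above */
};

static void foo_free(struct work_struct *work)
{
	struct foo *f = container_of(to_rcu_work(work), struct foo, free_work);

	kfree(f);
}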
@@ -117,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	return &cmap->map;
 }
 
-static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	atomic_inc(&rcpu->refcnt);
-}
-
 static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 {
 	/* The tear-down procedure should have made sure that queue is
@@ -142,35 +134,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 	}
 }
 
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	if (atomic_dec_and_test(&rcpu->refcnt)) {
-		if (rcpu->prog)
-			bpf_prog_put(rcpu->prog);
-		/* The queue should be empty at this point */
-		__cpu_map_ring_cleanup(rcpu->queue);
-		ptr_ring_cleanup(rcpu->queue, NULL);
-		kfree(rcpu->queue);
-		kfree(rcpu);
-	}
-}
-
-/* called from workqueue, to workaround syscall using preempt_disable */
-static void cpu_map_kthread_stop(struct work_struct *work)
-{
-	struct bpf_cpu_map_entry *rcpu;
-
-	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
-
-	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
-	 * as it waits until all in-flight call_rcu() callbacks complete.
-	 */
-	rcu_barrier();
-
-	/* kthread_stop will wake_up_process and wait for it to complete */
-	kthread_stop(rcpu->kthread);
-}
-
 static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
 				     struct list_head *listp,
 				     struct xdp_cpumap_stats *stats)
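The two helpers deleted above implemented the old lifetime scheme: a reference count shared by the map slot and the kthread, with whichever owner dropped the last reference doing the actual free, plus a work item whose only job was to run rcu_barrier() and kthread_stop() outside the map-update path. For reference, a generic sketch of that "last put frees" pattern, using a hypothetical struct bar rather than the kernel's code:

/* Illustrative only: each owner takes one reference with bar_get();
 * atomic_dec_and_test() returns true only for the caller that drops
 * the count to zero, so exactly one path performs the free.
 */
#include <linux/atomic.h>
#include <linux/slab.h>

struct bar {
	atomic_t refcnt;
	/* ... payload ... */
};

static void bar_get(struct bar *b)
{
	atomic_inc(&b->refcnt);
}

static void bar_put(struct bar *b)
{
	if (atomic_dec_and_test(&b->refcnt))
		kfree(b);
}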
@@ -395,7 +358,6 @@ static int cpu_map_kthread_run(void *data)
 	}
 	__set_current_state(TASK_RUNNING);
 
-	put_cpu_map_entry(rcpu);
 	return 0;
 }
 
@@ -472,9 +434,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	if (IS_ERR(rcpu->kthread))
 		goto free_prog;
 
-	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
-	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
-
 	/* Make sure kthread runs on a single CPU */
 	kthread_bind(rcpu->kthread, cpu);
 	wake_up_process(rcpu->kthread);
@@ -501,40 +460,40 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
 	return NULL;
 }
 
-static void __cpu_map_entry_free(struct rcu_head *rcu)
+static void __cpu_map_entry_free(struct work_struct *work)
 {
 	struct bpf_cpu_map_entry *rcpu;
 
 	/* This cpu_map_entry have been disconnected from map and one
-	 * RCU grace-period have elapsed.  Thus, XDP cannot queue any
+	 * RCU grace-period have elapsed. Thus, XDP cannot queue any
 	 * new packets and cannot change/set flush_needed that can
 	 * find this entry.
 	 */
-	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+	rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);
 
+	/* kthread_stop will wake_up_process and wait for it to complete.
+	 * cpu_map_kthread_run() makes sure the pointer ring is empty
+	 * before exiting.
+	 */
+	kthread_stop(rcpu->kthread);
+
+	if (rcpu->prog)
+		bpf_prog_put(rcpu->prog);
+	/* The queue should be empty at this point */
+	__cpu_map_ring_cleanup(rcpu->queue);
+	ptr_ring_cleanup(rcpu->queue, NULL);
+	kfree(rcpu->queue);
 	free_percpu(rcpu->bulkq);
-	/* Cannot kthread_stop() here, last put free rcpu resources */
-	put_cpu_map_entry(rcpu);
+	kfree(rcpu);
 }
 
-/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
- * ensure any driver rcu critical sections have completed, but this
- * does not guarantee a flush has happened yet. Because driver side
- * rcu_read_lock/unlock only protects the running XDP program. The
- * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
- * pending flush op doesn't fail.
- *
- * The bpf_cpu_map_entry is still used by the kthread, and there can
- * still be pending packets (in queue and percpu bulkq). A refcnt
- * makes sure to last user (kthread_stop vs. call_rcu) free memory
- * resources.
- *
- * The rcu callback __cpu_map_entry_free flush remaining packets in
- * percpu bulkq to queue. Due to caller map_delete_elem() disable
- * preemption, cannot call kthread_stop() to make sure queue is empty.
- * Instead a work_queue is started for stopping kthread,
- * cpu_map_kthread_stop, which waits for an RCU grace period before
- * stopping kthread, emptying the queue.
+/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
+ * entry is no longer in use before freeing. We use queue_rcu_work() to call
+ * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
+ * period. This means that (a) all pending enqueue and flush operations have
+ * completed (because of the RCU callback), and (b) we are in a workqueue
+ * context where we can stop the kthread and wait for it to exit before freeing
+ * everything.
  */
 static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
@@ -543,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 
 	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
 	if (old_rcpu) {
-		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
-		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
-		schedule_work(&old_rcpu->kthread_stop_wq);
+		INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
+		queue_rcu_work(system_wq, &old_rcpu->free_work);
 	}
 }
 
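Taken together with the comment block in the previous hunk, the replacement path is now a single INIT_RCU_WORK() + queue_rcu_work() pair: queue_rcu_work() waits for an RCU grace period internally (via call_rcu()) and only then queues the work item, so the handler runs in process context once all RCU readers of the old pointer are done. Continuing the hypothetical struct foo sketch from earlier, not the patch itself, the deferred-release side would look roughly like this:

/* Illustrative: release an already-unpublished object. Assumes the
 * hypothetical struct foo and foo_free() from the earlier sketch.
 */
static void foo_release(struct foo *old)
{
	if (!old)
		return;

	INIT_RCU_WORK(&old->free_work, foo_free);
	/* wait one RCU grace period, then run foo_free() on system_wq */
	queue_rcu_work(system_wq, &old->free_work);
}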
@@ -557,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)
 	if (key_cpu >= map->max_entries)
 		return -EINVAL;
 
-	/* notice caller map_delete_elem() use preempt_disable() */
+	/* notice caller map_delete_elem() uses rcu_read_lock() */
 	__cpu_map_entry_replace(cmap, key_cpu, NULL);
 	return 0;
 }
@@ -608,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)
 	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 	 * so the bpf programs (can be more than one that used this map) were
 	 * disconnected from events. Wait for outstanding critical sections in
-	 * these programs to complete. The rcu critical section only guarantees
-	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
-	 * It does __not__ ensure pending flush operations (if any) are
-	 * complete.
+	 * these programs to complete. synchronize_rcu() below not only
+	 * guarantees no further "XDP/bpf-side" reads against
+	 * bpf_cpu_map->cpu_map, but also ensure pending flush operations
+	 * (if any) are completed.
 	 */
-
 	synchronize_rcu();
 
-	/* For cpu_map the remote CPUs can still be using the entries
-	 * (struct bpf_cpu_map_entry).
+	/* The only possible user of bpf_cpu_map_entry is
+	 * cpu_map_kthread_run().
 	 */
 	for (i = 0; i < cmap->map.max_entries; i++) {
 		struct bpf_cpu_map_entry *rcpu;
@@ -626,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)
 		if (!rcpu)
 			continue;
 
-		/* bq flush and cleanup happens after RCU grace-period */
-		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+		/* Stop kthread and cleanup entry directly */
+		__cpu_map_entry_free(&rcpu->free_work.work);
 	}
 	bpf_map_area_free(cmap->cpu_map);
 	bpf_map_area_free(cmap);
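Note how cpu_map_free() above reuses the same handler synchronously: after synchronize_rcu() no new users of the entry can appear, so instead of queueing, it hands the embedded work_struct straight to __cpu_map_entry_free() via &rcpu->free_work.work. In the hypothetical sketch notation from above, that synchronous path is simply:

/* Illustrative: synchronous teardown once RCU readers are known to be
 * done (e.g. after synchronize_rcu()), mirroring the direct
 * __cpu_map_entry_free(&rcpu->free_work.work) call in cpu_map_free().
 */
static void foo_free_sync(struct foo *f)
{
	foo_free(&f->free_work.work);	/* no queueing, no grace period */
}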