Skip to content

Commit 41b0530

Browse files
committed
Merge branch 'bpf-sockmap-fixes'
John Fastabend says: ==================== A set of fixes for sockmap to resolve programs referencing sockmaps and closing without deleting all entries in the map and/or not detaching BPF programs attached to the map. Both leaving entries in the map and not detaching programs may result in the map failing to be removed by BPF infrastructure due to reference counts never reaching zero. For this we pull in the ULP infrastructure to hook into the close() hook of the sock layer. This seemed natural because we have additional sockmap features (to add support for TX hooks) that will also use the ULP infrastructure. This allows us to cleanup entries in the map when socks are closed() and avoid trying to get the sk_state_change() hook to fire in all cases. The second issue resolved here occurs when users don't detach programs. The gist is a refcnt issue resolved by implementing the release callback. See patch for details. For testing I ran both sample/sockmap and selftests bpf/test_maps.c. Dave Watson ran TLS test suite on v1 version of the patches without the put_module error path change. v4 fix missing rcu_unlock() v3 wrap psock reference in RCU v2 changes rebased onto bpf-next with small update adding module_put ==================== Signed-off-by: Daniel Borkmann <[email protected]>
2 parents 7b4eb53 + 3d9e952 commit 41b0530

File tree

4 files changed

+179
-77
lines changed

4 files changed

+179
-77
lines changed

include/net/tcp.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1983,6 +1983,11 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
19831983
#define TCP_ULP_MAX 128
19841984
#define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX)
19851985

1986+
enum {
1987+
TCP_ULP_TLS,
1988+
TCP_ULP_BPF,
1989+
};
1990+
19861991
struct tcp_ulp_ops {
19871992
struct list_head list;
19881993

@@ -1991,12 +1996,15 @@ struct tcp_ulp_ops {
19911996
/* cleanup ulp */
19921997
void (*release)(struct sock *sk);
19931998

1999+
int uid;
19942000
char name[TCP_ULP_NAME_MAX];
2001+
bool user_visible;
19952002
struct module *owner;
19962003
};
19972004
int tcp_register_ulp(struct tcp_ulp_ops *type);
19982005
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
19992006
int tcp_set_ulp(struct sock *sk, const char *name);
2007+
int tcp_set_ulp_id(struct sock *sk, const int ulp);
20002008
void tcp_get_available_ulp(char *buf, size_t len);
20012009
void tcp_cleanup_ulp(struct sock *sk);
20022010

kernel/bpf/sockmap.c

Lines changed: 115 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -86,22 +86,113 @@ struct smap_psock {
8686
struct work_struct tx_work;
8787
struct work_struct gc_work;
8888

89+
struct proto *sk_proto;
90+
void (*save_close)(struct sock *sk, long timeout);
8991
void (*save_data_ready)(struct sock *sk);
9092
void (*save_write_space)(struct sock *sk);
91-
void (*save_state_change)(struct sock *sk);
9293
};
9394

9495
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
9596
{
9697
return rcu_dereference_sk_user_data(sk);
9798
}
9899

100+
static struct proto tcp_bpf_proto;
101+
static int bpf_tcp_init(struct sock *sk)
102+
{
103+
struct smap_psock *psock;
104+
105+
rcu_read_lock();
106+
psock = smap_psock_sk(sk);
107+
if (unlikely(!psock)) {
108+
rcu_read_unlock();
109+
return -EINVAL;
110+
}
111+
112+
if (unlikely(psock->sk_proto)) {
113+
rcu_read_unlock();
114+
return -EBUSY;
115+
}
116+
117+
psock->save_close = sk->sk_prot->close;
118+
psock->sk_proto = sk->sk_prot;
119+
sk->sk_prot = &tcp_bpf_proto;
120+
rcu_read_unlock();
121+
return 0;
122+
}
123+
124+
static void bpf_tcp_release(struct sock *sk)
125+
{
126+
struct smap_psock *psock;
127+
128+
rcu_read_lock();
129+
psock = smap_psock_sk(sk);
130+
131+
if (likely(psock)) {
132+
sk->sk_prot = psock->sk_proto;
133+
psock->sk_proto = NULL;
134+
}
135+
rcu_read_unlock();
136+
}
137+
138+
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
139+
140+
static void bpf_tcp_close(struct sock *sk, long timeout)
141+
{
142+
void (*close_fun)(struct sock *sk, long timeout);
143+
struct smap_psock_map_entry *e, *tmp;
144+
struct smap_psock *psock;
145+
struct sock *osk;
146+
147+
rcu_read_lock();
148+
psock = smap_psock_sk(sk);
149+
if (unlikely(!psock)) {
150+
rcu_read_unlock();
151+
return sk->sk_prot->close(sk, timeout);
152+
}
153+
154+
/* The psock may be destroyed anytime after exiting the RCU critical
155+
* section so by the time we use close_fun the psock may no longer
156+
* be valid. However, bpf_tcp_close is called with the sock lock
157+
* held so the close hook and sk are still valid.
158+
*/
159+
close_fun = psock->save_close;
160+
161+
write_lock_bh(&sk->sk_callback_lock);
162+
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
163+
osk = cmpxchg(e->entry, sk, NULL);
164+
if (osk == sk) {
165+
list_del(&e->list);
166+
smap_release_sock(psock, sk);
167+
}
168+
}
169+
write_unlock_bh(&sk->sk_callback_lock);
170+
rcu_read_unlock();
171+
close_fun(sk, timeout);
172+
}
173+
99174
enum __sk_action {
100175
__SK_DROP = 0,
101176
__SK_PASS,
102177
__SK_REDIRECT,
103178
};
104179

180+
static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
181+
.name = "bpf_tcp",
182+
.uid = TCP_ULP_BPF,
183+
.user_visible = false,
184+
.owner = NULL,
185+
.init = bpf_tcp_init,
186+
.release = bpf_tcp_release,
187+
};
188+
189+
static int bpf_tcp_ulp_register(void)
190+
{
191+
tcp_bpf_proto = tcp_prot;
192+
tcp_bpf_proto.close = bpf_tcp_close;
193+
return tcp_register_ulp(&bpf_tcp_ulp_ops);
194+
}
195+
105196
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
106197
{
107198
struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
166257
sk->sk_error_report(sk);
167258
}
168259

169-
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
170-
171-
/* Called with lock_sock(sk) held */
172-
static void smap_state_change(struct sock *sk)
173-
{
174-
struct smap_psock_map_entry *e, *tmp;
175-
struct smap_psock *psock;
176-
struct socket_wq *wq;
177-
struct sock *osk;
178-
179-
rcu_read_lock();
180-
181-
/* Allowing transitions into an established syn_recv states allows
182-
* for early binding sockets to a smap object before the connection
183-
* is established.
184-
*/
185-
switch (sk->sk_state) {
186-
case TCP_SYN_SENT:
187-
case TCP_SYN_RECV:
188-
case TCP_ESTABLISHED:
189-
break;
190-
case TCP_CLOSE_WAIT:
191-
case TCP_CLOSING:
192-
case TCP_LAST_ACK:
193-
case TCP_FIN_WAIT1:
194-
case TCP_FIN_WAIT2:
195-
case TCP_LISTEN:
196-
break;
197-
case TCP_CLOSE:
198-
/* Only release if the map entry is in fact the sock in
199-
* question. There is a case where the operator deletes
200-
* the sock from the map, but the TCP sock is closed before
201-
* the psock is detached. Use cmpxchg to verify correct
202-
* sock is removed.
203-
*/
204-
psock = smap_psock_sk(sk);
205-
if (unlikely(!psock))
206-
break;
207-
write_lock_bh(&sk->sk_callback_lock);
208-
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
209-
osk = cmpxchg(e->entry, sk, NULL);
210-
if (osk == sk) {
211-
list_del(&e->list);
212-
smap_release_sock(psock, sk);
213-
}
214-
}
215-
write_unlock_bh(&sk->sk_callback_lock);
216-
break;
217-
default:
218-
psock = smap_psock_sk(sk);
219-
if (unlikely(!psock))
220-
break;
221-
smap_report_sk_error(psock, EPIPE);
222-
break;
223-
}
224-
225-
wq = rcu_dereference(sk->sk_wq);
226-
if (skwq_has_sleeper(wq))
227-
wake_up_interruptible_all(&wq->wait);
228-
rcu_read_unlock();
229-
}
230-
231260
static void smap_read_sock_strparser(struct strparser *strp,
232261
struct sk_buff *skb)
233262
{
@@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
322351
return;
323352
sk->sk_data_ready = psock->save_data_ready;
324353
sk->sk_write_space = psock->save_write_space;
325-
sk->sk_state_change = psock->save_state_change;
326354
psock->save_data_ready = NULL;
327355
psock->save_write_space = NULL;
328-
psock->save_state_change = NULL;
329356
strp_stop(&psock->strp);
330357
psock->strp_enabled = false;
331358
}
@@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
350377
if (psock->refcnt)
351378
return;
352379

380+
tcp_cleanup_ulp(sock);
353381
smap_stop_sock(psock, sock);
354382
clear_bit(SMAP_TX_RUNNING, &psock->state);
355383
rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
427455
return;
428456
psock->save_data_ready = sk->sk_data_ready;
429457
psock->save_write_space = sk->sk_write_space;
430-
psock->save_state_change = sk->sk_state_change;
431458
sk->sk_data_ready = smap_data_ready;
432459
sk->sk_write_space = smap_write_space;
433-
sk->sk_state_change = smap_state_change;
434460
psock->strp_enabled = true;
435461
}
436462

@@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
509535
if (attr->value_size > KMALLOC_MAX_SIZE)
510536
return ERR_PTR(-E2BIG);
511537

538+
err = bpf_tcp_ulp_register();
539+
if (err && err != -EEXIST)
540+
return ERR_PTR(err);
541+
512542
stab = kzalloc(sizeof(*stab), GFP_USER);
513543
if (!stab)
514544
return ERR_PTR(-ENOMEM);
@@ -590,11 +620,6 @@ static void sock_map_free(struct bpf_map *map)
590620
}
591621
rcu_read_unlock();
592622

593-
if (stab->bpf_verdict)
594-
bpf_prog_put(stab->bpf_verdict);
595-
if (stab->bpf_parse)
596-
bpf_prog_put(stab->bpf_parse);
597-
598623
sock_map_remove_complete(stab);
599624
}
600625

@@ -754,6 +779,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
754779
goto out_progs;
755780
}
756781

782+
err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
783+
if (err)
784+
goto out_progs;
785+
757786
set_bit(SMAP_TX_RUNNING, &psock->state);
758787
}
759788

@@ -866,13 +895,27 @@ static int sock_map_update_elem(struct bpf_map *map,
866895
return err;
867896
}
868897

898+
static void sock_map_release(struct bpf_map *map, struct file *map_file)
899+
{
900+
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
901+
struct bpf_prog *orig;
902+
903+
orig = xchg(&stab->bpf_parse, NULL);
904+
if (orig)
905+
bpf_prog_put(orig);
906+
orig = xchg(&stab->bpf_verdict, NULL);
907+
if (orig)
908+
bpf_prog_put(orig);
909+
}
910+
869911
const struct bpf_map_ops sock_map_ops = {
870912
.map_alloc = sock_map_alloc,
871913
.map_free = sock_map_free,
872914
.map_lookup_elem = sock_map_lookup,
873915
.map_get_next_key = sock_map_get_next_key,
874916
.map_update_elem = sock_map_update_elem,
875917
.map_delete_elem = sock_map_delete_elem,
918+
.map_release = sock_map_release,
876919
};
877920

878921
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,

net/ipv4/tcp_ulp.c

Lines changed: 54 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,18 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
2929
return NULL;
3030
}
3131

32+
static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
33+
{
34+
struct tcp_ulp_ops *e;
35+
36+
list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
37+
if (e->uid == ulp)
38+
return e;
39+
}
40+
41+
return NULL;
42+
}
43+
3244
static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
3345
{
3446
const struct tcp_ulp_ops *ulp = NULL;
@@ -51,6 +63,18 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
5163
return ulp;
5264
}
5365

66+
static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
67+
{
68+
const struct tcp_ulp_ops *ulp;
69+
70+
rcu_read_lock();
71+
ulp = tcp_ulp_find_id(uid);
72+
if (!ulp || !try_module_get(ulp->owner))
73+
ulp = NULL;
74+
rcu_read_unlock();
75+
return ulp;
76+
}
77+
5478
/* Attach new upper layer protocol to the list
5579
* of available protocols.
5680
*/
@@ -59,13 +83,10 @@ int tcp_register_ulp(struct tcp_ulp_ops *ulp)
5983
int ret = 0;
6084

6185
spin_lock(&tcp_ulp_list_lock);
62-
if (tcp_ulp_find(ulp->name)) {
63-
pr_notice("%s already registered or non-unique name\n",
64-
ulp->name);
86+
if (tcp_ulp_find(ulp->name))
6587
ret = -EEXIST;
66-
} else {
88+
else
6789
list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
68-
}
6990
spin_unlock(&tcp_ulp_list_lock);
7091

7192
return ret;
@@ -124,6 +145,34 @@ int tcp_set_ulp(struct sock *sk, const char *name)
124145
if (!ulp_ops)
125146
return -ENOENT;
126147

148+
if (!ulp_ops->user_visible) {
149+
module_put(ulp_ops->owner);
150+
return -ENOENT;
151+
}
152+
153+
err = ulp_ops->init(sk);
154+
if (err) {
155+
module_put(ulp_ops->owner);
156+
return err;
157+
}
158+
159+
icsk->icsk_ulp_ops = ulp_ops;
160+
return 0;
161+
}
162+
163+
int tcp_set_ulp_id(struct sock *sk, int ulp)
164+
{
165+
struct inet_connection_sock *icsk = inet_csk(sk);
166+
const struct tcp_ulp_ops *ulp_ops;
167+
int err;
168+
169+
if (icsk->icsk_ulp_ops)
170+
return -EEXIST;
171+
172+
ulp_ops = __tcp_ulp_lookup(ulp);
173+
if (!ulp_ops)
174+
return -ENOENT;
175+
127176
err = ulp_ops->init(sk);
128177
if (err) {
129178
module_put(ulp_ops->owner);

0 commit comments

Comments
 (0)