Skip to content

Commit 1a2e10a

Browse files
committed
Merge branch 'Rework-ip_ra_chain-protection'
Kirill Tkhai says: ==================== Rework ip_ra_chain protection Commit 1215e51 "ipv4: fix a deadlock in ip_ra_control" made rtnl_lock() be used in raw_close(). This function is called on every RAW socket destruction, so that rtnl_mutex is taken every time. This scales very poorly. I observe cleanup_net() spending a lot of time in rtnl_lock() and raw_close() is one of the biggest rtnl users (since we have percpu net->ipv4.icmp_sk). This patchset reworks the locking: reverts the problem commit and its descendant, and introduces rtnl-independent locking. This may have a continuation, and someone may work on killing rtnl_lock() in mrtsock_destruct() in the future. v3: Change patches order: [2/5] and [3/5]. v2: Fix sparse warning [4/5], as reported by kbuild test robot. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents f2d254f + d9ff304 commit 1a2e10a

File tree

7 files changed

+38
-30
lines changed

7 files changed

+38
-30
lines changed

include/net/ip.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb)
9191
return 0;
9292
}
9393

94+
/* Special input handler for packets caught by router alert option.
95+
They are selected only by protocol field, and then processed likely
96+
local ones; but only if someone wants them! Otherwise, router
97+
not running rsvpd will kill RSVP.
98+
99+
It is user level problem, what it will make with them.
100+
I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
101+
but receiver should be enough clever f.e. to forward mtrace requests,
102+
sent to multicast group to reach destination designated router.
103+
*/
104+
94105
struct ip_ra_chain {
95106
struct ip_ra_chain __rcu *next;
96107
struct sock *sk;
@@ -101,8 +112,6 @@ struct ip_ra_chain {
101112
struct rcu_head rcu;
102113
};
103114

104-
extern struct ip_ra_chain __rcu *ip_ra_chain;
105-
106115
/* IP flags. */
107116
#define IP_CE 0x8000 /* Flag: "Congestion" */
108117
#define IP_DF 0x4000 /* Flag: "Don't Fragment" */

include/net/netns/ipv4.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ struct netns_ipv4 {
4949
#endif
5050
struct ipv4_devconf *devconf_all;
5151
struct ipv4_devconf *devconf_dflt;
52+
struct ip_ra_chain __rcu *ra_chain;
53+
struct mutex ra_mutex;
5254
#ifdef CONFIG_IP_MULTIPLE_TABLES
5355
struct fib_rules_ops *rules_ops;
5456
bool fib_has_custom_rules;

net/core/net_namespace.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
301301
net->user_ns = user_ns;
302302
idr_init(&net->netns_ids);
303303
spin_lock_init(&net->nsid_lock);
304+
mutex_init(&net->ipv4.ra_mutex);
304305

305306
list_for_each_entry(ops, &pernet_list, list) {
306307
error = ops_init(ops, net);

net/ipv4/ip_input.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,16 +159,15 @@ bool ip_call_ra_chain(struct sk_buff *skb)
159159
struct net_device *dev = skb->dev;
160160
struct net *net = dev_net(dev);
161161

162-
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
162+
for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
163163
struct sock *sk = ra->sk;
164164

165165
/* If socket is bound to an interface, only report
166166
* the packet if it came from that interface.
167167
*/
168168
if (sk && inet_sk(sk)->inet_num == protocol &&
169169
(!sk->sk_bound_dev_if ||
170-
sk->sk_bound_dev_if == dev->ifindex) &&
171-
net_eq(sock_net(sk), net)) {
170+
sk->sk_bound_dev_if == dev->ifindex)) {
172171
if (ip_is_fragment(ip_hdr(skb))) {
173172
if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
174173
return true;

net/ipv4/ip_sockglue.c

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -322,20 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
322322
return 0;
323323
}
324324

325-
326-
/* Special input handler for packets caught by router alert option.
327-
They are selected only by protocol field, and then processed likely
328-
local ones; but only if someone wants them! Otherwise, router
329-
not running rsvpd will kill RSVP.
330-
331-
It is user level problem, what it will make with them.
332-
I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
333-
but receiver should be enough clever f.e. to forward mtrace requests,
334-
sent to multicast group to reach destination designated router.
335-
*/
336-
struct ip_ra_chain __rcu *ip_ra_chain;
337-
338-
339325
static void ip_ra_destroy_rcu(struct rcu_head *head)
340326
{
341327
struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
@@ -349,23 +335,28 @@ int ip_ra_control(struct sock *sk, unsigned char on,
349335
{
350336
struct ip_ra_chain *ra, *new_ra;
351337
struct ip_ra_chain __rcu **rap;
338+
struct net *net = sock_net(sk);
352339

353340
if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
354341
return -EINVAL;
355342

356343
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
357344

358-
for (rap = &ip_ra_chain;
359-
(ra = rtnl_dereference(*rap)) != NULL;
345+
mutex_lock(&net->ipv4.ra_mutex);
346+
for (rap = &net->ipv4.ra_chain;
347+
(ra = rcu_dereference_protected(*rap,
348+
lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
360349
rap = &ra->next) {
361350
if (ra->sk == sk) {
362351
if (on) {
352+
mutex_unlock(&net->ipv4.ra_mutex);
363353
kfree(new_ra);
364354
return -EADDRINUSE;
365355
}
366356
/* dont let ip_call_ra_chain() use sk again */
367357
ra->sk = NULL;
368358
RCU_INIT_POINTER(*rap, ra->next);
359+
mutex_unlock(&net->ipv4.ra_mutex);
369360

370361
if (ra->destructor)
371362
ra->destructor(sk);
@@ -379,14 +370,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
379370
return 0;
380371
}
381372
}
382-
if (!new_ra)
373+
if (!new_ra) {
374+
mutex_unlock(&net->ipv4.ra_mutex);
383375
return -ENOBUFS;
376+
}
384377
new_ra->sk = sk;
385378
new_ra->destructor = destructor;
386379

387380
RCU_INIT_POINTER(new_ra->next, ra);
388381
rcu_assign_pointer(*rap, new_ra);
389382
sock_hold(sk);
383+
mutex_unlock(&net->ipv4.ra_mutex);
390384

391385
return 0;
392386
}
@@ -586,7 +580,6 @@ static bool setsockopt_needs_rtnl(int optname)
586580
case MCAST_LEAVE_GROUP:
587581
case MCAST_LEAVE_SOURCE_GROUP:
588582
case MCAST_UNBLOCK_SOURCE:
589-
case IP_ROUTER_ALERT:
590583
return true;
591584
}
592585
return false;
@@ -639,6 +632,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
639632

640633
/* If optlen==0, it is equivalent to val == 0 */
641634

635+
if (optname == IP_ROUTER_ALERT)
636+
return ip_ra_control(sk, val ? 1 : 0, NULL);
642637
if (ip_mroute_opt(optname))
643638
return ip_mroute_setsockopt(sk, optname, optval, optlen);
644639

@@ -1149,9 +1144,6 @@ static int do_ip_setsockopt(struct sock *sk, int level,
11491144
goto e_inval;
11501145
inet->mc_all = val;
11511146
break;
1152-
case IP_ROUTER_ALERT:
1153-
err = ip_ra_control(sk, val ? 1 : 0, NULL);
1154-
break;
11551147

11561148
case IP_FREEBIND:
11571149
if (optlen < 1)

net/ipv4/ipmr.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
13991399
struct net *net = sock_net(sk);
14001400
struct mr_table *mrt;
14011401

1402-
ASSERT_RTNL();
1402+
rtnl_lock();
14031403
ipmr_for_each_table(mrt, net) {
14041404
if (sk == rtnl_dereference(mrt->mroute_sk)) {
14051405
IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
14111411
mroute_clean_tables(mrt, false);
14121412
}
14131413
}
1414+
rtnl_unlock();
14141415
}
14151416

14161417
/* Socket options and virtual interface manipulation. The whole
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
14751476
if (sk != rcu_access_pointer(mrt->mroute_sk)) {
14761477
ret = -EACCES;
14771478
} else {
1479+
/* We need to unlock here because mrtsock_destruct takes
1480+
* care of rtnl itself and we can't change that due to
1481+
* the IP_ROUTER_ALERT setsockopt which runs without it.
1482+
*/
1483+
rtnl_unlock();
14781484
ret = ip_ra_control(sk, 0, NULL);
1479-
goto out_unlock;
1485+
goto out;
14801486
}
14811487
break;
14821488
case MRT_ADD_VIF:
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
15881594
}
15891595
out_unlock:
15901596
rtnl_unlock();
1597+
out:
15911598
return ret;
15921599
}
15931600

net/ipv4/raw.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
711711
/*
712712
* Raw sockets may have direct kernel references. Kill them.
713713
*/
714-
rtnl_lock();
715714
ip_ra_control(sk, 0, NULL);
716-
rtnl_unlock();
717715

718716
sk_common_release(sk);
719717
}

0 commit comments

Comments
 (0)