Commit 759ab1e
net: store netdevs in an xarray
Iterating over the netdev hash table for netlink dumps is hard.
Dumps are done in "chunks" so we need to save the position
after each chunk, so we know where to restart from. Because
netdevs are stored in a hash table we remember which bucket
we were in and how many devices we dumped.

Since we don't hold any locks across the "chunks" - devices may
come and go while we're dumping. If that happens we may miss
a device (if device is deleted from the bucket we were in).
We indicate to user space that this may have happened by setting
NLM_F_DUMP_INTR. User space is supposed to dump again (I think)
if it sees that. Somehow I doubt most user space gets this right..

To illustrate let's look at an example:

               System state:
  start:       # [A, B, C]
  del:  B      # [A, C]

with the hash table we may dump [A, B], missing C completely even
tho it existed both before and after the "del B".

Add an xarray and use it to allocate ifindexes. This way we
can iterate ifindexes in order, without the worry that we'll
skip one. We may still generate a dump of a state which "never
existed", for example for a set of values and sequence of ops:

               System state:
  start:       # [A, B]
  add:  C      # [A, C, B]
  del:  B      # [A, C]

we may generate a dump of [A], if C got an index between A and B.
System has never been in such state. But I'm 90% sure that's perfectly
fine, important part is that we can't _miss_ devices which exist
before and after. User space which wants to mirror kernel's state
subscribes to notifications and does periodic dumps so it will know
that C exists from the notification about its creation or from the
next dump (next dump is _guaranteed_ to include C, if it doesn't get
removed).

To avoid any perf regressions keep the hash table for now.
Most net namespaces have very few devices and microbenchmarking
1M lookups on Skylake I get the following results (not counting
loopback to number of devs):

 #devs | hash |  xa  | delta
     2 | 18.3 | 20.1 | + 9.8%
    16 | 18.3 | 20.1 | + 9.5%
    64 | 18.3 | 26.3 | +43.8%
   128 | 20.4 | 26.3 | +28.6%
   256 | 20.0 | 26.4 | +32.1%
  1024 | 26.6 | 26.7 | + 0.2%
  8192 |541.3 | 33.5 | -93.8%

No surprises since the hash table has 256 entries.
The microbenchmark scans indexes in order, if the pattern is more
random xa starts to win at 512 devices already. But that's a lot
of devices, in practice.

Reviewed-by: Leon Romanovsky <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
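The ordered-iteration property described above is what makes resumable
dumps safe. As a rough sketch (not part of this commit -- the netlink
dump conversion happens separately; emit_dev(), ctx_ifindex and skb are
made-up stand-ins), resuming a walk from a saved position could look like:

  struct net_device *dev;
  unsigned long index;

  /* Restart exactly where the previous chunk left off; devices with
   * a lower ifindex that disappeared meanwhile are simply skipped,
   * ones that still exist cannot be missed.
   */
  xa_for_each_start(&net->dev_by_index, index, dev, ctx_ifindex) {
          if (emit_dev(skb, dev))         /* assume: stop when skb is full */
                  break;
          ctx_ifindex = index + 1;        /* resume point for the next chunk */
  }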
1 parent 083476a

2 files changed: +57 -29 lines

include/net/net_namespace.h

Lines changed: 3 additions & 1 deletion
@@ -42,6 +42,7 @@
 #include <linux/idr.h>
 #include <linux/skbuff.h>
 #include <linux/notifier.h>
+#include <linux/xarray.h>
 
 struct user_namespace;
 struct proc_dir_entry;
@@ -69,7 +70,7 @@ struct net {
         atomic_t                dev_unreg_count;
 
         unsigned int            dev_base_seq;   /* protected by rtnl_mutex */
-        int                     ifindex;
+        u32                     ifindex;
 
         spinlock_t              nsid_lock;
         atomic_t                fnhe_genid;
@@ -110,6 +111,7 @@ struct net {
 
         struct hlist_head       *dev_name_head;
         struct hlist_head       *dev_index_head;
+        struct xarray           dev_by_index;
         struct raw_notifier_head        netdev_chain;
 
         /* Note that @hash_mix can be read millions times per second,
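The new dev_by_index member lives alongside the existing name/index hash
tables, which this commit deliberately keeps for lookups. Since xa_load()
is RCU-safe, a later conversion of ifindex lookups could be as simple as
the following sketch (hypothetical function name, not part of this patch):

  /* Caller must hold the RCU read lock or RTNL, as with the hash lookup */
  static struct net_device *netdev_lookup_xa(struct net *net, int ifindex)
  {
          return xa_load(&net->dev_by_index, ifindex);
  }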

net/core/dev.c

Lines changed: 54 additions & 28 deletions
@@ -388,6 +388,8 @@ static void list_netdevice(struct net_device *dev)
         hlist_add_head_rcu(&dev->index_hlist,
                            dev_index_hash(net, dev->ifindex));
         write_unlock(&dev_base_lock);
+        /* We reserved the ifindex, this can't fail */
+        WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
 
         dev_base_seq_inc(net);
 }
@@ -397,8 +399,12 @@ static void list_netdevice(struct net_device *dev)
  */
 static void unlist_netdevice(struct net_device *dev, bool lock)
 {
+        struct net *net = dev_net(dev);
+
         ASSERT_RTNL();
 
+        xa_erase(&net->dev_by_index, dev->ifindex);
+
         /* Unlink dev from the device chain */
         if (lock)
                 write_lock(&dev_base_lock);
@@ -9565,23 +9571,35 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 }
 
 /**
- * dev_new_index - allocate an ifindex
- * @net: the applicable net namespace
+ * dev_index_reserve() - allocate an ifindex in a namespace
+ * @net: the applicable net namespace
+ * @ifindex: requested ifindex, pass %0 to get one allocated
+ *
+ * Allocate a ifindex for a new device. Caller must either use the ifindex
+ * to store the device (via list_netdevice()) or call dev_index_release()
+ * to give the index up.
  *
- * Returns a suitable unique value for a new device interface
- * number.  The caller must hold the rtnl semaphore or the
- * dev_base_lock to be sure it remains unique.
+ * Return: a suitable unique value for a new device interface number or -errno.
  */
-static int dev_new_index(struct net *net)
+static int dev_index_reserve(struct net *net, u32 ifindex)
 {
-        int ifindex = net->ifindex;
+        int err;
 
-        for (;;) {
-                if (++ifindex <= 0)
-                        ifindex = 1;
-                if (!__dev_get_by_index(net, ifindex))
-                        return net->ifindex = ifindex;
-        }
+        if (!ifindex)
+                err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
+                                      xa_limit_31b, &net->ifindex, GFP_KERNEL);
+        else
+                err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
+        if (err < 0)
+                return err;
+
+        return ifindex;
+}
+
+static void dev_index_release(struct net *net, int ifindex)
+{
+        /* Expect only unused indexes, unlist_netdevice() removes the used */
+        WARN_ON(xa_erase(&net->dev_by_index, ifindex));
 }
 
 /* Delayed registration/unregisteration */
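Note the two-step pattern this enables: dev_index_reserve() inserts a NULL
entry, which the xarray keeps as a reserved slot that readers still see as
empty, and list_netdevice() later publishes the real pointer with
xa_store(). A standalone sketch of the same idiom (generic "xa" and "obj"
placeholders, not the kernel code paths):

  /* Reserve index 5; fails with -EBUSY if the index is already taken */
  err = xa_insert(&xa, 5, NULL, GFP_KERNEL);

  /* Publish later; xa_store() returns the old entry, NULL for a reserved slot */
  WARN_ON(xa_store(&xa, 5, obj, GFP_KERNEL));

  /* Error path before publishing: erasing a reserved slot also returns NULL */
  WARN_ON(xa_erase(&xa, 5));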
@@ -10051,11 +10069,10 @@ int register_netdevice(struct net_device *dev)
                 goto err_uninit;
         }
 
-        ret = -EBUSY;
-        if (!dev->ifindex)
-                dev->ifindex = dev_new_index(net);
-        else if (__dev_get_by_index(net, dev->ifindex))
+        ret = dev_index_reserve(net, dev->ifindex);
+        if (ret < 0)
                 goto err_uninit;
+        dev->ifindex = ret;
 
         /* Transfer changeable features to wanted_features and enable
          * software offloads (GSO and GRO).
@@ -10102,7 +10119,7 @@ int register_netdevice(struct net_device *dev)
         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
         ret = notifier_to_errno(ret);
         if (ret)
-                goto err_uninit;
+                goto err_ifindex_release;
 
         ret = netdev_register_kobject(dev);
         write_lock(&dev_base_lock);
@@ -10158,6 +10175,8 @@ int register_netdevice(struct net_device *dev)
 
 err_uninit_notify:
         call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+err_ifindex_release:
+        dev_index_release(net, dev->ifindex);
 err_uninit:
         if (dev->netdev_ops->ndo_uninit)
                 dev->netdev_ops->ndo_uninit(dev);
@@ -11035,9 +11054,19 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
         }
 
         /* Check that new_ifindex isn't used yet. */
-        err = -EBUSY;
-        if (new_ifindex && __dev_get_by_index(net, new_ifindex))
-                goto out;
+        if (new_ifindex) {
+                err = dev_index_reserve(net, new_ifindex);
+                if (err < 0)
+                        goto out;
+        } else {
+                /* If there is an ifindex conflict assign a new one */
+                err = dev_index_reserve(net, dev->ifindex);
+                if (err == -EBUSY)
+                        err = dev_index_reserve(net, 0);
+                if (err < 0)
+                        goto out;
+                new_ifindex = err;
+        }
 
         /*
          *      And now a mini version of register_netdevice unregister_netdevice.
@@ -11065,13 +11094,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
         rcu_barrier();
 
         new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
-        /* If there is an ifindex conflict assign a new one */
-        if (!new_ifindex) {
-                if (__dev_get_by_index(net, dev->ifindex))
-                        new_ifindex = dev_new_index(net);
-                else
-                        new_ifindex = dev->ifindex;
-        }
 
         rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
                             new_ifindex);
@@ -11249,6 +11271,9 @@ static int __net_init netdev_init(struct net *net)
         if (net->dev_index_head == NULL)
                 goto err_idx;
 
+        net->ifindex = 1;
+        xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC);
+
         RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
 
         return 0;
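xa_alloc_cyclic() requires an xarray initialized with XA_FLAGS_ALLOC, and
net->ifindex now doubles as the cyclic allocator's "next" cursor, seeded
to 1 so the first device gets ifindex 1. A minimal sketch of the same
setup in isolation (local variables stand in for the netns fields):

  struct xarray xa;
  u32 next = 1, id;
  int err;

  xa_init_flags(&xa, XA_FLAGS_ALLOC);
  /* Allocate the lowest free id starting from "next" within xa_limit_31b;
   * on wrap the search restarts from the bottom of the range, and on
   * success "next" is advanced past the id just handed out.
   */
  err = xa_alloc_cyclic(&xa, &id, NULL, xa_limit_31b, &next, GFP_KERNEL);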
@@ -11346,6 +11371,7 @@ static void __net_exit netdev_exit(struct net *net)
 {
         kfree(net->dev_name_head);
         kfree(net->dev_index_head);
+        xa_destroy(&net->dev_by_index);
         if (net != &init_net)
                 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
 }
