Skip to content

Commit 324e227

Browse files
committed
RDMA/device: Add ib_device_get_by_netdev()
Several drivers need to find the ib_device from a given netdev. rxe needs this at speed in an unsleepable context, so implement the translation using an RCU-safe hash table. The hash table can have a many-to-one mapping. This is intended to support some future case where multiple IB drivers (e.g. iWarp and RoCE) connect to the same netdevs. driver_ids will need to be different to support this. In the process this makes struct ib_device and struct ib_port_data RCU-safe by deferring their kfrees. Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent c2261dd commit 324e227

File tree

2 files changed

+116
-13
lines changed

2 files changed

+116
-13
lines changed

drivers/infiniband/core/device.c

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include <linux/netdevice.h>
4141
#include <linux/security.h>
4242
#include <linux/notifier.h>
43+
#include <linux/hashtable.h>
4344
#include <rdma/rdma_netlink.h>
4445
#include <rdma/ib_addr.h>
4546
#include <rdma/ib_cache.h>
@@ -134,6 +135,10 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
134135
!xa_is_err(entry); \
135136
(index)++, entry = xan_find_marked(xa, &(index), filter))
136137

138+
/* RCU hash table mapping netdevice pointers to struct ib_port_data */
139+
static DEFINE_SPINLOCK(ndev_hash_lock);
140+
static DECLARE_HASHTABLE(ndev_hash, 5);
141+
137142
static void free_netdevs(struct ib_device *ib_dev);
138143
static int ib_security_change(struct notifier_block *nb, unsigned long event,
139144
void *lsm_data);
@@ -144,6 +149,12 @@ static struct notifier_block ibdev_lsm_nb = {
144149
.notifier_call = ib_security_change,
145150
};
146151

152+
/* Pointer to the RCU head at the start of the ib_port_data array */
153+
struct ib_port_data_rcu {
154+
struct rcu_head rcu_head;
155+
struct ib_port_data pdata[];
156+
};
157+
147158
static int ib_device_check_mandatory(struct ib_device *device)
148159
{
149160
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
@@ -295,9 +306,12 @@ static void ib_device_release(struct device *device)
295306
WARN_ON(refcount_read(&dev->refcount));
296307
ib_cache_release_one(dev);
297308
ib_security_release_port_pkey_list(dev);
298-
kfree(dev->port_data);
299309
xa_destroy(&dev->client_data);
300-
kfree(dev);
310+
if (dev->port_data)
311+
kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
312+
pdata[0]),
313+
rcu_head);
314+
kfree_rcu(dev, rcu_head);
301315
}
302316

303317
static int ib_device_uevent(struct device *device,
@@ -468,6 +482,7 @@ static void remove_client_context(struct ib_device *device,
468482

469483
static int alloc_port_data(struct ib_device *device)
470484
{
485+
struct ib_port_data_rcu *pdata_rcu;
471486
unsigned int port;
472487

473488
if (device->port_data)
@@ -484,17 +499,26 @@ static int alloc_port_data(struct ib_device *device)
484499
* Therefore port_data is declared as a 1 based array with potential
485500
* empty slots at the beginning.
486501
*/
487-
device->port_data = kcalloc(rdma_end_port(device) + 1,
488-
sizeof(*device->port_data), GFP_KERNEL);
489-
if (!device->port_data)
502+
pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
503+
rdma_end_port(device) + 1),
504+
GFP_KERNEL);
505+
if (!pdata_rcu)
490506
return -ENOMEM;
507+
/*
508+
* The rcu_head is put in front of the port data array and the stored
509+
* pointer is adjusted since we never need to see that member until
510+
* kfree_rcu.
511+
*/
512+
device->port_data = pdata_rcu->pdata;
491513

492514
rdma_for_each_port (device, port) {
493515
struct ib_port_data *pdata = &device->port_data[port];
494516

517+
pdata->ib_dev = device;
495518
spin_lock_init(&pdata->pkey_list_lock);
496519
INIT_LIST_HEAD(&pdata->pkey_list);
497520
spin_lock_init(&pdata->netdev_lock);
521+
INIT_HLIST_NODE(&pdata->ndev_hash_link);
498522
}
499523
return 0;
500524
}
@@ -1042,6 +1066,29 @@ int ib_query_port(struct ib_device *device,
10421066
}
10431067
EXPORT_SYMBOL(ib_query_port);
10441068

1069+
static void add_ndev_hash(struct ib_port_data *pdata)
1070+
{
1071+
unsigned long flags;
1072+
1073+
might_sleep();
1074+
1075+
spin_lock_irqsave(&ndev_hash_lock, flags);
1076+
if (hash_hashed(&pdata->ndev_hash_link)) {
1077+
hash_del_rcu(&pdata->ndev_hash_link);
1078+
spin_unlock_irqrestore(&ndev_hash_lock, flags);
1079+
/*
1080+
* We cannot do hash_add_rcu after a hash_del_rcu until the
1081+
* grace period
1082+
*/
1083+
synchronize_rcu();
1084+
spin_lock_irqsave(&ndev_hash_lock, flags);
1085+
}
1086+
if (pdata->netdev)
1087+
hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
1088+
(uintptr_t)pdata->netdev);
1089+
spin_unlock_irqrestore(&ndev_hash_lock, flags);
1090+
}
1091+
10451092
/**
10461093
* ib_device_set_netdev - Associate the ib_dev with an underlying net_device
10471094
* @ib_dev: Device to modify
@@ -1078,17 +1125,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
10781125

10791126
pdata = &ib_dev->port_data[port];
10801127
spin_lock_irqsave(&pdata->netdev_lock, flags);
1081-
if (pdata->netdev == ndev) {
1128+
old_ndev = rcu_dereference_protected(
1129+
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
1130+
if (old_ndev == ndev) {
10821131
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
10831132
return 0;
10841133
}
1085-
old_ndev = pdata->netdev;
10861134

10871135
if (ndev)
10881136
dev_hold(ndev);
1089-
pdata->netdev = ndev;
1137+
rcu_assign_pointer(pdata->netdev, ndev);
10901138
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
10911139

1140+
add_ndev_hash(pdata);
10921141
if (old_ndev)
10931142
dev_put(old_ndev);
10941143

@@ -1103,11 +1152,24 @@ static void free_netdevs(struct ib_device *ib_dev)
11031152

11041153
rdma_for_each_port (ib_dev, port) {
11051154
struct ib_port_data *pdata = &ib_dev->port_data[port];
1155+
struct net_device *ndev;
11061156

11071157
spin_lock_irqsave(&pdata->netdev_lock, flags);
1108-
if (pdata->netdev) {
1109-
dev_put(pdata->netdev);
1110-
pdata->netdev = NULL;
1158+
ndev = rcu_dereference_protected(
1159+
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
1160+
if (ndev) {
1161+
spin_lock(&ndev_hash_lock);
1162+
hash_del_rcu(&pdata->ndev_hash_link);
1163+
spin_unlock(&ndev_hash_lock);
1164+
1165+
/*
1166+
* If this is the last dev_put there is still a
1167+
* synchronize_rcu before the netdev is kfreed, so we
1168+
* can continue to rely on unlocked pointer
1169+
* comparisons after the put
1170+
*/
1171+
rcu_assign_pointer(pdata->netdev, NULL);
1172+
dev_put(ndev);
11111173
}
11121174
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
11131175
}
@@ -1132,7 +1194,8 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
11321194
res = ib_dev->ops.get_netdev(ib_dev, port);
11331195
else {
11341196
spin_lock(&pdata->netdev_lock);
1135-
res = pdata->netdev;
1197+
res = rcu_dereference_protected(
1198+
pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
11361199
if (res)
11371200
dev_hold(res);
11381201
spin_unlock(&pdata->netdev_lock);
@@ -1150,6 +1213,38 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
11501213
return res;
11511214
}
11521215

1216+
/**
1217+
* ib_device_get_by_netdev - Find an IB device associated with a netdev
1218+
* @ndev: netdev to locate
1219+
* @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
1220+
*
1221+
* Find and hold an ib_device that is associated with a netdev via
1222+
* ib_device_set_netdev(). The caller must call ib_device_put() on the
1223+
* returned pointer.
1224+
*/
1225+
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
1226+
enum rdma_driver_id driver_id)
1227+
{
1228+
struct ib_device *res = NULL;
1229+
struct ib_port_data *cur;
1230+
1231+
rcu_read_lock();
1232+
hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
1233+
(uintptr_t)ndev) {
1234+
if (rcu_access_pointer(cur->netdev) == ndev &&
1235+
(driver_id == RDMA_DRIVER_UNKNOWN ||
1236+
cur->ib_dev->driver_id == driver_id) &&
1237+
ib_device_try_get(cur->ib_dev)) {
1238+
res = cur->ib_dev;
1239+
break;
1240+
}
1241+
}
1242+
rcu_read_unlock();
1243+
1244+
return res;
1245+
}
1246+
EXPORT_SYMBOL(ib_device_get_by_netdev);
1247+
11531248
/**
11541249
* ib_enum_roce_netdev - enumerate all RoCE ports
11551250
* @ib_dev : IB device we want to query

include/rdma/ib_verbs.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2198,6 +2198,8 @@ struct ib_port_immutable {
21982198
};
21992199

22002200
struct ib_port_data {
2201+
struct ib_device *ib_dev;
2202+
22012203
struct ib_port_immutable immutable;
22022204

22032205
spinlock_t pkey_list_lock;
@@ -2206,7 +2208,8 @@ struct ib_port_data {
22062208
struct ib_port_cache cache;
22072209

22082210
spinlock_t netdev_lock;
2209-
struct net_device *netdev;
2211+
struct net_device __rcu *netdev;
2212+
struct hlist_node ndev_hash_link;
22102213
};
22112214

22122215
/* rdma netdev type - specifies protocol type */
@@ -2545,6 +2548,7 @@ struct ib_device {
25452548
struct device *dma_device;
25462549
struct ib_device_ops ops;
25472550
char name[IB_DEVICE_NAME_MAX];
2551+
struct rcu_head rcu_head;
25482552

25492553
struct list_head event_handler_list;
25502554
spinlock_t event_handler_lock;
@@ -3996,6 +4000,10 @@ static inline bool ib_device_try_get(struct ib_device *dev)
39964000
}
39974001

39984002
void ib_device_put(struct ib_device *device);
4003+
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
4004+
enum rdma_driver_id driver_id);
4005+
struct ib_device *ib_device_get_by_name(const char *name,
4006+
enum rdma_driver_id driver_id);
39994007
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
40004008
u16 pkey, const union ib_gid *gid,
40014009
const struct sockaddr *addr);

0 commit comments

Comments
 (0)