Skip to content

Commit ceb99ba

Browse files
Wengang-oracle authored and Mukesh Kacker committed
rds: free ib_device related resource
There is a (rare) case that an ib_device gets removed (driver unload) while the upper layer (RDS) still holds references to resources allocated from this ib_device. The result is either a memory leak or a crash when accessing the freed memory. The resources are mainly rds_ib_mr objects; in-use rds_ib_mr (rds_mr) objects are stored in rds_sock.rs_rdma_keys.

The fix is to:
1) link up all in-use rds_ib_mr objects to the pool
2) link the rds_sock to the rds_ib_mr
3) have the destroy of the rds_ib_mr_pool take care of freeing rds_ib_mrs by calling rds_rdma_drop_keys()

Orabug: 21795824

Signed-off-by: Wengang Wang <[email protected]>
Acked-by: Chien Yen <[email protected]>
Signed-off-by: Guangyu Sun <[email protected]>
Signed-off-by: Mukesh Kacker <[email protected]>
1 parent 23f63db commit ceb99ba

File tree

4 files changed

+95
-40
lines changed

4 files changed

+95
-40
lines changed

net/rds/ib.c

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -201,27 +201,42 @@ void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
201201
* rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
202202
* from interrupt context so we push freing off into a work struct in krdsd.
203203
*/
204-
static void rds_ib_dev_free(struct work_struct *work)
204+
205+
/* free up rds_ibdev->dev related resource. We have to wait until freeing
206+
* work is done to avoid the racing with freeing in mlx4_remove_one ->
207+
* mlx4_cleanup_mr_table path
208+
*/
209+
static void rds_ib_dev_free_dev(struct rds_ib_device *rds_ibdev)
205210
{
206-
struct rds_ib_ipaddr *i_ipaddr, *i_next;
207-
struct rds_ib_device *rds_ibdev = container_of(work,
208-
struct rds_ib_device, free_work);
211+
mutex_lock(&rds_ibdev->free_dev_lock);
212+
if (!atomic_dec_and_test(&rds_ibdev->free_dev))
213+
goto out;
209214

210-
if (rds_ibdev->dev) {
211-
if (rds_ibdev->mr_8k_pool)
212-
rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
213-
if (rds_ibdev->mr_1m_pool)
214-
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
215-
if (rds_ibdev->mr)
216-
ib_dereg_mr(rds_ibdev->mr);
217-
if (rds_ibdev->pd)
218-
ib_dealloc_pd(rds_ibdev->pd);
219-
}
220215
if (rds_ibdev->srq) {
221216
rds_ib_srq_exit(rds_ibdev);
222217
kfree(rds_ibdev->srq);
223218
}
224219

220+
if (rds_ibdev->mr_8k_pool)
221+
rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool);
222+
if (rds_ibdev->mr_1m_pool)
223+
rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool);
224+
if (rds_ibdev->mr)
225+
ib_dereg_mr(rds_ibdev->mr);
226+
if (rds_ibdev->pd)
227+
ib_dealloc_pd(rds_ibdev->pd);
228+
out:
229+
mutex_unlock(&rds_ibdev->free_dev_lock);
230+
}
231+
232+
static void rds_ib_dev_free(struct work_struct *work)
233+
{
234+
struct rds_ib_ipaddr *i_ipaddr, *i_next;
235+
struct rds_ib_device *rds_ibdev = container_of(work,
236+
struct rds_ib_device, free_work);
237+
238+
rds_ib_dev_free_dev(rds_ibdev);
239+
225240
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
226241
list_del(&i_ipaddr->list);
227242
kfree(i_ipaddr);
@@ -230,13 +245,7 @@ static void rds_ib_dev_free(struct work_struct *work)
230245
if (rds_ibdev->vector_load)
231246
kfree(rds_ibdev->vector_load);
232247

233-
if (rds_ibdev->dev) {
234-
WARN_ON(!waitqueue_active(&rds_ibdev->wait));
235-
236-
rds_ibdev->done = 1;
237-
wake_up(&rds_ibdev->wait);
238-
} else
239-
kfree(rds_ibdev);
248+
kfree(rds_ibdev);
240249
}
241250

242251
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
@@ -314,22 +323,9 @@ void rds_ib_remove_one(struct ib_device *device)
314323
*/
315324
synchronize_rcu();
316325
rds_ib_dev_put(rds_ibdev);
326+
/* free up lower layer resource since it may be the last change */
327+
rds_ib_dev_free_dev(rds_ibdev);
317328
rds_ib_dev_put(rds_ibdev);
318-
319-
if (!wait_event_timeout(rds_ibdev->wait, rds_ibdev->done, 30*HZ)) {
320-
printk(KERN_WARNING "RDS/IB: device cleanup timed out after "
321-
" 30 secs (refcount=%d)\n",
322-
atomic_read(&rds_ibdev->refcount));
323-
/* when rds_ib_remove_one return, driver's mlx4_ib_removeone
324-
* function destroy ib_device, so we must clear rds_ibdev->dev
325-
* to NULL, or will cause crash when rds connection be released,
326-
* at the moment rds_ib_dev_free through ib_device
327-
* .i.e rds_ibdev->dev to release mr and fmr, reusing the
328-
* released ib_device will cause crash.
329-
*/
330-
rds_ibdev->dev = NULL;
331-
} else
332-
kfree(rds_ibdev);
333329
}
334330

335331
struct ib_client rds_ib_client = {
@@ -2105,11 +2101,11 @@ void rds_ib_add_one(struct ib_device *device)
21052101
if (!rds_ibdev)
21062102
goto free_attr;
21072103

2104+
atomic_set(&rds_ibdev->free_dev, 1);
2105+
mutex_init(&rds_ibdev->free_dev_lock);
21082106
spin_lock_init(&rds_ibdev->spinlock);
21092107
atomic_set(&rds_ibdev->refcount, 1);
21102108
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
2111-
init_waitqueue_head(&rds_ibdev->wait);
2112-
rds_ibdev->done = 0;
21132109

21142110
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
21152111
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);

net/rds/ib.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -433,8 +433,13 @@ struct rds_ib_device {
433433
struct rds_ib_port *ports;
434434
struct ib_event_handler event_handler;
435435
int *vector_load;
436-
wait_queue_head_t wait;
437-
int done;
436+
437+
/* flag indicating ib_device is under freeing up or is freed up to make
438+
* the race between rds_ib_remove_one() and rds_release() safe.
439+
*/
440+
atomic_t free_dev;
441+
/* wait until freeing work is done */
442+
struct mutex free_dev_lock;
438443
};
439444

440445
#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
@@ -485,6 +490,8 @@ struct rds_ib_statistics {
485490
uint64_t s_ib_rdma_mr_1m_pool_flush;
486491
uint64_t s_ib_rdma_mr_1m_pool_wait;
487492
uint64_t s_ib_rdma_mr_1m_pool_depleted;
493+
uint64_t s_ib_rdma_mr_1m_pool_reuse;
494+
uint64_t s_ib_rdma_mr_8k_pool_reuse;
488495
uint64_t s_ib_atomic_cswp;
489496
uint64_t s_ib_atomic_fadd;
490497
uint64_t s_ib_srq_lows;

net/rds/ib_rdma.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ struct rds_ib_mr {
6060
unsigned int sg_len;
6161
u64 *dma;
6262
int sg_dma_len;
63+
64+
struct rds_sock *rs;
65+
struct list_head pool_list;
6366
};
6467

6568
/*
@@ -83,6 +86,13 @@ struct rds_ib_mr_pool {
8386
atomic_t max_items_soft;
8487
unsigned long max_free_pinned;
8588
struct ib_fmr_attr fmr_attr;
89+
90+
spinlock_t busy_lock; /* protect ops on 'busy_list' */
91+
/* All in use MRs allocated from this pool are listed here. This list
92+
* helps freeing up in use MRs when a ib_device got removed while it's
93+
* resources are still in use in RDS layer. Protected by busy_lock.
94+
*/
95+
struct list_head busy_list;
8696
};
8797

8898
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
@@ -232,6 +242,9 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
232242
if (!pool)
233243
return ERR_PTR(-ENOMEM);
234244

245+
spin_lock_init(&pool->busy_lock);
246+
INIT_LIST_HEAD(&pool->busy_list);
247+
235248
pool->pool_type = pool_type;
236249
INIT_XLIST_HEAD(&pool->free_list);
237250
INIT_XLIST_HEAD(&pool->drop_list);
@@ -266,6 +279,22 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
266279

267280
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
268281
{
282+
struct rds_ib_mr *ibmr;
283+
LIST_HEAD(drp_list);
284+
285+
/* move MRs in in-use list to drop or free list */
286+
spin_lock(&pool->busy_lock);
287+
list_splice_init(&pool->busy_list, &drp_list);
288+
spin_unlock(&pool->busy_lock);
289+
290+
/* rds_rdma_drop_keys may drops more than one MRs in one iteration */
291+
while (!list_empty(&drp_list)) {
292+
ibmr = list_first_entry(&drp_list, struct rds_ib_mr, pool_list);
293+
list_del_init(&ibmr->pool_list);
294+
if (ibmr->rs)
295+
rds_rdma_drop_keys(ibmr->rs);
296+
}
297+
269298
cancel_delayed_work_sync(&pool->flush_worker);
270299
rds_ib_flush_mr_pool(pool, 1, NULL);
271300
WARN_ON(atomic_read(&pool->item_count));
@@ -296,6 +325,15 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
296325

297326
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
298327
preempt_enable();
328+
if (ibmr) {
329+
if (pool->pool_type == RDS_IB_MR_8K_POOL)
330+
rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_reuse);
331+
else
332+
rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_reuse);
333+
spin_lock(&pool->busy_lock);
334+
list_add(&ibmr->pool_list, &pool->busy_list);
335+
spin_unlock(&pool->busy_lock);
336+
}
299337
return ibmr;
300338
}
301339

@@ -373,6 +411,11 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
373411

374412
memset(ibmr, 0, sizeof(*ibmr));
375413

414+
INIT_LIST_HEAD(&ibmr->pool_list);
415+
spin_lock(&pool->busy_lock);
416+
list_add(&ibmr->pool_list, &pool->busy_list);
417+
spin_unlock(&pool->busy_lock);
418+
376419
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
377420
(IB_ACCESS_LOCAL_WRITE |
378421
IB_ACCESS_REMOTE_READ |
@@ -812,6 +855,13 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
812855

813856
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
814857

858+
ibmr->rs = NULL;
859+
860+
/* remove from pool->busy_list or a tmp list(destroy path) */
861+
spin_lock(&pool->busy_lock);
862+
list_del_init(&ibmr->pool_list);
863+
spin_unlock(&pool->busy_lock);
864+
815865
/* Return it to the pool's free list */
816866
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
817867
xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
@@ -886,6 +936,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
886936
else
887937
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
888938

939+
ibmr->rs = rs;
889940
ibmr->device = rds_ibdev;
890941
rds_ibdev = NULL;
891942

net/rds/rdma.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
149149
if (rs->rs_transport && rs->rs_transport->flush_mrs)
150150
rs->rs_transport->flush_mrs();
151151
}
152+
EXPORT_SYMBOL_GPL(rds_rdma_drop_keys);
152153

153154
/*
154155
* Helper function to pin user pages.

0 commit comments

Comments
 (0)