
Commit e2ac236

chucklever authored and amschuma-ntap committed
xprtrdma: Allocate MRs on demand
Frequent MR list exhaustion can impact I/O throughput, so enough MRs are
always created during transport set-up to prevent running out. This means
more MRs are created than most workloads need.

Commit 94f58c5 ("xprtrdma: Allow Read list and Reply chunk simultaneously")
introduced support for sending two chunk lists per RPC, which consumes more
MRs per RPC.

Instead of trying to provision more MRs, introduce a mechanism for
allocating MRs on demand. A few MRs are allocated during transport set-up
to kick things off.

This significantly reduces the average number of MRs per transport while
allowing the MR count to grow for workloads or devices that need more MRs.

FRWR with mlx4 allocated almost 400 MRs per transport before this patch.
Now it starts with 32.

Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
Signed-off-by: Anna Schumaker <[email protected]>
1 parent a54d405 commit e2ac236

5 files changed, +114 -124 lines changed
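
The refill pattern this patch introduces is easier to see in miniature before reading the per-file diffs. Below is an illustrative userspace sketch, not kernel code: every name in it (fake_mr, mr_pool, pool_refresh, pool_get) is invented for the example, and where the real rpcrdma_get_mw() schedules a worker and returns NULL, the sketch simply refills inline.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for a registered memory region. */
struct fake_mr {
	struct fake_mr *next;	/* free-list linkage */
	int id;
};

struct mr_pool {
	struct fake_mr *free;	/* LIFO free list */
	unsigned int allocated;	/* counts like rx_stats.mrs_allocated */
};

/* Like rpcrdma_create_mrs(): grow the pool by a fixed batch of 32.
 * A partial batch on allocation failure is fine; the pool simply
 * tries again the next time it runs dry.
 */
static void pool_refresh(struct mr_pool *pool)
{
	unsigned int count;

	for (count = 0; count < 32; count++) {
		struct fake_mr *mr = calloc(1, sizeof(*mr));

		if (!mr)
			break;
		mr->id = (int)(pool->allocated + count);
		mr->next = pool->free;
		pool->free = mr;
	}
	pool->allocated += count;
	printf("created %u MRs (total %u)\n", count, pool->allocated);
}

/* Like rpcrdma_get_mw(): take one MR; on a miss, trigger a refill.
 * (The kernel defers the refill to a work item and returns NULL.)
 */
static struct fake_mr *pool_get(struct mr_pool *pool)
{
	struct fake_mr *mr = pool->free;

	if (!mr) {
		pool_refresh(pool);
		mr = pool->free;
	}
	if (mr)
		pool->free = mr->next;
	return mr;
}

int main(void)
{
	struct mr_pool pool = { NULL, 0 };
	struct fake_mr *mr = pool_get(&pool);	/* first get triggers a refill */

	printf("got MR %d\n", mr ? mr->id : -1);
	free(mr);
	return 0;
}

The kernel version differs mainly in locking and context: new MRs are built on local lists and spliced into rb_mws/rb_all under rb_mwlock, and the refill runs from a delayed work item rather than in the caller's context.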

net/sunrpc/xprtrdma/fmr_ops.c

Lines changed: 7 additions & 57 deletions
@@ -46,7 +46,7 @@ fmr_is_supported(struct rpcrdma_ia *ia)
 }
 
 static int
-__fmr_init(struct rpcrdma_mw *mw, struct ib_pd *pd)
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
 {
 	static struct ib_fmr_attr fmr_attr = {
 		.max_pages	= RPCRDMA_MAX_FMR_SGES,
@@ -66,7 +66,7 @@ __fmr_init(struct rpcrdma_mw *mw, struct ib_pd *pd)
 
 	sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
 
-	mw->fmr.fm_mr = ib_alloc_fmr(pd, RPCRDMA_FMR_ACCESS_FLAGS,
+	mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
 				     &fmr_attr);
 	if (IS_ERR(mw->fmr.fm_mr))
 		goto out_fmr_err;
@@ -96,7 +96,7 @@ __fmr_unmap(struct rpcrdma_mw *mw)
 }
 
 static void
-__fmr_release(struct rpcrdma_mw *r)
+fmr_op_release_mr(struct rpcrdma_mw *r)
 {
 	LIST_HEAD(unmap_list);
 	int rc;
@@ -116,13 +116,11 @@ __fmr_release(struct rpcrdma_mw *r)
 	if (rc)
 		pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
 		       r, rc);
+
+	kfree(r);
 }
 
 /* Reset of a single FMR.
- *
- * There's no recovery if this fails. The FMR is abandoned, but
- * remains in rb_all. It will be cleaned up when the transport is
- * destroyed.
  */
 static void
 fmr_op_recover_mr(struct rpcrdma_mw *mw)
@@ -166,41 +164,6 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
 }
 
-static int
-fmr_op_init(struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-	struct rpcrdma_mw *r;
-	int i, rc;
-
-	spin_lock_init(&buf->rb_mwlock);
-	INIT_LIST_HEAD(&buf->rb_mws);
-	INIT_LIST_HEAD(&buf->rb_all);
-
-	i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
-	i += 2;				/* head + tail */
-	i *= buf->rb_max_requests;	/* one set for each RPC slot */
-	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
-
-	while (i--) {
-		r = kzalloc(sizeof(*r), GFP_KERNEL);
-		if (!r)
-			return -ENOMEM;
-
-		rc = __fmr_init(r, pd);
-		if (rc) {
-			kfree(r);
-			return rc;
-		}
-
-		r->mw_xprt = r_xprt;
-		list_add(&r->mw_list, &buf->rb_mws);
-		list_add(&r->mw_all, &buf->rb_all);
-	}
-	return 0;
-}
-
 /* Use the ib_map_phys_fmr() verb to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -374,27 +337,14 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	}
 }
 
-static void
-fmr_op_destroy(struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-
-	while (!list_empty(&buf->rb_all)) {
-		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&r->mw_all);
-		__fmr_release(r);
-		kfree(r);
-	}
-}
-
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
 	.ro_unmap_sync			= fmr_op_unmap_sync,
 	.ro_unmap_safe			= fmr_op_unmap_safe,
 	.ro_recover_mr			= fmr_op_recover_mr,
 	.ro_open			= fmr_op_open,
 	.ro_maxpages			= fmr_op_maxpages,
-	.ro_init			= fmr_op_init,
-	.ro_destroy			= fmr_op_destroy,
+	.ro_init_mr			= fmr_op_init_mr,
+	.ro_release_mr			= fmr_op_release_mr,
 	.ro_displayname			= "fmr",
 };
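
Note the shape of the change: whole-pool construction and teardown (fmr_op_init/fmr_op_destroy) move out of the registration-mode code, and the ops table instead exposes per-MR hooks. ro_release_mr now also frees the rpcrdma_mw itself (the new kfree(r)), which is why the common code that walks rb_all no longer follows the release with its own kfree.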

net/sunrpc/xprtrdma/frwr_ops.c

Lines changed: 7 additions & 57 deletions
@@ -91,12 +91,13 @@ frwr_is_supported(struct rpcrdma_ia *ia)
 }
 
 static int
-__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, unsigned int depth)
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
 {
+	unsigned int depth = ia->ri_max_frmr_depth;
 	struct rpcrdma_frmr *f = &r->frmr;
 	int rc;
 
-	f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
+	f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
 	if (IS_ERR(f->fr_mr))
 		goto out_mr_err;
 
@@ -123,7 +124,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, unsigned int depth)
 }
 
 static void
-__frwr_release(struct rpcrdma_mw *r)
+frwr_op_release_mr(struct rpcrdma_mw *r)
 {
 	int rc;
 
@@ -132,6 +133,7 @@ __frwr_release(struct rpcrdma_mw *r)
 		pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
 		       r, rc);
 	kfree(r->mw_sg);
+	kfree(r);
 }
 
 static int
@@ -319,45 +321,6 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 	complete_all(&frmr->fr_linv_done);
 }
 
-static int
-frwr_op_init(struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
-	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
-	int i;
-
-	spin_lock_init(&buf->rb_mwlock);
-	INIT_LIST_HEAD(&buf->rb_mws);
-	INIT_LIST_HEAD(&buf->rb_all);
-
-	i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
-	i += 2;				/* head + tail */
-	i *= buf->rb_max_requests;	/* one set for each RPC slot */
-	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
-
-	while (i--) {
-		struct rpcrdma_mw *r;
-		int rc;
-
-		r = kzalloc(sizeof(*r), GFP_KERNEL);
-		if (!r)
-			return -ENOMEM;
-
-		rc = __frwr_init(r, pd, depth);
-		if (rc) {
-			kfree(r);
-			return rc;
-		}
-
-		r->mw_xprt = r_xprt;
-		list_add(&r->mw_list, &buf->rb_mws);
-		list_add(&r->mw_all, &buf->rb_all);
-	}
-
-	return 0;
-}
-
 /* Post a REG_MR Work Request to register a memory region
  * for remote access via RDMA READ or RDMA WRITE.
  */
@@ -618,27 +581,14 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	}
 }
 
-static void
-frwr_op_destroy(struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-
-	while (!list_empty(&buf->rb_all)) {
-		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&r->mw_all);
-		__frwr_release(r);
-		kfree(r);
-	}
-}
-
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
 	.ro_unmap_sync			= frwr_op_unmap_sync,
 	.ro_unmap_safe			= frwr_op_unmap_safe,
 	.ro_recover_mr			= frwr_op_recover_mr,
 	.ro_open			= frwr_op_open,
 	.ro_maxpages			= frwr_op_maxpages,
-	.ro_init			= frwr_op_init,
-	.ro_destroy			= frwr_op_destroy,
+	.ro_init_mr			= frwr_op_init_mr,
+	.ro_release_mr			= frwr_op_release_mr,
 	.ro_displayname			= "frwr",
 };
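
The FRWR conversion mirrors the FMR one, with one extra wrinkle: frwr_op_init_mr reads the registration depth from ia->ri_max_frmr_depth itself, so the depth parameter that __frwr_init used to take disappears along with the pool-construction loop.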

net/sunrpc/xprtrdma/transport.c

Lines changed: 3 additions & 2 deletions
@@ -682,9 +682,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 		   r_xprt->rx_stats.failed_marshal_count,
 		   r_xprt->rx_stats.bad_reply_count,
 		   r_xprt->rx_stats.nomsg_call_count);
-	seq_printf(seq, "%lu %lu\n",
+	seq_printf(seq, "%lu %lu %lu\n",
 		   r_xprt->rx_stats.mrs_recovered,
-		   r_xprt->rx_stats.mrs_orphaned);
+		   r_xprt->rx_stats.mrs_orphaned,
+		   r_xprt->rx_stats.mrs_allocated);
 }
 
 static int
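
With this change, the final line of the xprtrdma stats output grows a third counter: mrs_allocated joins mrs_recovered and mrs_orphaned. Anything that parses this seq_file output (for example, a mountstats scraper) should expect the extra %lu field.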

net/sunrpc/xprtrdma/verbs.c

Lines changed: 92 additions & 6 deletions
@@ -782,6 +782,55 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
 	schedule_delayed_work(&buf->rb_recovery_worker, 0);
 }
 
+static void
+rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	unsigned int count;
+	LIST_HEAD(free);
+	LIST_HEAD(all);
+
+	for (count = 0; count < 32; count++) {
+		struct rpcrdma_mw *mw;
+		int rc;
+
+		mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+		if (!mw)
+			break;
+
+		rc = ia->ri_ops->ro_init_mr(ia, mw);
+		if (rc) {
+			kfree(mw);
+			break;
+		}
+
+		mw->mw_xprt = r_xprt;
+
+		list_add(&mw->mw_list, &free);
+		list_add(&mw->mw_all, &all);
+	}
+
+	spin_lock(&buf->rb_mwlock);
+	list_splice(&free, &buf->rb_mws);
+	list_splice(&all, &buf->rb_all);
+	r_xprt->rx_stats.mrs_allocated += count;
+	spin_unlock(&buf->rb_mwlock);
+
+	dprintk("RPC:       %s: created %u MRs\n", __func__, count);
+}
+
+static void
+rpcrdma_mr_refresh_worker(struct work_struct *work)
+{
+	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+						  rb_refresh_worker.work);
+	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+						   rx_buf);
+
+	rpcrdma_create_mrs(r_xprt);
+}
+
 struct rpcrdma_req *
 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
 {
@@ -837,21 +886,23 @@ int
 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	int i, rc;
 
 	buf->rb_max_requests = r_xprt->rx_data.max_requests;
 	buf->rb_bc_srv_max_requests = 0;
 	atomic_set(&buf->rb_credits, 1);
+	spin_lock_init(&buf->rb_mwlock);
 	spin_lock_init(&buf->rb_lock);
 	spin_lock_init(&buf->rb_recovery_lock);
+	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_all);
 	INIT_LIST_HEAD(&buf->rb_stale_mrs);
+	INIT_DELAYED_WORK(&buf->rb_refresh_worker,
+			  rpcrdma_mr_refresh_worker);
 	INIT_DELAYED_WORK(&buf->rb_recovery_worker,
 			  rpcrdma_mr_recovery_worker);
 
-	rc = ia->ri_ops->ro_init(r_xprt);
-	if (rc)
-		goto out;
+	rpcrdma_create_mrs(r_xprt);
 
 	INIT_LIST_HEAD(&buf->rb_send_bufs);
 	INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -927,6 +978,32 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 	kfree(req);
 }
 
+static void
+rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+						   rx_buf);
+	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+	struct rpcrdma_mw *mw;
+	unsigned int count;
+
+	count = 0;
+	spin_lock(&buf->rb_mwlock);
+	while (!list_empty(&buf->rb_all)) {
+		mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&mw->mw_all);
+
+		spin_unlock(&buf->rb_mwlock);
+		ia->ri_ops->ro_release_mr(mw);
+		count++;
+		spin_lock(&buf->rb_mwlock);
+	}
+	spin_unlock(&buf->rb_mwlock);
+	r_xprt->rx_stats.mrs_allocated = 0;
+
+	dprintk("RPC:       %s: released %u MRs\n", __func__, count);
+}
+
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
@@ -955,7 +1032,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 	}
 	spin_unlock(&buf->rb_reqslock);
 
-	ia->ri_ops->ro_destroy(buf);
+	rpcrdma_destroy_mrs(buf);
 }
 
 struct rpcrdma_mw *
@@ -973,8 +1050,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
 	spin_unlock(&buf->rb_mwlock);
 
 	if (!mw)
-		pr_err("RPC:       %s: no MWs available\n", __func__);
+		goto out_nomws;
 	return mw;
+
+out_nomws:
+	dprintk("RPC:       %s: no MWs available\n", __func__);
+	schedule_delayed_work(&buf->rb_refresh_worker, 0);
+
+	/* Allow the reply handler and refresh worker to run */
+	cond_resched();
+
+	return NULL;
 }
 
 void
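
Three details in the verbs.c hunks are worth calling out. rpcrdma_create_mrs builds new MRs on local free/all lists and takes rb_mwlock only for the final splice, so the allocations (which can sleep) happen outside the spinlock. rpcrdma_destroy_mrs does the converse: it drops rb_mwlock around each ro_release_mr call, since releasing an MR may also sleep. And when rpcrdma_get_mw finds the list empty, it schedules rb_refresh_worker and calls cond_resched() before returning NULL, giving the reply handler a chance to return MRs and the refresh worker a chance to run.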
