Skip to content

Commit 7a89f9c

Browse files
chucklever authored and amschuma-ntap committed
xprtrdma: Honor ->send_request API contract
Commit c93c622 ("xprtrdma: Disconnect on registration failure") added a disconnect for some RPC marshaling failures. This is needed only in a handful of cases, but it was triggering for simple stuff like temporary resource shortages. Try to straighten this out.

Fix up the lower layers so they don't return -ENOMEM or other error codes that the RPC client's FSM doesn't explicitly recognize.

Also fix up the places in the send_request path that do want a disconnect. For example, when ib_post_send or ib_post_recv fail, this is a sign that there is a send or receive queue resource miscalculation. That should be rare, and is a sign of a software bug. But xprtrdma can recover: disconnect to reset the transport and start over.

Signed-off-by: Chuck Lever <[email protected]>
Tested-by: Steve Wise <[email protected]>
Signed-off-by: Anna Schumaker <[email protected]>
1 parent 3d4cf35 commit 7a89f9c

File tree

5 files changed

+39
-24
lines changed

5 files changed

+39
-24
lines changed

net/sunrpc/xprtrdma/fmr_ops.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
219219
rpcrdma_defer_mr_recovery(mw);
220220
mw = rpcrdma_get_mw(r_xprt);
221221
if (!mw)
222-
return -ENOMEM;
222+
return -ENOBUFS;
223223

224224
pageoff = offset_in_page(seg1->mr_offset);
225225
seg1->mr_offset -= pageoff; /* start of page */
@@ -269,14 +269,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
269269
pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
270270
mw->mw_sg, mw->mw_nents);
271271
rpcrdma_defer_mr_recovery(mw);
272-
return -ENOMEM;
272+
return -EIO;
273273

274274
out_maperr:
275275
pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
276276
len, (unsigned long long)dma_pages[0],
277277
pageoff, mw->mw_nents, rc);
278278
rpcrdma_defer_mr_recovery(mw);
279-
return rc;
279+
return -EIO;
280280
}
281281

282282
/* Invalidate all memory regions that were registered for "req".

net/sunrpc/xprtrdma/frwr_ops.c

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
382382
rpcrdma_defer_mr_recovery(mw);
383383
mw = rpcrdma_get_mw(r_xprt);
384384
if (!mw)
385-
return -ENOMEM;
385+
return -ENOBUFS;
386386
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
387387
frmr = &mw->frmr;
388388
frmr->fr_state = FRMR_IS_VALID;
@@ -456,18 +456,18 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
456456
pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
457457
mw->mw_sg, mw->mw_nents);
458458
rpcrdma_defer_mr_recovery(mw);
459-
return -ENOMEM;
459+
return -EIO;
460460

461461
out_mapmr_err:
462462
pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
463463
frmr->fr_mr, n, mw->mw_nents);
464-
rc = n < 0 ? n : -EIO;
465464
rpcrdma_defer_mr_recovery(mw);
466-
return rc;
465+
return -EIO;
467466

468467
out_senderr:
468+
pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
469469
rpcrdma_defer_mr_recovery(mw);
470-
return rc;
470+
return -ENOTCONN;
471471
}
472472

473473
static struct ib_send_wr *
@@ -569,7 +569,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
569569
return;
570570

571571
reset_mrs:
572-
pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
572+
pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
573+
rdma_disconnect(ia->ri_id);
573574

574575
/* Find and reset the MRs in the LOCAL_INV WRs that did not
575576
* get posted. This is synchronous, and slow.

net/sunrpc/xprtrdma/rpc_rdma.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
251251
/* alloc the pagelist for receiving buffer */
252252
ppages[p] = alloc_page(GFP_ATOMIC);
253253
if (!ppages[p])
254-
return -ENOMEM;
254+
return -EAGAIN;
255255
}
256256
seg[n].mr_page = ppages[p];
257257
seg[n].mr_offset = (void *)(unsigned long) page_base;

net/sunrpc/xprtrdma/transport.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
558558

559559
out_fail:
560560
rpcrdma_buffer_put(req);
561-
r_xprt->rx_stats.failed_marshal_count++;
562561
return NULL;
563562
}
564563

@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
590589
rpcrdma_buffer_put(req);
591590
}
592591

593-
/*
592+
/**
593+
* xprt_rdma_send_request - marshal and send an RPC request
594+
* @task: RPC task with an RPC message in rq_snd_buf
595+
*
596+
* Return values:
597+
* 0: The request has been sent
598+
* ENOTCONN: Caller needs to invoke connect logic then call again
599+
* ENOBUFS: Call again later to send the request
600+
* EIO: A permanent error occurred. The request was not sent,
601+
* and don't try it again
602+
*
594603
* send_request invokes the meat of RPC RDMA. It must do the following:
604+
*
595605
* 1. Marshal the RPC request into an RPC RDMA request, which means
596606
* putting a header in front of data, and creating IOVs for RDMA
597607
* from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
600610
* the request (rpcrdma_ep_post).
601611
* 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
602612
*/
603-
604613
static int
605614
xprt_rdma_send_request(struct rpc_task *task)
606615
{
@@ -630,11 +639,12 @@ xprt_rdma_send_request(struct rpc_task *task)
630639
return 0;
631640

632641
failed_marshal:
633-
r_xprt->rx_stats.failed_marshal_count++;
634642
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
635643
__func__, rc);
636644
if (rc == -EIO)
637-
return -EIO;
645+
r_xprt->rx_stats.failed_marshal_count++;
646+
if (rc != -ENOTCONN)
647+
return rc;
638648
drop_connection:
639649
xprt_disconnect_done(xprt);
640650
return -ENOTCONN; /* implies disconnect */

net/sunrpc/xprtrdma/verbs.c

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,7 +1151,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
11511151
if (rep) {
11521152
rc = rpcrdma_ep_post_recv(ia, ep, rep);
11531153
if (rc)
1154-
goto out;
1154+
return rc;
11551155
req->rl_reply = NULL;
11561156
}
11571157

@@ -1176,10 +1176,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
11761176

11771177
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
11781178
if (rc)
1179-
dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1180-
rc);
1181-
out:
1182-
return rc;
1179+
goto out_postsend_err;
1180+
return 0;
1181+
1182+
out_postsend_err:
1183+
pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
1184+
return -ENOTCONN;
11831185
}
11841186

11851187
/*
@@ -1204,11 +1206,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
12041206
DMA_BIDIRECTIONAL);
12051207

12061208
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1207-
12081209
if (rc)
1209-
dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1210-
rc);
1211-
return rc;
1210+
goto out_postrecv;
1211+
return 0;
1212+
1213+
out_postrecv:
1214+
pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
1215+
return -ENOTCONN;
12121216
}
12131217

12141218
/**

0 commit comments

Comments
 (0)