
Commit 7d81ee8

svcrdma: Single-stage RDMA Read
Currently the generic RPC server layer calls svc_rdma_recvfrom() twice to retrieve an RPC message that uses Read chunks. I'm not exactly sure why this design was chosen originally.

Instead, let's wait for the Read chunk completion inline in the first call to svc_rdma_recvfrom().

The goal is to eliminate some page allocator churn. rdma_read_complete() replaces pages in the second svc_rqst by calling put_page() repeatedly while the upper layer waits for the request to be constructed, which adds unnecessary NFS WRITE round-trip latency.

Signed-off-by: Chuck Lever <[email protected]>
Reviewed-by: Tom Talpey <[email protected]>
1 parent 82011c8 commit 7d81ee8
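
The mechanical heart of the change is that the Read completion handler now only records the work-completion status and signals a completion that the posting thread waits on. A minimal sketch of that wait-inline pattern, using the kernel's completion API; the demo_* struct and functions below are illustrative stand-ins, not the actual svcrdma symbols:

	#include <linux/completion.h>
	#include <linux/errno.h>
	#include <rdma/ib_verbs.h>

	/* Illustrative chunk context: a status field plus a completion,
	 * mirroring what this patch adds to struct svc_rdma_chunk_ctxt.
	 */
	struct demo_chunk_ctxt {
		struct ib_cqe		cc_cqe;
		enum ib_wc_status	cc_status;
		struct completion	cc_done;
	};

	/* CQ handler: record the status and wake the waiting thread. */
	static void demo_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct demo_chunk_ctxt *cc =
			container_of(wc->wr_cqe, struct demo_chunk_ctxt, cc_cqe);

		cc->cc_status = wc->status;
		complete(&cc->cc_done);
	}

	/* Posting side: initialize the completion, post the Read WRs via the
	 * caller-supplied @post callback, then block until they complete,
	 * instead of returning and re-entering recvfrom a second time.
	 */
	static int demo_post_and_wait(struct demo_chunk_ctxt *cc,
				      int (*post)(struct demo_chunk_ctxt *cc))
	{
		int ret;

		cc->cc_cqe.done = demo_wc_read_done;
		init_completion(&cc->cc_done);
		ret = post(cc);
		if (ret < 0)
			return ret;

		wait_for_completion(&cc->cc_done);
		return cc->cc_status == IB_WC_SUCCESS ? 1 : -EIO;
	}

Note that svc_rdma_recvfrom() now calls svc_xprt_received() before this wait (first hunk of the recvfrom.c diff below), so the transport can accept the next Receive while this thread blocks.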

2 files changed: 37 insertions(+), 67 deletions(-)

net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
Lines changed: 4 additions & 6 deletions

@@ -853,6 +853,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
 	percpu_counter_inc(&svcrdma_stat_recv);
 
+	/* Unblock the transport for the next receive */
+	svc_xprt_received(xprt);
+
 	ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
 			ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
 			DMA_FROM_DEVICE);
@@ -884,33 +887,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 	rqstp->rq_xprt_ctxt = ctxt;
 	rqstp->rq_prot = IPPROTO_MAX;
 	svc_xprt_copy_addrs(rqstp, xprt);
-	svc_xprt_received(xprt);
 	return rqstp->rq_arg.len;
 
 out_readlist:
 	ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
 	if (ret < 0)
 		goto out_readfail;
-	svc_xprt_received(xprt);
-	return 0;
+	goto complete;
 
 out_err:
 	svc_rdma_send_error(rdma_xprt, ctxt, ret);
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
-	svc_xprt_received(xprt);
 	return 0;
 
 out_readfail:
 	if (ret == -EINVAL)
 		svc_rdma_send_error(rdma_xprt, ctxt, ret);
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
-	svc_xprt_received(xprt);
 	return ret;
 
 out_backchannel:
 	svc_rdma_handle_bc_reply(rqstp, ctxt);
 out_drop:
 	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
-	svc_xprt_received(xprt);
 	return 0;
 }

net/sunrpc/xprtrdma/svc_rdma_rw.c
Lines changed: 33 additions & 61 deletions

@@ -150,6 +150,8 @@ struct svc_rdma_chunk_ctxt {
 	struct svcxprt_rdma	*cc_rdma;
 	struct list_head	cc_rwctxts;
 	int			cc_sqecount;
+	enum ib_wc_status	cc_status;
+	struct completion	cc_done;
 };
 
 static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
@@ -299,29 +301,15 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
 	struct svc_rdma_chunk_ctxt *cc =
			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
 	struct svcxprt_rdma *rdma = cc->cc_rdma;
-	struct svc_rdma_read_info *info =
-			container_of(cc, struct svc_rdma_read_info, ri_cc);
 
 	trace_svcrdma_wc_read(wc, &cc->cc_cid);
 
 	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
 	wake_up(&rdma->sc_send_wait);
 
-	if (unlikely(wc->status != IB_WC_SUCCESS)) {
-		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
-		svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
-	} else {
-		spin_lock(&rdma->sc_rq_dto_lock);
-		list_add_tail(&info->ri_readctxt->rc_list,
-			      &rdma->sc_read_complete_q);
-		/* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
-		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
-		spin_unlock(&rdma->sc_rq_dto_lock);
-
-		svc_xprt_enqueue(&rdma->sc_xprt);
-	}
-
-	svc_rdma_read_info_free(info);
+	cc->cc_status = wc->status;
+	complete(&cc->cc_done);
+	return;
 }
 
 /* This function sleeps when the transport's Send Queue is congested.
@@ -676,8 +664,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
 	struct svc_rqst *rqstp = info->ri_rqst;
-	struct svc_rdma_rw_ctxt *ctxt;
 	unsigned int sge_no, seg_len, len;
+	struct svc_rdma_rw_ctxt *ctxt;
 	struct scatterlist *sg;
 	int ret;
 
@@ -693,8 +681,6 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
 		seg_len = min_t(unsigned int, len,
				PAGE_SIZE - info->ri_pageoff);
 
-		head->rc_arg.pages[info->ri_pageno] =
-			rqstp->rq_pages[info->ri_pageno];
 		if (!info->ri_pageoff)
 			head->rc_page_count++;
 
@@ -788,12 +774,10 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
 		page_len = min_t(unsigned int, remaining,
				 PAGE_SIZE - info->ri_pageoff);
 
-		head->rc_arg.pages[info->ri_pageno] =
-			rqstp->rq_pages[info->ri_pageno];
 		if (!info->ri_pageoff)
 			head->rc_page_count++;
 
-		dst = page_address(head->rc_arg.pages[info->ri_pageno]);
+		dst = page_address(rqstp->rq_pages[info->ri_pageno]);
 		memcpy(dst + info->ri_pageno, src + offset, page_len);
 
 		info->ri_totalbytes += page_len;
@@ -813,7 +797,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
  * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
  * @info: context for RDMA Reads
  *
- * The chunk data lands in head->rc_arg as a series of contiguous pages,
+ * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
  * like an incoming TCP call.
  *
  * Return values:
@@ -827,8 +811,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
 {
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
 	const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
+	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
 	struct svc_rdma_chunk *chunk, *next;
-	struct xdr_buf *buf = &head->rc_arg;
 	unsigned int start, length;
 	int ret;
 
@@ -864,9 +848,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
 	buf->len += info->ri_totalbytes;
 	buf->buflen += info->ri_totalbytes;
 
-	head->rc_hdr_count = 1;
-	buf->head[0].iov_base = page_address(head->rc_pages[0]);
+	buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
 	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+	buf->pages = &info->ri_rqst->rq_pages[1];
 	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
 	return 0;
 }
@@ -875,9 +859,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
  * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
  * @info: context for RDMA Reads
 *
- * The chunk data lands in the page list of head->rc_arg.pages.
+ * The chunk data lands in the page list of rqstp->rq_arg.pages.
 *
- * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
+ * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
  * Therefore, XDR round-up of the Read chunk and trailing
  * inline content must both be added at the end of the pagelist.
 *
@@ -891,7 +875,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *info)
 static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
 {
 	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
-	struct xdr_buf *buf = &head->rc_arg;
+	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
 	struct svc_rdma_chunk *chunk;
 	unsigned int length;
 	int ret;
@@ -901,8 +885,6 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
 	if (ret < 0)
 		goto out;
 
-	head->rc_hdr_count = 0;
-
 	/* Split the Receive buffer between the head and tail
 	 * buffers at Read chunk's position. XDR roundup of the
 	 * chunk is not included in either the pagelist or in
@@ -921,7 +903,8 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
 	 * Currently these chunks always start at page offset 0,
 	 * thus the rounded-up length never crosses a page boundary.
 	 */
-	length = XDR_QUADLEN(info->ri_totalbytes) << 2;
+	buf->pages = &info->ri_rqst->rq_pages[0];
+	length = xdr_align_size(chunk->ch_length);
 	buf->page_len = length;
 	buf->len += length;
 	buf->buflen += length;
@@ -1033,8 +1016,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
  * @info: context for RDMA Reads
 *
  * The start of the data lands in the first page just after the
- * Transport header, and the rest lands in the page list of
- * head->rc_arg.pages.
+ * Transport header, and the rest lands in rqstp->rq_arg.pages.
 *
  * Assumptions:
  *	- A PZRC is never sent in an RDMA_MSG message, though it's
@@ -1049,8 +1031,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
  */
 static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
 {
-	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
-	struct xdr_buf *buf = &head->rc_arg;
+	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
 	int ret;
 
 	ret = svc_rdma_read_call_chunk(info);
@@ -1060,35 +1041,15 @@ static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
 	buf->len += info->ri_totalbytes;
 	buf->buflen += info->ri_totalbytes;
 
-	head->rc_hdr_count = 1;
-	buf->head[0].iov_base = page_address(head->rc_pages[0]);
+	buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
 	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
+	buf->pages = &info->ri_rqst->rq_pages[1];
 	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
 
 out:
 	return ret;
 }
 
-/* Pages under I/O have been copied to head->rc_pages. Ensure they
- * are not released by svc_xprt_release() until the I/O is complete.
- *
- * This has to be done after all Read WRs are constructed to properly
- * handle a page that is part of I/O on behalf of two different RDMA
- * segments.
- *
- * Do this only if I/O has been posted. Otherwise, we do indeed want
- * svc_xprt_release() to clean things up properly.
- */
-static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
-				   const unsigned int start,
-				   const unsigned int num_pages)
-{
-	unsigned int i;
-
-	for (i = start; i < num_pages + start; i++)
-		rqstp->rq_pages[i] = NULL;
-}
-
 /**
  * svc_rdma_process_read_list - Pull list of Read chunks from the client
  * @rdma: controlling RDMA transport
@@ -1153,11 +1114,22 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
 		goto out_err;
 
 	trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
+	init_completion(&cc->cc_done);
 	ret = svc_rdma_post_chunk_ctxt(cc);
 	if (ret < 0)
 		goto out_err;
-	svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
-	return 1;
+
+	ret = 1;
+	wait_for_completion(&cc->cc_done);
+	if (cc->cc_status != IB_WC_SUCCESS)
+		ret = -EIO;
+
+	/* rq_respages starts after the last arg page */
+	rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+	/* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
+	head->rc_page_count = 0;
 
 out_err:
 	svc_rdma_read_info_free(info);
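
Several hunks above repoint the assembled xdr_buf at the rqstp's own page array (rq_pages) rather than at the receive context's rc_arg/rc_pages. A minimal sketch of the resulting rq_arg layout once the Read payload has landed; demo_map_read_result() is a hypothetical helper that mirrors what svc_rdma_read_multiple_chunks() and svc_rdma_read_special() now do, with @total standing in for info->ri_totalbytes:

	#include <linux/mm.h>
	#include <linux/sunrpc/svc.h>

	/* Hypothetical helper: point rq_arg's head kvec at the first
	 * receive page and its page list at the remaining pages that the
	 * RDMA Reads just filled.
	 */
	static void demo_map_read_result(struct svc_rqst *rqstp, size_t total)
	{
		struct xdr_buf *buf = &rqstp->rq_arg;

		buf->head[0].iov_base = page_address(rqstp->rq_pages[0]);
		buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, total);
		buf->pages = &rqstp->rq_pages[1];
		buf->page_len = total - buf->head[0].iov_len;
		buf->len = total;
		buf->buflen = total;
	}

Because the Read I/O now finishes before svc_rdma_recvfrom() returns, the pages no longer need to be protected from svc_xprt_release() across a second recvfrom pass, which appears to be why svc_rdma_save_io_pages() can be removed and head->rc_page_count cleared once the wait completes.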
