Skip to content

Commit 401910d

Browse files
sowminiv authored and davem330 committed
rds: deliver zerocopy completion notification with data
This commit is an optimization over commit 01883ed ("rds: support for zcopy completion notification") for PF_RDS sockets.

RDS applications are predominantly request-response transactions, so it is more efficient to reduce the number of system calls and have zerocopy completion notification delivered as ancillary data on the POLLIN channel.

Cookies are passed up as ancillary data (at level SOL_RDS) in a struct rds_zcopy_cookies when the returned value of recvmsg() is greater than, or equal to, 0. A max of RDS_MAX_ZCOOKIES may be passed with each message.

This commit removes support for zerocopy completion notification on MSG_ERRQUEUE for PF_RDS sockets.

Signed-off-by: Sowmini Varadhan <[email protected]>
Acked-by: Willem de Bruijn <[email protected]>
Acked-by: Santosh Shilimkar <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 67490e3 commit 401910d

File tree

6 files changed

+60
-27
lines changed

6 files changed

+60
-27
lines changed

include/uapi/linux/errqueue.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,11 @@ struct sock_extended_err {
2020
#define SO_EE_ORIGIN_ICMP6 3
2121
#define SO_EE_ORIGIN_TXSTATUS 4
2222
#define SO_EE_ORIGIN_ZEROCOPY 5
23-
#define SO_EE_ORIGIN_ZCOOKIE 6
2423
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS
2524

2625
#define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1))
2726

2827
#define SO_EE_CODE_ZEROCOPY_COPIED 1
29-
#define SO_EE_ORIGIN_MAX_ZCOOKIES 8
3028

3129
/**
3230
* struct scm_timestamping - timestamps exposed through cmsg

include/uapi/linux/rds.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
105105
#define RDS_CMSG_RXPATH_LATENCY 11
106106
#define RDS_CMSG_ZCOPY_COOKIE 12
107+
#define RDS_CMSG_ZCOPY_COMPLETION 13
107108

108109
#define RDS_INFO_FIRST 10000
109110
#define RDS_INFO_COUNTERS 10000
@@ -317,6 +318,12 @@ struct rds_rdma_notify {
317318
#define RDS_RDMA_DROPPED 3
318319
#define RDS_RDMA_OTHER_ERROR 4
319320

321+
#define RDS_MAX_ZCOOKIES 8
322+
struct rds_zcopy_cookies {
323+
__u32 num;
324+
__u32 cookies[RDS_MAX_ZCOOKIES];
325+
};
326+
320327
/*
321328
* Common set of flags for all RDMA related structs
322329
*/

net/rds/af_rds.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ static int rds_release(struct socket *sock)
7777
rds_send_drop_to(rs, NULL);
7878
rds_rdma_drop_keys(rs);
7979
rds_notify_queue_get(rs, NULL);
80+
__skb_queue_purge(&rs->rs_zcookie_queue);
8081

8182
spin_lock_bh(&rds_sock_lock);
8283
list_del_init(&rs->rs_item);
@@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
144145
* - to signal that a previously congested destination may have become
145146
* uncongested
146147
* - A notification has been queued to the socket (this can be a congestion
147-
* update, or a RDMA completion).
148+
* update, or a RDMA completion, or a MSG_ZEROCOPY completion).
148149
*
149150
* EPOLLOUT is asserted if there is room on the send queue. This does not mean
150151
* however, that the next sendmsg() call will succeed. If the application tries
@@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
178179
spin_unlock(&rs->rs_lock);
179180
}
180181
if (!list_empty(&rs->rs_recv_queue) ||
181-
!list_empty(&rs->rs_notify_queue))
182+
!list_empty(&rs->rs_notify_queue) ||
183+
!skb_queue_empty(&rs->rs_zcookie_queue))
182184
mask |= (EPOLLIN | EPOLLRDNORM);
183185
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
184186
mask |= (EPOLLOUT | EPOLLWRNORM);
@@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
513515
INIT_LIST_HEAD(&rs->rs_recv_queue);
514516
INIT_LIST_HEAD(&rs->rs_notify_queue);
515517
INIT_LIST_HEAD(&rs->rs_cong_list);
518+
skb_queue_head_init(&rs->rs_zcookie_queue);
516519
spin_lock_init(&rs->rs_rdma_lock);
517520
rs->rs_rdma_keys = RB_ROOT;
518521
rs->rs_rx_traces = 0;

net/rds/message.c

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -58,55 +58,46 @@ EXPORT_SYMBOL_GPL(rds_message_addref);
5858

5959
static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
6060
{
61-
struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
62-
int ncookies;
63-
u32 *ptr;
61+
struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb;
62+
int ncookies = ck->num;
6463

65-
if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
64+
if (ncookies == RDS_MAX_ZCOOKIES)
6665
return false;
67-
ncookies = serr->ee.ee_data;
68-
if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
69-
return false;
70-
ptr = skb_put(skb, sizeof(u32));
71-
*ptr = cookie;
72-
serr->ee.ee_data = ++ncookies;
66+
ck->cookies[ncookies] = cookie;
67+
ck->num = ++ncookies;
7368
return true;
7469
}
7570

7671
static void rds_rm_zerocopy_callback(struct rds_sock *rs,
7772
struct rds_znotifier *znotif)
7873
{
79-
struct sock *sk = rds_rs_to_sk(rs);
8074
struct sk_buff *skb, *tail;
81-
struct sock_exterr_skb *serr;
8275
unsigned long flags;
8376
struct sk_buff_head *q;
8477
u32 cookie = znotif->z_cookie;
78+
struct rds_zcopy_cookies *ck;
8579

86-
q = &sk->sk_error_queue;
80+
q = &rs->rs_zcookie_queue;
8781
spin_lock_irqsave(&q->lock, flags);
8882
tail = skb_peek_tail(q);
8983

9084
if (tail && skb_zcookie_add(tail, cookie)) {
9185
spin_unlock_irqrestore(&q->lock, flags);
9286
mm_unaccount_pinned_pages(&znotif->z_mmp);
9387
consume_skb(rds_skb_from_znotifier(znotif));
94-
sk->sk_error_report(sk);
88+
/* caller invokes rds_wake_sk_sleep() */
9589
return;
9690
}
9791

9892
skb = rds_skb_from_znotifier(znotif);
99-
serr = SKB_EXT_ERR(skb);
100-
memset(&serr->ee, 0, sizeof(serr->ee));
101-
serr->ee.ee_errno = 0;
102-
serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
103-
serr->ee.ee_info = 0;
93+
ck = (struct rds_zcopy_cookies *)skb->cb;
94+
memset(ck, 0, sizeof(*ck));
10495
WARN_ON(!skb_zcookie_add(skb, cookie));
10596

10697
__skb_queue_tail(q, skb);
10798

10899
spin_unlock_irqrestore(&q->lock, flags);
109-
sk->sk_error_report(sk);
100+
/* caller invokes rds_wake_sk_sleep() */
110101

111102
mm_unaccount_pinned_pages(&znotif->z_mmp);
112103
}
@@ -129,6 +120,7 @@ static void rds_message_purge(struct rds_message *rm)
129120
if (rm->data.op_mmp_znotifier) {
130121
zcopy = true;
131122
rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
123+
rds_wake_sk_sleep(rs);
132124
rm->data.op_mmp_znotifier = NULL;
133125
}
134126
sock_put(rds_rs_to_sk(rs));
@@ -362,10 +354,12 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
362354
int total_copied = 0;
363355
struct sk_buff *skb;
364356

365-
skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
366-
GFP_KERNEL);
357+
skb = alloc_skb(0, GFP_KERNEL);
367358
if (!skb)
368359
return -ENOMEM;
360+
BUILD_BUG_ON(sizeof(skb->cb) <
361+
max_t(int, sizeof(struct rds_znotifier),
362+
sizeof(struct rds_zcopy_cookies)));
369363
rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
370364
if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
371365
length)) {

net/rds/rds.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,8 @@ struct rds_sock {
603603
/* Socket receive path trace points*/
604604
u8 rs_rx_traces;
605605
u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
606+
607+
struct sk_buff_head rs_zcookie_queue;
606608
};
607609

608610
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)

net/rds/recv.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,32 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
577577
return ret;
578578
}
579579

580+
static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
581+
{
582+
struct sk_buff *skb;
583+
struct sk_buff_head *q = &rs->rs_zcookie_queue;
584+
struct rds_zcopy_cookies *done;
585+
586+
if (!msg->msg_control)
587+
return false;
588+
589+
if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
590+
msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
591+
return false;
592+
593+
skb = skb_dequeue(q);
594+
if (!skb)
595+
return false;
596+
done = (struct rds_zcopy_cookies *)skb->cb;
597+
if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
598+
done)) {
599+
skb_queue_head(q, skb);
600+
return false;
601+
}
602+
consume_skb(skb);
603+
return true;
604+
}
605+
580606
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
581607
int msg_flags)
582608
{
@@ -611,7 +637,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
611637

612638
if (!rds_next_incoming(rs, &inc)) {
613639
if (nonblock) {
614-
ret = -EAGAIN;
640+
bool reaped = rds_recvmsg_zcookie(rs, msg);
641+
642+
ret = reaped ? 0 : -EAGAIN;
615643
break;
616644
}
617645

@@ -660,6 +688,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
660688
ret = -EFAULT;
661689
goto out;
662690
}
691+
rds_recvmsg_zcookie(rs, msg);
663692

664693
rds_stats_inc(s_recv_delivered);
665694

0 commit comments

Comments
 (0)