Skip to content

Commit 4ab6c99

Browse files
wdebruijdavem330
authored and committed
sock: MSG_ZEROCOPY notification coalescing
In the simple case, each sendmsg() call generates data and eventually a zerocopy ready notification N, where N indicates the Nth successful invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket. TCP and corked sockets can cause send() calls to append new data to an existing sk_buff and, thus, ubuf_info. In that case the notification must hold a range. Modify ubuf_info to store an inclusive range [N..N+m] and add sock_zerocopy_realloc() to optionally extend an existing range. Also coalesce notifications in this common case: if a notification [1, 1] is about to be queued while [0, 0] is the queue tail, just modify the tail of the queue to read [0, 1]. Coalescing is limited to a few TSO frames worth of data to bound notification latency. Signed-off-by: Willem de Bruijn <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 1f8b977 commit 4ab6c99

File tree

2 files changed

+106
-10
lines changed

2 files changed

+106
-10
lines changed

include/linux/skbuff.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -444,15 +444,26 @@ enum {
444444
*/
445445
struct ubuf_info {
446446
void (*callback)(struct ubuf_info *, bool zerocopy_success);
447-
void *ctx;
448-
unsigned long desc;
449-
u16 zerocopy:1;
447+
union {
448+
struct {
449+
unsigned long desc;
450+
void *ctx;
451+
};
452+
struct {
453+
u32 id;
454+
u16 len;
455+
u16 zerocopy:1;
456+
u32 bytelen;
457+
};
458+
};
450459
atomic_t refcnt;
451460
};
452461

453462
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
454463

455464
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
465+
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
466+
struct ubuf_info *uarg);
456467

457468
static inline void sock_zerocopy_get(struct ubuf_info *uarg)
458469
{

net/core/skbuff.c

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -915,7 +915,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
915915
uarg = (void *)skb->cb;
916916

917917
uarg->callback = sock_zerocopy_callback;
918-
uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
918+
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
919+
uarg->len = 1;
920+
uarg->bytelen = size;
919921
uarg->zerocopy = 1;
920922
atomic_set(&uarg->refcnt, 0);
921923
sock_hold(sk);
@@ -929,26 +931,101 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
929931
return container_of((void *)uarg, struct sk_buff, cb);
930932
}
931933

934+
/*
 * sock_zerocopy_realloc - extend an existing zerocopy notification range,
 * or fall back to allocating a fresh one.
 *
 * @sk:   socket the send is issued on
 * @size: number of bytes added by this send call
 * @uarg: notification state to extend, or NULL to always allocate
 *
 * Returns @uarg when its range was grown by one, a new ubuf_info from
 * sock_zerocopy_alloc() when extension is not possible, or NULL when the
 * socket is not owned by the caller or a non-stream socket hit a limit.
 */
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
					struct ubuf_info *uarg)
{
	const u32 max_bytes = 1 << 19;	/* limit to a few TSO */
	u32 total, zckey;

	if (!uarg)
		return sock_zerocopy_alloc(sk, size);

	/* Extending is only valid while the socket is locked (TCP,
	 * UDP cork): that is what serializes access to uarg->len
	 * and sk_zckey.
	 */
	if (!sock_owned_by_user(sk)) {
		WARN_ON_ONCE(1);
		return NULL;
	}

	total = uarg->bytelen + size;
	if (uarg->len == USHRT_MAX - 1 || total > max_bytes) {
		/* Only TCP can create a new skb to attach a new uarg */
		if (sk->sk_type != SOCK_STREAM)
			return NULL;
		return sock_zerocopy_alloc(sk, size);
	}

	/* The range can only grow if it ends right at the next id */
	zckey = (u32)atomic_read(&sk->sk_zckey);
	if ((u32)(uarg->id + uarg->len) != zckey)
		return sock_zerocopy_alloc(sk, size);

	uarg->len++;
	uarg->bytelen = total;
	atomic_set(&sk->sk_zckey, zckey + 1);
	return uarg;
}
EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
970+
971+
/*
 * Try to merge the notification range starting at @lo and spanning @len
 * ids into the range already stored in @skb's extended error block
 * (ee_info holds the low id, ee_data the high id).
 *
 * Returns true and bumps ee_data by @len on success. Returns false,
 * leaving @skb untouched, when the merged range would span 2^32 ids or
 * more, or when @lo does not directly follow the stored high id.
 */
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	const u32 first = serr->ee.ee_info;
	const u32 last = serr->ee.ee_data;
	/* 64-bit sum so the length check itself cannot wrap */
	const u64 merged = last - first + 1ULL + len;

	if (merged >= (1ULL << 32) || lo != last + 1)
		return false;

	serr->ee.ee_data += len;
	return true;
}
990+
932991
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
933992
{
934-
struct sk_buff *skb = skb_from_uarg(uarg);
993+
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
935994
struct sock_exterr_skb *serr;
936995
struct sock *sk = skb->sk;
937-
u16 id = uarg->desc;
996+
struct sk_buff_head *q;
997+
unsigned long flags;
998+
u32 lo, hi;
999+
u16 len;
9381000

939-
if (sock_flag(sk, SOCK_DEAD))
1001+
/* if !len, there was only 1 call, and it was aborted
1002+
* so do not queue a completion notification
1003+
*/
1004+
if (!uarg->len || sock_flag(sk, SOCK_DEAD))
9401005
goto release;
9411006

1007+
len = uarg->len;
1008+
lo = uarg->id;
1009+
hi = uarg->id + len - 1;
1010+
9421011
serr = SKB_EXT_ERR(skb);
9431012
memset(serr, 0, sizeof(*serr));
9441013
serr->ee.ee_errno = 0;
9451014
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
946-
serr->ee.ee_data = id;
1015+
serr->ee.ee_data = hi;
1016+
serr->ee.ee_info = lo;
9471017
if (!success)
9481018
serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
9491019

950-
skb_queue_tail(&sk->sk_error_queue, skb);
951-
skb = NULL;
1020+
q = &sk->sk_error_queue;
1021+
spin_lock_irqsave(&q->lock, flags);
1022+
tail = skb_peek_tail(q);
1023+
if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
1024+
!skb_zerocopy_notify_extend(tail, lo, len)) {
1025+
__skb_queue_tail(q, skb);
1026+
skb = NULL;
1027+
}
1028+
spin_unlock_irqrestore(&q->lock, flags);
9521029

9531030
sk->sk_error_report(sk);
9541031

@@ -975,6 +1052,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
9751052
struct sock *sk = skb_from_uarg(uarg)->sk;
9761053

9771054
atomic_dec(&sk->sk_zckey);
1055+
uarg->len--;
9781056

9791057
/* sock_zerocopy_put expects a ref. Most sockets take one per
9801058
* skb, which is zero on abort. tcp_sendmsg holds one extra, to
@@ -995,9 +1073,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
9951073
struct msghdr *msg, int len,
9961074
struct ubuf_info *uarg)
9971075
{
1076+
struct ubuf_info *orig_uarg = skb_zcopy(skb);
9981077
struct iov_iter orig_iter = msg->msg_iter;
9991078
int err, orig_len = skb->len;
10001079

1080+
/* An skb can only point to one uarg. This edge case happens when
1081+
* TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
1082+
*/
1083+
if (orig_uarg && uarg != orig_uarg)
1084+
return -EEXIST;
1085+
10011086
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
10021087
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
10031088
/* Streams do not free skb on error. Reset to prev state. */

0 commit comments

Comments
 (0)