Skip to content

Commit 3f8e0aa

Browse files
Paolo Abeni authored and davem330 committed
mptcp: rework mptcp_sendmsg_frag to accept optional dfrag
This will simplify mptcp-level retransmission implementation in the next patch. If dfrag is provided by the caller, skip kernel space memory allocation and use data and metadata provided by the dfrag itself. Because a peer could ack data at TCP level but refrain from sending mptcp-level ACKs, we could grow the mptcp socket backlog indefinitely. We should thus block mptcp_sendmsg until the peer has acked some of the sent data. In order to be able to do so, increment the mptcp socket wmem_queued counter on memory allocation and decrement it when releasing the memory on mptcp-level ack reception. Because TCP performs sndbuf auto-tuning up to tcp_wmem_max[2], make this the mptcp sk_sndbuf limit. In the future we could experiment with autotuning as TCP does in tcp_sndbuf_expand(). v2 -> v3: - remove 'inline' in foo.c files (David S. Miller) Co-developed-by: Florian Westphal <[email protected]> Signed-off-by: Florian Westphal <[email protected]> Signed-off-by: Paolo Abeni <[email protected]> Signed-off-by: Mat Martineau <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 7948f6c commit 3f8e0aa

File tree

1 file changed

+74
-49
lines changed

1 file changed

+74
-49
lines changed

net/mptcp/protocol.c

Lines changed: 74 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -316,15 +316,15 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
316316
return NULL;
317317
}
318318

319-
static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
320-
const struct sk_buff *skb,
321-
const struct mptcp_ext *mpext)
319+
static bool mptcp_skb_can_collapse_to(u64 write_seq,
320+
const struct sk_buff *skb,
321+
const struct mptcp_ext *mpext)
322322
{
323323
if (!tcp_skb_can_collapse_to(skb))
324324
return false;
325325

326326
/* can collapse only if MPTCP level sequence is in order */
327-
return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
327+
return mpext && mpext->data_seq + mpext->data_len == write_seq;
328328
}
329329

330330
static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
@@ -417,23 +417,28 @@ mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
417417
}
418418

419419
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
420-
struct msghdr *msg, long *timeo, int *pmss_now,
420+
struct msghdr *msg, struct mptcp_data_frag *dfrag,
421+
long *timeo, int *pmss_now,
421422
int *ps_goal)
422423
{
423424
int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
424425
bool dfrag_collapsed, can_collapse = false;
425426
struct mptcp_sock *msk = mptcp_sk(sk);
426427
struct mptcp_ext *mpext = NULL;
427-
struct mptcp_data_frag *dfrag;
428+
bool retransmission = !!dfrag;
428429
struct sk_buff *skb, *tail;
429430
struct page_frag *pfrag;
431+
struct page *page;
432+
u64 *write_seq;
430433
size_t psize;
431434

432435
/* use the mptcp page cache so that we can easily move the data
433436
* from one substream to another, but do per subflow memory accounting
437+
* Note: pfrag is used only !retransmission, but the compiler if
438+
* fooled into a warning if we don't init here
434439
*/
435440
pfrag = sk_page_frag(sk);
436-
while (!mptcp_page_frag_refill(ssk, pfrag) ||
441+
while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
437442
!mptcp_ext_cache_refill(msk)) {
438443
ret = sk_stream_wait_memory(ssk, timeo);
439444
if (ret)
@@ -447,6 +452,13 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
447452
if (unlikely(__mptcp_needs_tcp_fallback(msk)))
448453
return 0;
449454
}
455+
if (!retransmission) {
456+
write_seq = &msk->write_seq;
457+
page = pfrag->page;
458+
} else {
459+
write_seq = &dfrag->data_seq;
460+
page = dfrag->page;
461+
}
450462

451463
/* compute copy limit */
452464
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
@@ -464,63 +476,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
464476
* SSN association set here
465477
*/
466478
can_collapse = (size_goal - skb->len > 0) &&
467-
mptcp_skb_can_collapse_to(msk, skb, mpext);
479+
mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
468480
if (!can_collapse)
469481
TCP_SKB_CB(skb)->eor = 1;
470482
else
471483
avail_size = size_goal - skb->len;
472484
}
473485

474-
/* reuse tail pfrag, if possible, or carve a new one from the page
475-
* allocator
476-
*/
477-
dfrag = mptcp_rtx_tail(sk);
478-
offset = pfrag->offset;
479-
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
480-
if (!dfrag_collapsed) {
481-
dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
486+
if (!retransmission) {
487+
/* reuse tail pfrag, if possible, or carve a new one from the
488+
* page allocator
489+
*/
490+
dfrag = mptcp_rtx_tail(sk);
491+
offset = pfrag->offset;
492+
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
493+
if (!dfrag_collapsed) {
494+
dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
495+
offset = dfrag->offset;
496+
frag_truesize = dfrag->overhead;
497+
}
498+
psize = min_t(size_t, pfrag->size - offset, avail_size);
499+
500+
/* Copy to page */
501+
pr_debug("left=%zu", msg_data_left(msg));
502+
psize = copy_page_from_iter(pfrag->page, offset,
503+
min_t(size_t, msg_data_left(msg),
504+
psize),
505+
&msg->msg_iter);
506+
pr_debug("left=%zu", msg_data_left(msg));
507+
if (!psize)
508+
return -EINVAL;
509+
510+
if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
511+
return -ENOMEM;
512+
} else {
482513
offset = dfrag->offset;
483-
frag_truesize = dfrag->overhead;
514+
psize = min_t(size_t, dfrag->data_len, avail_size);
484515
}
485-
psize = min_t(size_t, pfrag->size - offset, avail_size);
486-
487-
/* Copy to page */
488-
pr_debug("left=%zu", msg_data_left(msg));
489-
psize = copy_page_from_iter(pfrag->page, offset,
490-
min_t(size_t, msg_data_left(msg), psize),
491-
&msg->msg_iter);
492-
pr_debug("left=%zu", msg_data_left(msg));
493-
if (!psize)
494-
return -EINVAL;
495-
496-
if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
497-
return -ENOMEM;
498516

499517
/* tell the TCP stack to delay the push so that we can safely
500518
* access the skb after the sendpages call
501519
*/
502-
ret = do_tcp_sendpages(ssk, pfrag->page, offset, psize,
520+
ret = do_tcp_sendpages(ssk, page, offset, psize,
503521
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
504522
if (ret <= 0)
505523
return ret;
506524

507525
frag_truesize += ret;
508-
if (unlikely(ret < psize))
509-
iov_iter_revert(&msg->msg_iter, psize - ret);
526+
if (!retransmission) {
527+
if (unlikely(ret < psize))
528+
iov_iter_revert(&msg->msg_iter, psize - ret);
510529

511-
/* send successful, keep track of sent data for mptcp-level
512-
* retransmission
513-
*/
514-
dfrag->data_len += ret;
515-
if (!dfrag_collapsed) {
516-
get_page(dfrag->page);
517-
list_add_tail(&dfrag->list, &msk->rtx_queue);
518-
}
530+
/* send successful, keep track of sent data for mptcp-level
531+
* retransmission
532+
*/
533+
dfrag->data_len += ret;
534+
if (!dfrag_collapsed) {
535+
get_page(dfrag->page);
536+
list_add_tail(&dfrag->list, &msk->rtx_queue);
537+
sk_wmem_queued_add(sk, frag_truesize);
538+
} else {
539+
sk_wmem_queued_add(sk, ret);
540+
}
519541

520-
/* charge data on mptcp rtx queue to the master socket
521-
* Note: we charge such data both to sk and ssk
522-
*/
523-
sk->sk_forward_alloc -= frag_truesize;
542+
/* charge data on mptcp rtx queue to the master socket
543+
* Note: we charge such data both to sk and ssk
544+
*/
545+
sk->sk_forward_alloc -= frag_truesize;
546+
}
524547

525548
/* if the tail skb extension is still the cached one, collapsing
526549
* really happened. Note: we can't check for 'same skb' as the sk_buff
@@ -539,7 +562,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
539562
msk->cached_ext = NULL;
540563

541564
memset(mpext, 0, sizeof(*mpext));
542-
mpext->data_seq = msk->write_seq;
565+
mpext->data_seq = *write_seq;
543566
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
544567
mpext->data_len = ret;
545568
mpext->use_map = 1;
@@ -550,8 +573,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
550573
mpext->dsn64);
551574

552575
out:
553-
pfrag->offset += frag_truesize;
554-
msk->write_seq += ret;
576+
if (!retransmission)
577+
pfrag->offset += frag_truesize;
578+
*write_seq += ret;
555579
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
556580

557581
return ret;
@@ -663,7 +687,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
663687

664688
lock_sock(ssk);
665689
while (msg_data_left(msg)) {
666-
ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
690+
ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
667691
&size_goal);
668692
if (ret < 0)
669693
break;
@@ -974,6 +998,7 @@ static int mptcp_init_sock(struct sock *sk)
974998
return ret;
975999

9761000
sk_sockets_allocated_inc(sk);
1001+
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
9771002

9781003
if (!mptcp_is_enabled(sock_net(sk)))
9791004
return -ENOPROTOOPT;

0 commit comments

Comments
 (0)