
Commit 8795260

Paolo Abeni authored and kuba-moo committed
mptcp: protect the rx path with the msk socket spinlock
Such spinlock is currently used only to protect the 'owned' flag inside
the socket lock itself. With this patch, we extend its scope to protect
the whole msk receive path and sk_forward_memory.

Given the above, we can always move data into the msk receive queue (and
OoO queue) from the subflow.

We leverage the previous commit, so that we need to acquire the spinlock
in the tx path only when moving fwd memory.

recvmsg() must now explicitly acquire the socket spinlock when moving skbs
out of sk_receive_queue. To reduce the number of lock operations required
we use a second rx queue and splice the first into the latter in
mptcp_lock_sock(). Additionally rmem allocated memory is bulk-freed via
release_cb().

Acked-by: Florian Westphal <[email protected]>
Co-developed-by: Florian Westphal <[email protected]>
Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: Paolo Abeni <[email protected]>
Reviewed-by: Mat Martineau <[email protected]>
Signed-off-by: Jakub Kicinski <[email protected]>
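Note: the commit message describes a two-queue pattern: producers add data under a short spinlock-protected section, and the reader splices everything into a private queue it can then drain without holding the lock. The following is a minimal user-space sketch of that pattern, under assumed names (backlog, private_q, splice_and_drain); it is only an illustration of the technique, not the kernel implementation.

/* Two-queue splice sketch: producers append to a spinlock-protected
 * "backlog" list; the consumer takes the lock only long enough to
 * splice the backlog into a private queue, then drains it lock-free.
 * All names are illustrative; this is not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int val;
};

struct queue {
	struct node *head, *tail;
};

static pthread_spinlock_t backlog_lock;
static struct queue backlog;    /* written by producers, under the lock */
static struct queue private_q;  /* consumer-only, no locking needed */

static void q_append(struct queue *q, struct node *n)
{
	n->next = NULL;
	if (q->tail)
		q->tail->next = n;
	else
		q->head = n;
	q->tail = n;
}

/* Producer side: analogous to a subflow moving data into the msk
 * receive queue while holding the data lock.
 */
static void produce(int val)
{
	struct node *n = malloc(sizeof(*n));

	n->val = val;
	pthread_spin_lock(&backlog_lock);
	q_append(&backlog, n);
	pthread_spin_unlock(&backlog_lock);
}

/* Consumer side: one short critical section splices the whole backlog,
 * roughly what __mptcp_splice_receive_queue() does; draining then
 * proceeds with no lock held, like recvmsg() reading msk->receive_queue.
 */
static void splice_and_drain(void)
{
	struct queue q;

	pthread_spin_lock(&backlog_lock);
	q = backlog;
	backlog.head = backlog.tail = NULL;
	pthread_spin_unlock(&backlog_lock);

	if (q.head) {
		if (private_q.tail)
			private_q.tail->next = q.head;
		else
			private_q.head = q.head;
		private_q.tail = q.tail;
	}

	while (private_q.head) {
		struct node *n = private_q.head;

		private_q.head = n->next;
		printf("consumed %d\n", n->val);
		free(n);
	}
	private_q.tail = NULL;
}

int main(void)
{
	pthread_spin_init(&backlog_lock, PTHREAD_PROCESS_PRIVATE);
	for (int i = 0; i < 5; i++)
		produce(i);
	splice_and_drain();
	pthread_spin_destroy(&backlog_lock);
	return 0;
}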
1 parent e93da92 commit 8795260

2 files changed: +107 -47 lines changed

net/mptcp/protocol.c

Lines changed: 102 additions & 47 deletions
@@ -453,15 +453,15 @@ static bool mptcp_subflow_cleanup_rbuf(struct sock *ssk)
 
 static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
 {
+	struct sock *ack_hint = READ_ONCE(msk->ack_hint);
 	struct mptcp_subflow_context *subflow;
 
 	/* if the hinted ssk is still active, try to use it */
-	if (likely(msk->ack_hint)) {
+	if (likely(ack_hint)) {
 		mptcp_for_each_subflow(msk, subflow) {
 			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-			if (msk->ack_hint == ssk &&
-			    mptcp_subflow_cleanup_rbuf(ssk))
+			if (ack_hint == ssk && mptcp_subflow_cleanup_rbuf(ssk))
 				return;
 		}
 	}
@@ -614,13 +614,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 			break;
 		}
 	} while (more_data_avail);
-	msk->ack_hint = ssk;
+	WRITE_ONCE(msk->ack_hint, ssk);
 
 	*bytes += moved;
 	return done;
 }
 
-static bool mptcp_ofo_queue(struct mptcp_sock *msk)
+static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
 	struct sk_buff *skb, *tail;
@@ -666,34 +666,27 @@ static bool mptcp_ofo_queue(struct mptcp_sock *msk)
 /* In most cases we will be able to lock the mptcp socket. If its already
  * owned, we need to defer to the work queue to avoid ABBA deadlock.
  */
-static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+static void move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
 
-	if (READ_ONCE(sk->sk_lock.owned))
-		return false;
-
-	if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
-		return false;
-
-	/* must re-check after taking the lock */
-	if (!READ_ONCE(sk->sk_lock.owned)) {
-		__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
-		mptcp_ofo_queue(msk);
+	if (inet_sk_state_load(sk) == TCP_CLOSE)
+		return;
 
-		/* If the moves have caught up with the DATA_FIN sequence number
-		 * it's time to ack the DATA_FIN and change socket state, but
-		 * this is not a good place to change state. Let the workqueue
-		 * do it.
-		 */
-		if (mptcp_pending_data_fin(sk, NULL))
-			mptcp_schedule_work(sk);
-	}
+	mptcp_data_lock(sk);
 
-	spin_unlock_bh(&sk->sk_lock.slock);
+	__mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+	__mptcp_ofo_queue(msk);
 
-	return moved > 0;
+	/* If the moves have caught up with the DATA_FIN sequence number
+	 * it's time to ack the DATA_FIN and change socket state, but
+	 * this is not a good place to change state. Let the workqueue
+	 * do it.
+	 */
+	if (mptcp_pending_data_fin(sk, NULL))
+		mptcp_schedule_work(sk);
+	mptcp_data_unlock(sk);
 }
 
 void mptcp_data_ready(struct sock *sk, struct sock *ssk)
@@ -937,17 +930,30 @@ static bool mptcp_wmem_alloc(struct sock *sk, int size)
 	if (msk->wmem_reserved >= size)
 		goto account;
 
-	if (!sk_wmem_schedule(sk, size))
+	mptcp_data_lock(sk);
+	if (!sk_wmem_schedule(sk, size)) {
+		mptcp_data_unlock(sk);
 		return false;
+	}
 
 	sk->sk_forward_alloc -= size;
 	msk->wmem_reserved += size;
+	mptcp_data_unlock(sk);
 
 account:
 	msk->wmem_reserved -= size;
 	return true;
 }
 
+static void mptcp_wmem_uncharge(struct sock *sk, int size)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (msk->wmem_reserved < 0)
+		msk->wmem_reserved = 0;
+	msk->wmem_reserved += size;
+}
+
 static void dfrag_uncharge(struct sock *sk, int len)
 {
 	sk_mem_uncharge(sk, len);
@@ -976,6 +982,7 @@ static void mptcp_clean_una(struct sock *sk)
 	if (__mptcp_check_fallback(msk))
 		atomic64_set(&msk->snd_una, msk->snd_nxt);
 
+	mptcp_data_lock(sk);
 	snd_una = atomic64_read(&msk->snd_una);
 
 	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
@@ -1007,6 +1014,7 @@ static void mptcp_clean_una(struct sock *sk)
 out:
 	if (cleaned && tcp_under_memory_pressure(sk))
 		sk_mem_reclaim_partial(sk);
+	mptcp_data_unlock(sk);
 }
 
 static void mptcp_clean_una_wakeup(struct sock *sk)
@@ -1436,7 +1444,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 		if (copy_page_from_iter(dfrag->page, offset, psize,
 					&msg->msg_iter) != psize) {
-			msk->wmem_reserved += psize + frag_truesize;
+			mptcp_wmem_uncharge(sk, psize + frag_truesize);
 			ret = -EFAULT;
 			goto out;
 		}
@@ -1502,11 +1510,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 				struct msghdr *msg,
 				size_t len)
 {
-	struct sock *sk = (struct sock *)msk;
 	struct sk_buff *skb;
 	int copied = 0;
 
-	while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+	while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
 		u32 offset = MPTCP_SKB_CB(skb)->offset;
 		u32 data_len = skb->len - offset;
 		u32 count = min_t(size_t, len - copied, data_len);
@@ -1526,7 +1533,10 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
 			break;
 		}
 
-		__skb_unlink(skb, &sk->sk_receive_queue);
+		/* we will bulk release the skb memory later */
+		skb->destructor = NULL;
+		msk->rmem_released += skb->truesize;
+		__skb_unlink(skb, &msk->receive_queue);
 		__kfree_skb(skb);
 
 		if (copied >= len)
@@ -1634,25 +1644,47 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	msk->rcvq_space.time = mstamp;
 }
 
+static void __mptcp_update_rmem(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (!msk->rmem_released)
+		return;
+
+	atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
+	sk_mem_uncharge(sk, msk->rmem_released);
+	msk->rmem_released = 0;
+}
+
+static void __mptcp_splice_receive_queue(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+}
+
 static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
 {
+	struct sock *sk = (struct sock *)msk;
 	unsigned int moved = 0;
-	bool done;
-
-	/* avoid looping forever below on racing close */
-	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
-		return false;
+	bool ret, done;
 
 	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 		bool slowpath;
 
-		if (!ssk)
+		/* we can have data pending in the subflows only if the msk
+		 * receive buffer was full at subflow_data_ready() time,
+		 * that is an unlikely slow path.
+		 */
+		if (likely(!ssk))
 			break;
 
 		slowpath = lock_sock_fast(ssk);
+		mptcp_data_lock(sk);
 		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+		mptcp_data_unlock(sk);
 		if (moved && rcv) {
 			WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
 			tcp_cleanup_rbuf(ssk, 1);
@@ -1661,11 +1693,19 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
 		unlock_sock_fast(ssk, slowpath);
 	} while (!done);
 
-	if (mptcp_ofo_queue(msk) || moved > 0) {
-		mptcp_check_data_fin((struct sock *)msk);
-		return true;
+	/* acquire the data lock only if some input data is pending */
+	ret = moved > 0;
+	if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) ||
+	    !skb_queue_empty_lockless(&sk->sk_receive_queue)) {
+		mptcp_data_lock(sk);
+		__mptcp_update_rmem(sk);
+		ret |= __mptcp_ofo_queue(msk);
+		__mptcp_splice_receive_queue(sk);
+		mptcp_data_unlock(sk);
 	}
-	return false;
+	if (ret)
+		mptcp_check_data_fin((struct sock *)msk);
+	return !skb_queue_empty(&msk->receive_queue);
 }
 
 static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
@@ -1679,7 +1719,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
 		return -EOPNOTSUPP;
 
-	lock_sock(sk);
+	mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
 	if (unlikely(sk->sk_state == TCP_LISTEN)) {
 		copied = -ENOTCONN;
 		goto out_err;
@@ -1689,7 +1729,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 	len = min_t(size_t, len, INT_MAX);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
-	__mptcp_flush_join_list(msk);
 
 	for (;;) {
 		int bytes_read, old_space;
@@ -1703,7 +1742,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		copied += bytes_read;
 
-		if (skb_queue_empty(&sk->sk_receive_queue) &&
+		if (skb_queue_empty(&msk->receive_queue) &&
 		    __mptcp_move_skbs(msk, len - copied))
 			continue;
 
@@ -1734,8 +1773,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
 			mptcp_check_for_eof(msk);
 
-		if (sk->sk_shutdown & RCV_SHUTDOWN)
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			/* race breaker: the shutdown could be after the
+			 * previous receive queue check
+			 */
+			if (__mptcp_move_skbs(msk, len - copied))
+				continue;
 			break;
+		}
 
 		if (sk->sk_state == TCP_CLOSE) {
 			copied = -ENOTCONN;
@@ -1757,7 +1802,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		mptcp_wait_data(sk, &timeo);
 	}
 
-	if (skb_queue_empty(&sk->sk_receive_queue)) {
+	if (skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+	    skb_queue_empty(&msk->receive_queue)) {
 		/* entire backlog drained, clear DATA_READY. */
 		clear_bit(MPTCP_DATA_READY, &msk->flags);
 
@@ -1773,7 +1819,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 out_err:
 	pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
 		 msk, test_bit(MPTCP_DATA_READY, &msk->flags),
-		 skb_queue_empty(&sk->sk_receive_queue), copied);
+		 skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
 	mptcp_rcv_space_adjust(msk, copied);
 
 	release_sock(sk);
@@ -2076,9 +2122,11 @@ static int __mptcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&msk->join_list);
 	INIT_LIST_HEAD(&msk->rtx_queue);
 	INIT_WORK(&msk->work, mptcp_worker);
+	__skb_queue_head_init(&msk->receive_queue);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
 	msk->wmem_reserved = 0;
+	msk->rmem_released = 0;
 
 	msk->ack_hint = NULL;
 	msk->first = NULL;
@@ -2274,6 +2322,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	sk->sk_prot->destroy(sk);
 
 	WARN_ON_ONCE(msk->wmem_reserved);
+	WARN_ON_ONCE(msk->rmem_released);
 	sk_stream_kill_queues(sk);
 	xfrm_sk_free_policy(sk);
 	sk_refcnt_debug_release(sk);
@@ -2491,6 +2540,11 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 
 void mptcp_destroy_common(struct mptcp_sock *msk)
 {
+	struct sock *sk = (struct sock *)msk;
+
+	/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+	skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+
 	skb_rbtree_purge(&msk->out_of_order_queue);
 	mptcp_token_destroy(msk);
 	mptcp_pm_free_anno_list(msk);
@@ -2626,6 +2680,7 @@ static void mptcp_release_cb(struct sock *sk)
 
 	/* clear any wmem reservation and errors */
 	__mptcp_update_wmem(sk);
+	__mptcp_update_rmem(sk);
 
 	do {
 		flags = sk->sk_tsq_flags;

net/mptcp/protocol.h

Lines changed: 5 additions & 0 deletions
@@ -227,6 +227,7 @@ struct mptcp_sock {
 	unsigned long	timer_ival;
 	u32		token;
 	int		rmem_pending;
+	int		rmem_released;
 	unsigned long	flags;
 	bool		can_ack;
 	bool		fully_established;
@@ -238,6 +239,7 @@ struct mptcp_sock {
 	struct work_struct work;
 	struct sk_buff	*ooo_last_skb;
 	struct rb_root	out_of_order_queue;
+	struct sk_buff_head receive_queue;
 	struct list_head conn_list;
 	struct list_head rtx_queue;
 	struct mptcp_data_frag *first_pending;
@@ -267,6 +269,9 @@ struct mptcp_sock {
 	local_bh_enable();				\
 } while (0)
 
+#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
+#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
+
 #define mptcp_for_each_subflow(__msk, __subflow)	\
 	list_for_each_entry(__subflow, &((__msk)->conn_list), node)
 