
Commit d5f4919

Paolo Abeni authored and davem330 committed
mptcp: allow picking different xmit subflows
Update the scheduler to a less trivial heuristic: cache the last used
subflow, and try to send a reasonably long burst of data on it. When
the burst or the subflow send space is exhausted, pick the subflow
with the lower ratio between write space and send buffer - that is,
the subflow with the greater relative amount of free space.

v1 -> v2:
 - fix 32 bit build breakage due to 64 bits div
 - fix checkpatch issues (uint64_t -> u64)

Signed-off-by: Paolo Abeni <[email protected]>
Reviewed-by: Mat Martineau <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 4596a2c commit d5f4919

2 files changed: +99 −18

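Before diving into the diff, here is a minimal, self-contained userspace
sketch of the new selection heuristic. The struct, fields and function
names below are invented for illustration; the real code, shown in full
in the diff, works on struct sock and struct mptcp_subflow_context:

	/* toy model of the ratio-based pick in mptcp_subflow_get_send() */
	#include <stdint.h>
	#include <stddef.h>

	struct sf {
		uint64_t wmem_queued;	/* bytes already queued to xmit */
		uint32_t pacing_rate;	/* bytes/sec, 0 if unknown */
		int	 backup;	/* 1 if a backup subflow */
		int	 active;	/* established, join complete */
	};

	static struct sf *pick_subflow(struct sf *sfs, size_t n)
	{
		struct sf *best[2] = { NULL, NULL };	/* [0]=normal, [1]=backup */
		uint64_t ratio[2] = { UINT64_MAX, UINT64_MAX };
		int nr_active = 0;

		for (size_t i = 0; i < n; i++) {
			struct sf *s = &sfs[i];
			uint64_t r;

			if (!s->active)
				continue;
			nr_active += !s->backup;
			if (!s->pacing_rate)
				continue;
			/* scaled queued/pace: smaller = drains sooner */
			r = (s->wmem_queued << 32) / s->pacing_rate;
			if (r < ratio[s->backup]) {
				ratio[s->backup] = r;
				best[s->backup] = s;
			}
		}
		/* fall back to the best backup if nothing else is active */
		if (!nr_active)
			best[0] = best[1];
		return best[0];
	}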

net/mptcp/protocol.c  (+95 −16)

@@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 	}
 }
 
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+	if (subflow->request_join && !subflow->fully_established)
+		return false;
+
+	/* only send if our side has not closed yet */
+	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
+					 sizeof(struct tcphdr) - \
+					 MAX_TCP_OPTION_SPACE - \
+					 sizeof(struct ipv6hdr) - \
+					 sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+	struct sock *ssk;
+	u64 ratio;
+};
+
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 					   u32 *sndbuf)
 {
+	struct subflow_send_info send_info[2];
 	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	struct sock *backup = NULL;
-	bool free;
+	int i, nr_active = 0;
+	struct sock *ssk;
+	u64 ratio;
+	u32 pace;
 
-	sock_owned_by_me(sk);
+	sock_owned_by_me((struct sock *)msk);
 
 	*sndbuf = 0;
 	if (!mptcp_ext_cache_refill(msk))
 		return NULL;
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
-		free = sk_stream_is_writeable(subflow->tcp_sock);
-		if (!free) {
-			mptcp_nospace(msk);
+	if (__mptcp_check_fallback(msk)) {
+		if (!msk->first)
 			return NULL;
+		*sndbuf = msk->first->sk_sndbuf;
+		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+	}
+
+	/* re-use last subflow, if the burst allow that */
+	if (msk->last_snd && msk->snd_burst > 0 &&
+	    sk_stream_memory_free(msk->last_snd) &&
+	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+		mptcp_for_each_subflow(msk, subflow) {
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
 		}
+		return msk->last_snd;
+	}
+
+	/* pick the subflow with the lower wmem/wspace ratio */
+	for (i = 0; i < 2; ++i) {
+		send_info[i].ssk = NULL;
+		send_info[i].ratio = -1;
+	}
+	mptcp_for_each_subflow(msk, subflow) {
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
 
+		nr_active += !subflow->backup;
 		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
-		if (subflow->backup) {
-			if (!backup)
-				backup = ssk;
+		if (!sk_stream_memory_free(subflow->tcp_sock))
+			continue;
 
+		pace = READ_ONCE(ssk->sk_pacing_rate);
+		if (!pace)
 			continue;
-		}
 
-		return ssk;
+		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+				pace);
+		if (ratio < send_info[subflow->backup].ratio) {
+			send_info[subflow->backup].ssk = ssk;
+			send_info[subflow->backup].ratio = ratio;
+		}
 	}
 
-	return backup;
+	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+		 send_info[1].ssk, send_info[1].ratio);
+
+	/* pick the best backup if no other subflow is active */
+	if (!nr_active)
+		send_info[0].ssk = send_info[1].ssk;
+
+	if (send_info[0].ssk) {
+		msk->last_snd = send_info[0].ssk;
+		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+				       sk_stream_wspace(msk->last_snd));
+		return msk->last_snd;
+	}
+	return NULL;
 }
 
 static void ssk_check_wmem(struct mptcp_sock *msk)
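A quick worked example of the ratio above (numbers invented): with 32 KiB
queued on a subflow pacing at 10 MB/s versus 16 KiB queued on one pacing
at 1 MB/s, the division picks the faster subflow even though it has more
data queued, because its queue drains sooner:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* subflow A: 32 KiB queued, pacing 10 MB/s */
		uint64_t a = ((uint64_t)32768 << 32) / 10000000;
		/* subflow B: 16 KiB queued, pacing 1 MB/s */
		uint64_t b = ((uint64_t)16384 << 32) / 1000000;

		/* prints "pick A": ~3.3 ms of queue vs ~16.4 ms */
		printf("pick %s\n", a < b ? "A" : "B");
		return 0;
	}

The << 32 scaling is what div_u64() is fed above: it keeps precision in
integer math, and the resulting ratio is a fixed-point estimate of how
long the already-queued data takes to drain.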
@@ -1160,6 +1224,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			break;
 		}
 
+		/* burst can be negative, we will try move to the next subflow
+		 * at selection time, if possible.
+		 */
+		msk->snd_burst -= ret;
 		copied += ret;
 
 		tx_ok = msg_data_left(msg);
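Note that snd_burst is a plain signed int (see the protocol.h hunk
below), so the subtraction above may overshoot. A toy trace of the
budget, with invented sizes:

	#include <stdio.h>

	int main(void)
	{
		int snd_burst = 60000;	/* set when a subflow is picked */
		int sent[] = { 28000, 28000, 28000 };

		for (int i = 0; i < 3; i++) {
			snd_burst -= sent[i];	/* 32000, 4000, -24000 */
			printf("burst=%d -> %s\n", snd_burst,
			       snd_burst > 0 ? "keep subflow" :
					       "re-pick on next send");
		}
		return 0;
	}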
@@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	unsigned int moved = 0;
 	bool done;
 
+	/* avoid looping forever below on racing close */
+	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+		return false;
+
 	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
 
@@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
 
 	sock_owned_by_me((const struct sock *)msk);
 
+	if (__mptcp_check_fallback(msk))
+		return msk->first;
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
 		/* still data outstanding at TCP level? Don't retransmit. */
 		if (!tcp_write_queue_empty(ssk))
 			return NULL;
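Both the send and retransmit paths now gate on mptcp_subflow_active(),
whose state test uses the classic (1 << state) & mask set-membership
idiom. A standalone sketch, with state values mirroring
include/net/tcp_states.h (TCPF_x == 1 << TCP_x):

	#include <stdio.h>

	enum { TCP_ESTABLISHED = 1, TCP_CLOSE = 7, TCP_CLOSE_WAIT = 8 };
	#define TCPF_ESTABLISHED	(1 << TCP_ESTABLISHED)
	#define TCPF_CLOSE_WAIT		(1 << TCP_CLOSE_WAIT)

	static int can_send(int sk_state)
	{
		return (1 << sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT);
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       !!can_send(TCP_ESTABLISHED),	/* 1 */
		       !!can_send(TCP_CLOSE_WAIT),	/* 1 */
		       !!can_send(TCP_CLOSE));		/* 0 */
		return 0;
	}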

net/mptcp/protocol.h  (+4 −2)

@@ -196,6 +196,8 @@ struct mptcp_sock {
 	u64		write_seq;
 	u64		ack_seq;
 	u64		rcv_data_fin_seq;
+	struct sock	*last_snd;
+	int		snd_burst;
 	atomic64_t	snd_una;
 	unsigned long	timer_ival;
 	u32		token;
@@ -473,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)
 
 void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
 
-static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
+static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
 {
 	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
 }
 
-static inline bool mptcp_check_fallback(struct sock *sk)
+static inline bool mptcp_check_fallback(const struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
