@@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 	}
 }
 
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+	/* can't send if the JOIN hasn't completed yet (i.e. the subflow
+	 * is not yet usable for MPTCP)
+	 */
+	if (subflow->request_join && !subflow->fully_established)
+		return false;
+
+	/* only send if our side has not closed yet */
+	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
+					 sizeof(struct tcphdr) - \
+					 MAX_TCP_OPTION_SPACE - \
+					 sizeof(struct ipv6hdr) - \
+					 sizeof(struct frag_hdr))
+
+struct subflow_send_info {
+	struct sock *ssk;
+	u64 ratio;
+};
+
 static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 					   u32 *sndbuf)
 {
+	struct subflow_send_info send_info[2];
 	struct mptcp_subflow_context *subflow;
-	struct sock *sk = (struct sock *)msk;
-	struct sock *backup = NULL;
-	bool free;
+	int i, nr_active = 0;
+	struct sock *ssk;
+	u64 ratio;
+	u32 pace;
 
-	sock_owned_by_me(sk);
+	sock_owned_by_me((struct sock *)msk);
 
 	*sndbuf = 0;
 	if (!mptcp_ext_cache_refill(msk))
 		return NULL;
 
-	mptcp_for_each_subflow(msk, subflow) {
-		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
-		free = sk_stream_is_writeable(subflow->tcp_sock);
-		if (!free) {
-			mptcp_nospace(msk);
+	if (__mptcp_check_fallback(msk)) {
+		if (!msk->first)
 			return NULL;
+		*sndbuf = msk->first->sk_sndbuf;
+		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
+	}
+
+	/* re-use the last subflow, if the burst allows that */
+	if (msk->last_snd && msk->snd_burst > 0 &&
+	    sk_stream_memory_free(msk->last_snd) &&
+	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
+		mptcp_for_each_subflow(msk, subflow) {
+			ssk = mptcp_subflow_tcp_sock(subflow);
+			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
 		}
+		return msk->last_snd;
+	}
+
+	/* pick the subflow with the lower wmem/pace ratio */
+	for (i = 0; i < 2; ++i) {
+		send_info[i].ssk = NULL;
+		send_info[i].ratio = -1;
+	}
+	mptcp_for_each_subflow(msk, subflow) {
+		ssk = mptcp_subflow_tcp_sock(subflow);
+		if (!mptcp_subflow_active(subflow))
+			continue;
 
+		nr_active += !subflow->backup;
 		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
-		if (subflow->backup) {
-			if (!backup)
-				backup = ssk;
+		if (!sk_stream_memory_free(subflow->tcp_sock))
+			continue;
 
+		pace = READ_ONCE(ssk->sk_pacing_rate);
+		if (!pace)
 			continue;
-		}
 
-		return ssk;
+		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
+				pace);
+		if (ratio < send_info[subflow->backup].ratio) {
+			send_info[subflow->backup].ssk = ssk;
+			send_info[subflow->backup].ratio = ratio;
+		}
 	}
 
-	return backup;
+	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
+		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
+		 send_info[1].ssk, send_info[1].ratio);
+
+	/* pick the best backup if no other subflow is active */
+	if (!nr_active)
+		send_info[0].ssk = send_info[1].ssk;
+
+	if (send_info[0].ssk) {
+		msk->last_snd = send_info[0].ssk;
+		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
+				       sk_stream_wspace(msk->last_snd));
+		return msk->last_snd;
+	}
+	return NULL;
 }
 
 static void ssk_check_wmem(struct mptcp_sock *msk)
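
The scheduler above works in two tiers, indexed by subflow->backup: within each tier it keeps the active, writable subflow with the smallest div_u64((u64)wmem_queued << 32, pacing_rate) value, i.e. the one whose queued data would drain fastest at its current pacing rate (the << 32 is fixed-point scaling so the integer division keeps precision). Regular subflows (slot 0) always beat backups (slot 1) unless nr_active is zero. Below is a self-contained userspace sketch of just that selection metric; the struct, names, and sample numbers are invented for illustration, not kernel API:

#include <stdint.h>
#include <stdio.h>

struct sample_subflow {		/* hypothetical stand-in for a kernel subflow */
	const char *name;
	int backup;		/* 1 = backup-flagged subflow */
	uint64_t wmem_queued;	/* bytes queued in the write queue */
	uint64_t pacing_rate;	/* bytes per second */
};

int main(void)
{
	struct sample_subflow flows[] = {
		{ "wifi",     0, 128 * 1024, 10 * 1000 * 1000 },
		{ "ethernet", 0,  64 * 1024, 50 * 1000 * 1000 },
		{ "lte",      1,  16 * 1024,  5 * 1000 * 1000 },
	};
	/* best[0]: regular subflows, best[1]: backup subflows */
	const struct sample_subflow *best[2] = { NULL, NULL };
	uint64_t best_ratio[2] = { UINT64_MAX, UINT64_MAX };

	for (unsigned int i = 0; i < sizeof(flows) / sizeof(flows[0]); i++) {
		const struct sample_subflow *sf = &flows[i];
		/* same fixed-point metric as the patch: queued bytes,
		 * scaled by 2^32, divided by the pacing rate
		 */
		uint64_t ratio = (sf->wmem_queued << 32) / sf->pacing_rate;

		if (ratio < best_ratio[sf->backup]) {
			best[sf->backup] = sf;
			best_ratio[sf->backup] = ratio;
		}
	}

	/* regular subflows win; the backup tier is only a fallback */
	const struct sample_subflow *pick = best[0] ? best[0] : best[1];
	printf("selected subflow: %s\n", pick ? pick->name : "(none)");
	return 0;
}

With these sample numbers the ethernet subflow wins: it has less queued data relative to its pacing rate, even though the wifi subflow queued more absolute bytes.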
@@ -1160,6 +1224,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			break;
 		}
 
+		/* burst can be negative, we will try to move to the next subflow
+		 * at selection time, if possible.
+		 */
+		msk->snd_burst -= ret;
 		copied += ret;
 
 		tx_ok = msg_data_left(msg);
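
For reference, the budget that snd_burst starts from, MPTCP_SEND_BURST_SIZE, works out to 65536 - 20 (tcphdr) - 40 (MAX_TCP_OPTION_SPACE) - 40 (ipv6hdr) - 8 (frag_hdr) = 65428 bytes on common configurations. Since ret is subtracted without clamping, the budget can indeed go negative; the next mptcp_subflow_get_send() call then fails its msk->snd_burst > 0 check and re-runs the full selection. A toy trace of that accounting, with made-up chunk sizes:

#include <stdio.h>

/* 65536 minus worst-case header overhead (20 + 40 + 40 + 8) = 65428
 * on typical builds; see MPTCP_SEND_BURST_SIZE in the patch
 */
#define SEND_BURST_SIZE 65428

int main(void)
{
	int snd_burst = SEND_BURST_SIZE;
	int chunks[] = { 32768, 32768, 16384 };	/* bytes accepted per pass */

	for (unsigned int i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
		snd_burst -= chunks[i];
		printf("sent %5d, burst budget %6d%s\n", chunks[i], snd_burst,
		       snd_burst > 0 ? "" : "  (re-pick subflow on next send)");
	}
	return 0;
}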
@@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 	unsigned int moved = 0;
 	bool done;
 
+	/* avoid looping forever below on racing close */
+	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
+		return false;
+
+	__mptcp_flush_join_list(msk);
 	do {
 		struct sock *ssk = mptcp_subflow_recv_lookup(msk);
@@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
 
 	sock_owned_by_me((const struct sock *)msk);
 
+	if (__mptcp_check_fallback(msk))
+		return msk->first;
+
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
+		if (!mptcp_subflow_active(subflow))
+			continue;
+
 		/* still data outstanding at TCP level? Don't retransmit. */
 		if (!tcp_write_queue_empty(ssk))
 			return NULL;
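
The retransmit picker now reuses the same liveness rules as the send path: on fallback it short-circuits to msk->first, and otherwise it skips any subflow that fails mptcp_subflow_active(). The state test in that helper relies on the kernel convention that each TCPF_* flag is 1 << TCP_*, so (1 << sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) checks "state is one of these" with a single mask. A standalone illustration; the TCP state numbers match the kernel's include/net/tcp_states.h, everything else is a made-up stand-in:

#include <stdbool.h>
#include <stdio.h>

/* state values as in include/net/tcp_states.h */
enum { TCP_ESTABLISHED = 1, TCP_FIN_WAIT1 = 4, TCP_CLOSE_WAIT = 8 };
#define TCPF_ESTABLISHED (1 << TCP_ESTABLISHED)
#define TCPF_CLOSE_WAIT  (1 << TCP_CLOSE_WAIT)

/* hypothetical mirror of the patch's liveness test */
static bool subflow_active(int sk_state, bool request_join,
			   bool fully_established)
{
	/* a joining subflow is unusable until the JOIN fully completes */
	if (request_join && !fully_established)
		return false;
	/* usable only while our side may still send */
	return (1 << sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT);
}

int main(void)
{
	printf("%d\n", subflow_active(TCP_ESTABLISHED, false, false)); /* 1 */
	printf("%d\n", subflow_active(TCP_ESTABLISHED, true, false));  /* 0: JOIN pending */
	printf("%d\n", subflow_active(TCP_FIN_WAIT1, false, false));   /* 0: we closed */
	return 0;
}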