@@ -122,14 +122,27 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
 	return NULL;
 }
 
+static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
+					     const struct sk_buff *skb,
+					     const struct mptcp_ext *mpext)
+{
+	if (!tcp_skb_can_collapse_to(skb))
+		return false;
+
+	/* can collapse only if MPTCP level sequence is in order */
+	return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+}
+
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
-			      struct msghdr *msg, long *timeo)
+			      struct msghdr *msg, long *timeo, int *pmss_now,
+			      int *ps_goal)
 {
-	int mss_now = 0, size_goal = 0, ret = 0;
+	int mss_now, avail_size, size_goal, ret;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_ext *mpext = NULL;
+	struct sk_buff *skb, *tail;
+	bool can_collapse = false;
 	struct page_frag *pfrag;
-	struct sk_buff *skb;
 	size_t psize;
 
 	/* use the mptcp page cache so that we can easily move the data
@@ -145,8 +158,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 
 	/* compute copy limit */
 	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
-	psize = min_t(int, pfrag->size - pfrag->offset, size_goal);
+	*pmss_now = mss_now;
+	*ps_goal = size_goal;
+	avail_size = size_goal;
+	skb = tcp_write_queue_tail(ssk);
+	if (skb) {
+		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+
+		/* Limit the write to the size available in the
+		 * current skb, if any, so that we create at most a new skb.
+		 * Explicitly tells TCP internals to avoid collapsing on later
+		 * queue management operation, to avoid breaking the ext <->
+		 * SSN association set here
+		 */
+		can_collapse = (size_goal - skb->len > 0) &&
+			       mptcp_skb_can_collapse_to(msk, skb, mpext);
+		if (!can_collapse)
+			TCP_SKB_CB(skb)->eor = 1;
+		else
+			avail_size = size_goal - skb->len;
+	}
+	psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
 
+	/* Copy to page */
 	pr_debug("left=%zu", msg_data_left(msg));
 	psize = copy_page_from_iter(pfrag->page, pfrag->offset,
 				    min_t(size_t, msg_data_left(msg), psize),
@@ -155,21 +189,28 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	if (!psize)
 		return -EINVAL;
 
-	/* Mark the end of the previous write so the beginning of the
-	 * next write (with its own mptcp skb extension data) is not
-	 * collapsed.
+	/* tell the TCP stack to delay the push so that we can safely
+	 * access the skb after the sendpages call
 	 */
-	skb = tcp_write_queue_tail(ssk);
-	if (skb)
-		TCP_SKB_CB(skb)->eor = 1;
-
 	ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
 			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
 	if (ret <= 0)
 		return ret;
 	if (unlikely(ret < psize))
 		iov_iter_revert(&msg->msg_iter, psize - ret);
 
+	/* if the tail skb extension is still the cached one, collapsing
+	 * really happened. Note: we can't check for 'same skb' as the sk_buff
+	 * hdr on tail can be transmitted, freed and re-allocated by the
+	 * do_tcp_sendpages() call
+	 */
+	tail = tcp_write_queue_tail(ssk);
+	if (mpext && tail && mpext == skb_ext_find(tail, SKB_EXT_MPTCP)) {
+		WARN_ON_ONCE(!can_collapse);
+		mpext->data_len += ret;
+		goto out;
+	}
+
 	skb = tcp_write_queue_tail(ssk);
 	mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
 	msk->cached_ext = NULL;
@@ -185,11 +226,11 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
 		 mpext->dsn64);
 
+out:
 	pfrag->offset += ret;
 	msk->write_seq += ret;
 	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
 
-	tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
 	return ret;
 }
 
@@ -212,11 +253,11 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
 
 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
+	int mss_now = 0, size_goal = 0, ret = 0;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct socket *ssock;
 	size_t copied = 0;
 	struct sock *ssk;
-	int ret = 0;
 	long timeo;
 
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
@@ -243,15 +284,19 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	lock_sock(ssk);
 	while (msg_data_left(msg)) {
-		ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo);
+		ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+					 &size_goal);
 		if (ret < 0)
 			break;
 
 		copied += ret;
 	}
 
-	if (copied > 0)
+	if (copied) {
 		ret = copied;
+		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+			 size_goal);
+	}
 
 	ssk_check_wmem(msk, ssk);
 	release_sock(ssk);
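
Note on the collapse condition added above: new data may only be appended to the tail skb when it extends the MPTCP-level mapping contiguously, i.e. when data_seq + data_len of the cached extension equals the socket's write_seq. The standalone userspace sketch below uses simplified stand-in structs (not the kernel's types) and omits the TCP-level tcp_skb_can_collapse_to() check; it is only meant to illustrate that sequence test, not the actual kernel code path.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* simplified stand-ins for the kernel structures used by the patch */
struct mptcp_ext_model {
	uint64_t data_seq;	/* MPTCP-level sequence of the mapped data */
	uint32_t data_len;	/* length covered by this mapping */
};

struct mptcp_sock_model {
	uint64_t write_seq;	/* next MPTCP-level sequence to be sent */
};

/* mirrors the MPTCP-level part of mptcp_skb_can_collapse_to(): new data
 * may grow the tail mapping only if it is contiguous with it
 */
static bool can_collapse_to(const struct mptcp_sock_model *msk,
			    const struct mptcp_ext_model *mpext)
{
	return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
}

int main(void)
{
	struct mptcp_ext_model ext = { .data_seq = 1000, .data_len = 500 };
	struct mptcp_sock_model msk = { .write_seq = 1500 };

	/* contiguous: 1000 + 500 == 1500, the existing mapping can grow */
	printf("in order:     %d\n", can_collapse_to(&msk, &ext));

	/* sequence gap: must not collapse, a new mapping is needed */
	msk.write_seq = 2000;
	printf("out of order: %d\n", can_collapse_to(&msk, &ext));
	return 0;
}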