Skip to content

Commit 6d0060f

Browse files
mjmartineau authored and davem330 committed
mptcp: Write MPTCP DSS headers to outgoing data packets
Per-packet metadata required to write the MPTCP DSS option is written to the skb_ext area. One write to the socket may contain more than one packet of data, which is copied to page fragments and mapped into MPTCP DSS segments with size determined by the available page fragments and the maximum mapping length allowed by the MPTCP specification. If do_tcp_sendpages() splits a DSS segment into multiple skbs, that's ok - the later skbs can either have duplicated DSS mapping information or none at all, and the receiver can handle that.

The current implementation uses the subflow frag cache and tcp sendpages to avoid excessive code duplication. More work is required to ensure that it works correctly under memory pressure and to support MPTCP-level retransmissions.

The MPTCP DSS checksum is not yet implemented.

Co-developed-by: Paolo Abeni <[email protected]>
Signed-off-by: Paolo Abeni <[email protected]>
Co-developed-by: Peter Krystad <[email protected]>
Signed-off-by: Peter Krystad <[email protected]>
Co-developed-by: Florian Westphal <[email protected]>
Signed-off-by: Florian Westphal <[email protected]>
Signed-off-by: Mat Martineau <[email protected]>
Signed-off-by: Christoph Paasch <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 717e79c commit 6d0060f

File tree

4 files changed

+286
-6
lines changed

4 files changed

+286
-6
lines changed

include/net/mptcp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ struct mptcp_out_options {
3232
u16 suboptions;
3333
u64 sndr_key;
3434
u64 rcvr_key;
35+
struct mptcp_ext ext_copy;
3536
#endif
3637
};
3738

net/mptcp/options.c

Lines changed: 151 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,13 +134,13 @@ void mptcp_rcv_synsent(struct sock *sk)
134134
}
135135
}
136136

137-
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
138-
unsigned int *size, unsigned int remaining,
139-
struct mptcp_out_options *opts)
137+
static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size,
138+
unsigned int remaining,
139+
struct mptcp_out_options *opts)
140140
{
141141
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
142142

143-
if (subflow->mp_capable && !subflow->fourth_ack) {
143+
if (!subflow->fourth_ack) {
144144
opts->suboptions = OPTION_MPTCP_MPC_ACK;
145145
opts->sndr_key = subflow->local_key;
146146
opts->rcvr_key = subflow->remote_key;
@@ -153,6 +153,112 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
153153
return false;
154154
}
155155

156+
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
157+
struct mptcp_ext *ext)
158+
{
159+
ext->data_fin = 1;
160+
161+
if (!ext->use_map) {
162+
/* RFC6824 requires a DSS mapping with specific values
163+
* if DATA_FIN is set but no data payload is mapped
164+
*/
165+
ext->use_map = 1;
166+
ext->dsn64 = 1;
167+
ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
168+
ext->subflow_seq = 0;
169+
ext->data_len = 1;
170+
} else {
171+
/* If there's an existing DSS mapping, DATA_FIN consumes
172+
* 1 additional byte of mapping space.
173+
*/
174+
ext->data_len++;
175+
}
176+
}
177+
178+
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
179+
unsigned int *size,
180+
unsigned int remaining,
181+
struct mptcp_out_options *opts)
182+
{
183+
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
184+
unsigned int dss_size = 0;
185+
struct mptcp_ext *mpext;
186+
struct mptcp_sock *msk;
187+
unsigned int ack_size;
188+
u8 tcp_fin;
189+
190+
if (skb) {
191+
mpext = mptcp_get_ext(skb);
192+
tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
193+
} else {
194+
mpext = NULL;
195+
tcp_fin = 0;
196+
}
197+
198+
if (!skb || (mpext && mpext->use_map) || tcp_fin) {
199+
unsigned int map_size;
200+
201+
map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
202+
203+
remaining -= map_size;
204+
dss_size = map_size;
205+
if (mpext)
206+
opts->ext_copy = *mpext;
207+
208+
if (skb && tcp_fin &&
209+
subflow->conn->sk_state != TCP_ESTABLISHED)
210+
mptcp_write_data_fin(subflow, &opts->ext_copy);
211+
}
212+
213+
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
214+
215+
/* Add kind/length/subtype/flag overhead if mapping is not populated */
216+
if (dss_size == 0)
217+
ack_size += TCPOLEN_MPTCP_DSS_BASE;
218+
219+
dss_size += ack_size;
220+
221+
msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn);
222+
if (msk) {
223+
opts->ext_copy.data_ack = msk->ack_seq;
224+
} else {
225+
mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key,
226+
NULL, &opts->ext_copy.data_ack);
227+
opts->ext_copy.data_ack++;
228+
}
229+
230+
opts->ext_copy.ack64 = 1;
231+
opts->ext_copy.use_ack = 1;
232+
233+
*size = ALIGN(dss_size, 4);
234+
return true;
235+
}
236+
237+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
238+
unsigned int *size, unsigned int remaining,
239+
struct mptcp_out_options *opts)
240+
{
241+
unsigned int opt_size = 0;
242+
bool ret = false;
243+
244+
if (mptcp_established_options_mp(sk, &opt_size, remaining, opts))
245+
ret = true;
246+
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
247+
opts))
248+
ret = true;
249+
250+
/* we reserved enough space for the above options, and exceeding the
251+
* TCP option space would be fatal
252+
*/
253+
if (WARN_ON_ONCE(opt_size > remaining))
254+
return false;
255+
256+
*size += opt_size;
257+
remaining -= opt_size;
258+
259+
return ret;
260+
}
261+
156262
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
157263
struct mptcp_out_options *opts)
158264
{
@@ -194,4 +300,45 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
194300
ptr += 2;
195301
}
196302
}
303+
304+
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
305+
struct mptcp_ext *mpext = &opts->ext_copy;
306+
u8 len = TCPOLEN_MPTCP_DSS_BASE;
307+
u8 flags = 0;
308+
309+
if (mpext->use_ack) {
310+
len += TCPOLEN_MPTCP_DSS_ACK64;
311+
flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
312+
}
313+
314+
if (mpext->use_map) {
315+
len += TCPOLEN_MPTCP_DSS_MAP64;
316+
317+
/* Use only 64-bit mapping flags for now, add
318+
* support for optional 32-bit mappings later.
319+
*/
320+
flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
321+
if (mpext->data_fin)
322+
flags |= MPTCP_DSS_DATA_FIN;
323+
}
324+
325+
*ptr++ = htonl((TCPOPT_MPTCP << 24) |
326+
(len << 16) |
327+
(MPTCPOPT_DSS << 12) |
328+
(flags));
329+
330+
if (mpext->use_ack) {
331+
put_unaligned_be64(mpext->data_ack, ptr);
332+
ptr += 2;
333+
}
334+
335+
if (mpext->use_map) {
336+
put_unaligned_be64(mpext->data_seq, ptr);
337+
ptr += 2;
338+
put_unaligned_be32(mpext->subflow_seq, ptr);
339+
ptr += 1;
340+
put_unaligned_be32(mpext->data_len << 16 |
341+
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
342+
}
343+
}
197344
}

net/mptcp/protocol.c

Lines changed: 114 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,93 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
9797
return NULL;
9898
}
9999

100+
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
101+
{
102+
if (!msk->cached_ext)
103+
msk->cached_ext = __skb_ext_alloc();
104+
105+
return !!msk->cached_ext;
106+
}
107+
108+
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
109+
struct msghdr *msg, long *timeo)
110+
{
111+
int mss_now = 0, size_goal = 0, ret = 0;
112+
struct mptcp_sock *msk = mptcp_sk(sk);
113+
struct mptcp_ext *mpext = NULL;
114+
struct page_frag *pfrag;
115+
struct sk_buff *skb;
116+
size_t psize;
117+
118+
/* use the mptcp page cache so that we can easily move the data
119+
* from one substream to another, but do per subflow memory accounting
120+
*/
121+
pfrag = sk_page_frag(sk);
122+
while (!sk_page_frag_refill(ssk, pfrag) ||
123+
!mptcp_ext_cache_refill(msk)) {
124+
ret = sk_stream_wait_memory(ssk, timeo);
125+
if (ret)
126+
return ret;
127+
}
128+
129+
/* compute copy limit */
130+
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
131+
psize = min_t(int, pfrag->size - pfrag->offset, size_goal);
132+
133+
pr_debug("left=%zu", msg_data_left(msg));
134+
psize = copy_page_from_iter(pfrag->page, pfrag->offset,
135+
min_t(size_t, msg_data_left(msg), psize),
136+
&msg->msg_iter);
137+
pr_debug("left=%zu", msg_data_left(msg));
138+
if (!psize)
139+
return -EINVAL;
140+
141+
/* Mark the end of the previous write so the beginning of the
142+
* next write (with its own mptcp skb extension data) is not
143+
* collapsed.
144+
*/
145+
skb = tcp_write_queue_tail(ssk);
146+
if (skb)
147+
TCP_SKB_CB(skb)->eor = 1;
148+
149+
ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
150+
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
151+
if (ret <= 0)
152+
return ret;
153+
if (unlikely(ret < psize))
154+
iov_iter_revert(&msg->msg_iter, psize - ret);
155+
156+
skb = tcp_write_queue_tail(ssk);
157+
mpext = __skb_ext_set(skb, SKB_EXT_MPTCP, msk->cached_ext);
158+
msk->cached_ext = NULL;
159+
160+
memset(mpext, 0, sizeof(*mpext));
161+
mpext->data_seq = msk->write_seq;
162+
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
163+
mpext->data_len = ret;
164+
mpext->use_map = 1;
165+
mpext->dsn64 = 1;
166+
167+
pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
168+
mpext->data_seq, mpext->subflow_seq, mpext->data_len,
169+
mpext->dsn64);
170+
171+
pfrag->offset += ret;
172+
msk->write_seq += ret;
173+
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
174+
175+
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal);
176+
return ret;
177+
}
178+
100179
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
101180
{
102181
struct mptcp_sock *msk = mptcp_sk(sk);
103182
struct socket *ssock;
183+
size_t copied = 0;
104184
struct sock *ssk;
105-
int ret;
185+
int ret = 0;
186+
long timeo;
106187

107188
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
108189
return -EOPNOTSUPP;
@@ -116,14 +197,29 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
116197
return ret;
117198
}
118199

200+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
201+
119202
ssk = mptcp_subflow_get(msk);
120203
if (!ssk) {
121204
release_sock(sk);
122205
return -ENOTCONN;
123206
}
124207

125-
ret = sock_sendmsg(ssk->sk_socket, msg);
208+
pr_debug("conn_list->subflow=%p", ssk);
126209

210+
lock_sock(ssk);
211+
while (msg_data_left(msg)) {
212+
ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo);
213+
if (ret < 0)
214+
break;
215+
216+
copied += ret;
217+
}
218+
219+
if (copied > 0)
220+
ret = copied;
221+
222+
release_sock(ssk);
127223
release_sock(sk);
128224
return ret;
129225
}
@@ -235,6 +331,8 @@ static void mptcp_close(struct sock *sk, long timeout)
235331
__mptcp_close_ssk(sk, ssk, subflow, timeout);
236332
}
237333

334+
if (msk->cached_ext)
335+
__skb_ext_put(msk->cached_ext);
238336
release_sock(sk);
239337
sk_common_release(sk);
240338
}
@@ -286,6 +384,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
286384
struct mptcp_subflow_context *subflow;
287385
struct sock *new_mptcp_sock;
288386
struct sock *ssk = newsk;
387+
u64 ack_seq;
289388

290389
subflow = mptcp_subflow_ctx(newsk);
291390
lock_sock(sk);
@@ -310,6 +409,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
310409
msk->subflow = NULL;
311410

312411
mptcp_token_update_accept(newsk, new_mptcp_sock);
412+
413+
mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
414+
msk->write_seq = subflow->idsn + 1;
415+
ack_seq++;
416+
msk->ack_seq = ack_seq;
417+
subflow->rel_write_seq = 1;
313418
newsk = new_mptcp_sock;
314419
mptcp_copy_inaddrs(newsk, ssk);
315420
list_add(&subflow->node, &msk->conn_list);
@@ -404,6 +509,7 @@ void mptcp_finish_connect(struct sock *ssk)
404509
struct mptcp_subflow_context *subflow;
405510
struct mptcp_sock *msk;
406511
struct sock *sk;
512+
u64 ack_seq;
407513

408514
subflow = mptcp_subflow_ctx(ssk);
409515

@@ -413,12 +519,18 @@ void mptcp_finish_connect(struct sock *ssk)
413519
sk = subflow->conn;
414520
msk = mptcp_sk(sk);
415521

522+
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
523+
ack_seq++;
524+
subflow->rel_write_seq = 1;
525+
416526
/* the socket is not connected yet, no msk/subflow ops can access/race
417527
* accessing the field below
418528
*/
419529
WRITE_ONCE(msk->remote_key, subflow->remote_key);
420530
WRITE_ONCE(msk->local_key, subflow->local_key);
421531
WRITE_ONCE(msk->token, subflow->token);
532+
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
533+
WRITE_ONCE(msk->ack_seq, ack_seq);
422534
}
423535

424536
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)

0 commit comments

Comments
 (0)