Skip to content

Commit fd5f4d7

Browse files
committed
Merge branch 'splice-net-rewrite-splice-to-socket-fix-splice_f_more-and-handle-msg_splice_pages-in-af_tls'
David Howells says: ==================== splice, net: Rewrite splice-to-socket, fix SPLICE_F_MORE and handle MSG_SPLICE_PAGES in AF_TLS Here are patches to do the following: (1) Block MSG_SENDPAGE_* flags from leaking into ->sendmsg() from userspace, whilst allowing splice_to_socket() to pass them in. (2) Allow MSG_SPLICE_PAGES to be passed into tls_*_sendmsg(). Until support is added, it will be ignored and a splice-driven sendmsg() will be treated like a normal sendmsg(). TCP, UDP, AF_UNIX and Chelsio-TLS already handle the flag in net-next. (3) Replace a chain of functions to splice-to-sendpage with a single function to splice via sendmsg() with MSG_SPLICE_PAGES. This allows a bunch of pages to be spliced from a pipe in a single call using a bio_vec[] and pushes the main processing loop down into the bowels of the protocol driver rather than repeatedly calling in with a page at a time. (4) Provide a ->splice_eof() op[2] that allows splice to signal to its output that the input observed a premature EOF and that the caller didn't flag SPLICE_F_MORE, thereby allowing a corked socket to be flushed. This attempts to maintain the current behaviour. It is also not called if we didn't manage to read any data and so didn't call the actor function. This needs routing through several layers to get it down to the network protocol. [!] Note that I chose not to pass in any flags - I'm not sure it's particularly useful to pass in the splice flags; I also elected not to return any error code - though we might actually want to do that. (5) Provide tls_{device,sw}_splice_eof() to flush a pending TLS record if there is one. (6) Provide splice_eof() for UDP, TCP, Chelsio-TLS and AF_KCM. AF_UNIX doesn't seem to pay attention to the MSG_MORE or MSG_SENDPAGE_NOTLAST flags. (7) Alter the behaviour of sendfile() and fix SPLICE_F_MORE/MSG_MORE signalling[1] such that SPLICE_F_MORE is always signalled until we have read sufficient data to finish the request. 
If we get a zero-length read before we've managed to splice sufficient data, we now leave the socket expecting more data and leave it to userspace to deal with it. (8) Make AF_TLS handle the MSG_SPLICE_PAGES internal sendmsg flag. MSG_SPLICE_PAGES is an internal hint that tells the protocol that it should splice the pages supplied if it can. Its sendpage implementations are then turned into wrappers around that. Link: https://lore.kernel.org/r/[email protected]/ [1] Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ [2] Link: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=51c78a4d532efe9543a4df019ff405f05c6157f6 # part 1 Link: https://lore.kernel.org/r/[email protected]/ # v1 ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 7360132 + 3dc8976 commit fd5f4d7

File tree

25 files changed

+478
-238
lines changed

25 files changed

+478
-238
lines changed

drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,7 @@ void chtls_destroy_sock(struct sock *sk);
568568
int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
569569
int chtls_recvmsg(struct sock *sk, struct msghdr *msg,
570570
size_t len, int flags, int *addr_len);
571+
void chtls_splice_eof(struct socket *sock);
571572
int chtls_sendpage(struct sock *sk, struct page *page,
572573
int offset, size_t size, int flags);
573574
int send_tx_flowc_wr(struct sock *sk, int compl,

drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,6 +1237,15 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
12371237
goto done;
12381238
}
12391239

1240+
void chtls_splice_eof(struct socket *sock)
1241+
{
1242+
struct sock *sk = sock->sk;
1243+
1244+
lock_sock(sk);
1245+
chtls_tcp_push(sk, 0);
1246+
release_sock(sk);
1247+
}
1248+
12401249
int chtls_sendpage(struct sock *sk, struct page *page,
12411250
int offset, size_t size, int flags)
12421251
{

drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ static void __init chtls_init_ulp_ops(void)
606606
chtls_cpl_prot.destroy = chtls_destroy_sock;
607607
chtls_cpl_prot.shutdown = chtls_shutdown;
608608
chtls_cpl_prot.sendmsg = chtls_sendmsg;
609+
chtls_cpl_prot.splice_eof = chtls_splice_eof;
609610
chtls_cpl_prot.sendpage = chtls_sendpage;
610611
chtls_cpl_prot.recvmsg = chtls_recvmsg;
611612
chtls_cpl_prot.setsockopt = chtls_setsockopt;

fs/splice.c

Lines changed: 167 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <linux/fsnotify.h>
3434
#include <linux/security.h>
3535
#include <linux/gfp.h>
36+
#include <linux/net.h>
3637
#include <linux/socket.h>
3738
#include <linux/sched/signal.h>
3839

@@ -448,30 +449,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
448449
};
449450
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
450451

451-
/*
452-
* Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
453-
* using sendpage(). Return the number of bytes sent.
454-
*/
455-
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
456-
struct pipe_buffer *buf, struct splice_desc *sd)
457-
{
458-
struct file *file = sd->u.file;
459-
loff_t pos = sd->pos;
460-
int more;
461-
462-
if (!likely(file->f_op->sendpage))
463-
return -EINVAL;
464-
465-
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
466-
467-
if (sd->len < sd->total_len &&
468-
pipe_occupancy(pipe->head, pipe->tail) > 1)
469-
more |= MSG_SENDPAGE_NOTLAST;
470-
471-
return file->f_op->sendpage(file, buf->page, buf->offset,
472-
sd->len, &pos, more);
473-
}
474-
475452
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
476453
{
477454
smp_mb();
@@ -652,7 +629,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des
652629
* Description:
653630
* This function does little more than loop over the pipe and call
654631
* @actor to do the actual moving of a single struct pipe_buffer to
655-
* the desired destination. See pipe_to_file, pipe_to_sendpage, or
632+
* the desired destination. See pipe_to_file, pipe_to_sendmsg, or
656633
* pipe_to_user.
657634
*
658635
*/
@@ -833,8 +810,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
833810

834811
EXPORT_SYMBOL(iter_file_splice_write);
835812

813+
#ifdef CONFIG_NET
836814
/**
837-
* generic_splice_sendpage - splice data from a pipe to a socket
815+
* splice_to_socket - splice data from a pipe to a socket
838816
* @pipe: pipe to splice from
839817
* @out: socket to write to
840818
* @ppos: position in @out
@@ -846,13 +824,131 @@ EXPORT_SYMBOL(iter_file_splice_write);
846824
* is involved.
847825
*
848826
*/
849-
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
850-
loff_t *ppos, size_t len, unsigned int flags)
827+
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
828+
loff_t *ppos, size_t len, unsigned int flags)
851829
{
852-
return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
853-
}
830+
struct socket *sock = sock_from_file(out);
831+
struct bio_vec bvec[16];
832+
struct msghdr msg = {};
833+
ssize_t ret = 0;
834+
size_t spliced = 0;
835+
bool need_wakeup = false;
836+
837+
pipe_lock(pipe);
838+
839+
while (len > 0) {
840+
unsigned int head, tail, mask, bc = 0;
841+
size_t remain = len;
842+
843+
/*
844+
* Check for signal early to make process killable when there
845+
* are always buffers available
846+
*/
847+
ret = -ERESTARTSYS;
848+
if (signal_pending(current))
849+
break;
850+
851+
while (pipe_empty(pipe->head, pipe->tail)) {
852+
ret = 0;
853+
if (!pipe->writers)
854+
goto out;
855+
856+
if (spliced)
857+
goto out;
858+
859+
ret = -EAGAIN;
860+
if (flags & SPLICE_F_NONBLOCK)
861+
goto out;
862+
863+
ret = -ERESTARTSYS;
864+
if (signal_pending(current))
865+
goto out;
866+
867+
if (need_wakeup) {
868+
wakeup_pipe_writers(pipe);
869+
need_wakeup = false;
870+
}
871+
872+
pipe_wait_readable(pipe);
873+
}
874+
875+
head = pipe->head;
876+
tail = pipe->tail;
877+
mask = pipe->ring_size - 1;
878+
879+
while (!pipe_empty(head, tail)) {
880+
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
881+
size_t seg;
882+
883+
if (!buf->len) {
884+
tail++;
885+
continue;
886+
}
887+
888+
seg = min_t(size_t, remain, buf->len);
889+
seg = min_t(size_t, seg, PAGE_SIZE);
890+
891+
ret = pipe_buf_confirm(pipe, buf);
892+
if (unlikely(ret)) {
893+
if (ret == -ENODATA)
894+
ret = 0;
895+
break;
896+
}
854897

855-
EXPORT_SYMBOL(generic_splice_sendpage);
898+
bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
899+
remain -= seg;
900+
if (seg >= buf->len)
901+
tail++;
902+
if (bc >= ARRAY_SIZE(bvec))
903+
break;
904+
}
905+
906+
if (!bc)
907+
break;
908+
909+
msg.msg_flags = MSG_SPLICE_PAGES;
910+
if (flags & SPLICE_F_MORE)
911+
msg.msg_flags |= MSG_MORE;
912+
if (remain && pipe_occupancy(pipe->head, tail) > 0)
913+
msg.msg_flags |= MSG_MORE;
914+
915+
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
916+
len - remain);
917+
ret = sock_sendmsg(sock, &msg);
918+
if (ret <= 0)
919+
break;
920+
921+
spliced += ret;
922+
len -= ret;
923+
tail = pipe->tail;
924+
while (ret > 0) {
925+
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
926+
size_t seg = min_t(size_t, ret, buf->len);
927+
928+
buf->offset += seg;
929+
buf->len -= seg;
930+
ret -= seg;
931+
932+
if (!buf->len) {
933+
pipe_buf_release(pipe, buf);
934+
tail++;
935+
}
936+
}
937+
938+
if (tail != pipe->tail) {
939+
pipe->tail = tail;
940+
if (pipe->files)
941+
need_wakeup = true;
942+
}
943+
}
944+
945+
out:
946+
pipe_unlock(pipe);
947+
if (need_wakeup)
948+
wakeup_pipe_writers(pipe);
949+
return spliced ?: ret;
950+
}
951+
#endif
856952

857953
static int warn_unsupported(struct file *file, const char *op)
858954
{
@@ -873,6 +969,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
873969
return out->f_op->splice_write(pipe, out, ppos, len, flags);
874970
}
875971

972+
/*
973+
* Indicate to the caller that there was a premature EOF when reading from the
974+
* source and the caller didn't indicate they would be sending more data after
975+
* this.
976+
*/
977+
static void do_splice_eof(struct splice_desc *sd)
978+
{
979+
if (sd->splice_eof)
980+
sd->splice_eof(sd);
981+
}
982+
876983
/*
877984
* Attempt to initiate a splice from a file to a pipe.
878985
*/
@@ -956,13 +1063,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
9561063
*/
9571064
bytes = 0;
9581065
len = sd->total_len;
1066+
1067+
/* Don't block on output, we have to drain the direct pipe. */
9591068
flags = sd->flags;
1069+
sd->flags &= ~SPLICE_F_NONBLOCK;
9601070

9611071
/*
962-
* Don't block on output, we have to drain the direct pipe.
1072+
* We signal MORE until we've read sufficient data to fulfill the
1073+
* request and we keep signalling it if the caller set it.
9631074
*/
964-
sd->flags &= ~SPLICE_F_NONBLOCK;
9651075
more = sd->flags & SPLICE_F_MORE;
1076+
sd->flags |= SPLICE_F_MORE;
9661077

9671078
WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
9681079

@@ -972,20 +1083,18 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
9721083

9731084
ret = do_splice_to(in, &pos, pipe, len, flags);
9741085
if (unlikely(ret <= 0))
975-
goto out_release;
1086+
goto read_failure;
9761087

9771088
read_len = ret;
9781089
sd->total_len = read_len;
9791090

9801091
/*
981-
* If more data is pending, set SPLICE_F_MORE
982-
* If this is the last data and SPLICE_F_MORE was not set
983-
* initially, clears it.
1092+
* If we now have sufficient data to fulfill the request then
1093+
* we clear SPLICE_F_MORE if it was not set initially.
9841094
*/
985-
if (read_len < len)
986-
sd->flags |= SPLICE_F_MORE;
987-
else if (!more)
1095+
if (read_len >= len && !more)
9881096
sd->flags &= ~SPLICE_F_MORE;
1097+
9891098
/*
9901099
* NOTE: nonblocking mode only applies to the input. We
9911100
* must not do the output in nonblocking mode as then we
@@ -1012,6 +1121,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
10121121
file_accessed(in);
10131122
return bytes;
10141123

1124+
read_failure:
1125+
/*
1126+
* If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1127+
* "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1128+
* "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1129+
* least 1 byte *then* we will also do the ->splice_eof() call.
1130+
*/
1131+
if (ret == 0 && !more && len > 0 && bytes)
1132+
do_splice_eof(sd);
10151133
out_release:
10161134
/*
10171135
* If we did an incomplete transfer we must release
@@ -1040,6 +1158,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
10401158
sd->flags);
10411159
}
10421160

1161+
static void direct_file_splice_eof(struct splice_desc *sd)
1162+
{
1163+
struct file *file = sd->u.file;
1164+
1165+
if (file->f_op->splice_eof)
1166+
file->f_op->splice_eof(file);
1167+
}
1168+
10431169
/**
10441170
* do_splice_direct - splices data directly between two files
10451171
* @in: file to splice from
@@ -1065,6 +1191,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
10651191
.flags = flags,
10661192
.pos = *ppos,
10671193
.u.file = out,
1194+
.splice_eof = direct_file_splice_eof,
10681195
.opos = opos,
10691196
};
10701197
long ret;

include/linux/fs.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1796,6 +1796,7 @@ struct file_operations {
17961796
int (*flock) (struct file *, int, struct file_lock *);
17971797
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
17981798
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
1799+
void (*splice_eof)(struct file *file);
17991800
int (*setlease)(struct file *, long, struct file_lock **, void **);
18001801
long (*fallocate)(struct file *file, int mode, loff_t offset,
18011802
loff_t len);
@@ -2759,8 +2760,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *,
27592760
struct pipe_inode_info *, size_t, unsigned int);
27602761
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
27612762
struct file *, loff_t *, size_t, unsigned int);
2762-
extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
2763-
struct file *out, loff_t *, size_t len, unsigned int flags);
27642763
extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
27652764
loff_t *opos, size_t len, unsigned int flags);
27662765

include/linux/net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ struct proto_ops {
210210
int offset, size_t size, int flags);
211211
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
212212
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
213+
void (*splice_eof)(struct socket *sock);
213214
int (*set_peek_off)(struct sock *sk, int val);
214215
int (*peek_len)(struct socket *sock);
215216

include/linux/socket.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,9 @@ struct ucred {
339339
#endif
340340

341341
/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
342-
#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES)
342+
#define MSG_INTERNAL_SENDMSG_FLAGS \
343+
(MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_NOTLAST | \
344+
MSG_SENDPAGE_DECRYPTED)
343345

344346
/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
345347
#define SOL_IP 0

include/linux/splice.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ struct splice_desc {
3838
struct file *file; /* file to read/write */
3939
void *data; /* cookie */
4040
} u;
41+
void (*splice_eof)(struct splice_desc *sd); /* Unexpected EOF handler */
4142
loff_t pos; /* file position */
4243
loff_t *opos; /* sendfile: output position */
4344
size_t num_spliced; /* number of bytes already spliced */
@@ -84,6 +85,8 @@ extern long do_splice(struct file *in, loff_t *off_in,
8485

8586
extern long do_tee(struct file *in, struct file *out, size_t len,
8687
unsigned int flags);
88+
extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
89+
loff_t *ppos, size_t len, unsigned int flags);
8790

8891
/*
8992
* for dynamic pipe sizing

include/net/inet_common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock,
3535
struct sock *newsk);
3636
int inet_send_prepare(struct sock *sk);
3737
int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
38+
void inet_splice_eof(struct socket *sock);
3839
ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
3940
size_t size, int flags);
4041
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,

include/net/sock.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,7 @@ struct proto {
12791279
size_t len, int flags, int *addr_len);
12801280
int (*sendpage)(struct sock *sk, struct page *page,
12811281
int offset, size_t size, int flags);
1282+
void (*splice_eof)(struct socket *sock);
12821283
int (*bind)(struct sock *sk,
12831284
struct sockaddr *addr, int addr_len);
12841285
int (*bind_add)(struct sock *sk,

0 commit comments

Comments
 (0)