Skip to content

Commit 9c55e01

Browse files
Jens Axboe authored and davem330 committed
[TCP]: Splice receive support.
Support for network splice receive. Signed-off-by: Jens Axboe <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent bbdfc2f commit 9c55e01

File tree

7 files changed

+401
-0
lines changed

7 files changed

+401
-0
lines changed

include/linux/net.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <asm/socket.h>
2323

2424
struct poll_table_struct;
25+
struct pipe_inode_info;
2526
struct inode;
2627
struct net;
2728

@@ -172,6 +173,8 @@ struct proto_ops {
172173
struct vm_area_struct * vma);
173174
ssize_t (*sendpage) (struct socket *sock, struct page *page,
174175
int offset, size_t size, int flags);
176+
ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
177+
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
175178
};
176179

177180
struct net_proto_family {

include/linux/skbuff.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595

9696
struct net_device;
9797
struct scatterlist;
98+
struct pipe_inode_info;
9899

99100
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
100101
struct nf_conntrack {
@@ -1559,6 +1560,11 @@ extern int skb_store_bits(struct sk_buff *skb, int offset,
15591560
extern __wsum skb_copy_and_csum_bits(const struct sk_buff *skb,
15601561
int offset, u8 *to, int len,
15611562
__wsum csum);
1563+
extern int skb_splice_bits(struct sk_buff *skb,
1564+
unsigned int offset,
1565+
struct pipe_inode_info *pipe,
1566+
unsigned int len,
1567+
unsigned int flags);
15621568
extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
15631569
extern void skb_split(struct sk_buff *skb,
15641570
struct sk_buff *skb1, const u32 len);

include/net/tcp.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,9 @@ extern int tcp_twsk_unique(struct sock *sk,
309309

310310
extern void tcp_twsk_destructor(struct sock *sk);
311311

312+
extern ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
313+
struct pipe_inode_info *pipe, size_t len, unsigned int flags);
314+
312315
static inline void tcp_dec_quickack_mode(struct sock *sk,
313316
const unsigned int pkts)
314317
{

net/core/skbuff.c

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
#endif
5353
#include <linux/string.h>
5454
#include <linux/skbuff.h>
55+
#include <linux/splice.h>
5556
#include <linux/cache.h>
5657
#include <linux/rtnetlink.h>
5758
#include <linux/init.h>
@@ -71,6 +72,40 @@
7172
static struct kmem_cache *skbuff_head_cache __read_mostly;
7273
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
7374

75+
static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
76+
struct pipe_buffer *buf)
77+
{
78+
struct sk_buff *skb = (struct sk_buff *) buf->private;
79+
80+
kfree_skb(skb);
81+
}
82+
83+
static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
84+
struct pipe_buffer *buf)
85+
{
86+
struct sk_buff *skb = (struct sk_buff *) buf->private;
87+
88+
skb_get(skb);
89+
}
90+
91+
/*
 * ->steal hook of sock_pipe_buf_ops. Neither argument is examined; the
 * result is always 1.
 * NOTE(review): presumably a non-zero return tells the pipe code that the
 * buffer cannot be stolen - confirm against pipe_buf_operations.
 */
static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	return 1;
}
96+
97+
98+
/* Pipe buffer operations for a socket. */
99+
static struct pipe_buf_operations sock_pipe_buf_ops = {
100+
.can_merge = 0,
101+
.map = generic_pipe_buf_map,
102+
.unmap = generic_pipe_buf_unmap,
103+
.confirm = generic_pipe_buf_confirm,
104+
.release = sock_pipe_buf_release,
105+
.steal = sock_pipe_buf_steal,
106+
.get = sock_pipe_buf_get,
107+
};
108+
74109
/*
75110
* Keep out-of-line to prevent kernel bloat.
76111
* __builtin_return_address is not used because it is not always
@@ -1122,6 +1157,217 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
11221157
return -EFAULT;
11231158
}
11241159

1160+
/*
1161+
* Callback from splice_to_pipe(), if we need to release some pages
1162+
* at the end of the spd in case we error'ed out in filling the pipe.
1163+
*/
1164+
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
1165+
{
1166+
struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
1167+
1168+
kfree_skb(skb);
1169+
}
1170+
1171+
/*
1172+
* Fill page/offset/length into spd, if it can hold more pages.
1173+
*/
1174+
static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
1175+
unsigned int len, unsigned int offset,
1176+
struct sk_buff *skb)
1177+
{
1178+
if (unlikely(spd->nr_pages == PIPE_BUFFERS))
1179+
return 1;
1180+
1181+
spd->pages[spd->nr_pages] = page;
1182+
spd->partial[spd->nr_pages].len = len;
1183+
spd->partial[spd->nr_pages].offset = offset;
1184+
spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
1185+
spd->nr_pages++;
1186+
return 0;
1187+
}
1188+
1189+
/*
1190+
* Map linear and fragment data from the skb to spd. Returns number of
1191+
* pages mapped.
1192+
*/
1193+
static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
1194+
unsigned int *total_len,
1195+
struct splice_pipe_desc *spd)
1196+
{
1197+
unsigned int nr_pages = spd->nr_pages;
1198+
unsigned int poff, plen, len, toff, tlen;
1199+
int headlen, seg;
1200+
1201+
toff = *offset;
1202+
tlen = *total_len;
1203+
if (!tlen)
1204+
goto err;
1205+
1206+
/*
1207+
* if the offset is greater than the linear part, go directly to
1208+
* the fragments.
1209+
*/
1210+
headlen = skb_headlen(skb);
1211+
if (toff >= headlen) {
1212+
toff -= headlen;
1213+
goto map_frag;
1214+
}
1215+
1216+
/*
1217+
* first map the linear region into the pages/partial map, skipping
1218+
* any potential initial offset.
1219+
*/
1220+
len = 0;
1221+
while (len < headlen) {
1222+
void *p = skb->data + len;
1223+
1224+
poff = (unsigned long) p & (PAGE_SIZE - 1);
1225+
plen = min_t(unsigned int, headlen - len, PAGE_SIZE - poff);
1226+
len += plen;
1227+
1228+
if (toff) {
1229+
if (plen <= toff) {
1230+
toff -= plen;
1231+
continue;
1232+
}
1233+
plen -= toff;
1234+
poff += toff;
1235+
toff = 0;
1236+
}
1237+
1238+
plen = min(plen, tlen);
1239+
if (!plen)
1240+
break;
1241+
1242+
/*
1243+
* just jump directly to update and return, no point
1244+
* in going over fragments when the output is full.
1245+
*/
1246+
if (spd_fill_page(spd, virt_to_page(p), plen, poff, skb))
1247+
goto done;
1248+
1249+
tlen -= plen;
1250+
}
1251+
1252+
/*
1253+
* then map the fragments
1254+
*/
1255+
map_frag:
1256+
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
1257+
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
1258+
1259+
plen = f->size;
1260+
poff = f->page_offset;
1261+
1262+
if (toff) {
1263+
if (plen <= toff) {
1264+
toff -= plen;
1265+
continue;
1266+
}
1267+
plen -= toff;
1268+
poff += toff;
1269+
toff = 0;
1270+
}
1271+
1272+
plen = min(plen, tlen);
1273+
if (!plen)
1274+
break;
1275+
1276+
if (spd_fill_page(spd, f->page, plen, poff, skb))
1277+
break;
1278+
1279+
tlen -= plen;
1280+
}
1281+
1282+
done:
1283+
if (spd->nr_pages - nr_pages) {
1284+
*offset = 0;
1285+
*total_len = tlen;
1286+
return 0;
1287+
}
1288+
err:
1289+
return 1;
1290+
}
1291+
1292+
/*
1293+
* Map data from the skb to a pipe. Should handle both the linear part,
1294+
* the fragments, and the frag list. It does NOT handle frag lists within
1295+
* the frag list, if such a thing exists. We'd probably need to recurse to
1296+
* handle that cleanly.
1297+
*/
1298+
int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
1299+
struct pipe_inode_info *pipe, unsigned int tlen,
1300+
unsigned int flags)
1301+
{
1302+
struct partial_page partial[PIPE_BUFFERS];
1303+
struct page *pages[PIPE_BUFFERS];
1304+
struct splice_pipe_desc spd = {
1305+
.pages = pages,
1306+
.partial = partial,
1307+
.flags = flags,
1308+
.ops = &sock_pipe_buf_ops,
1309+
.spd_release = sock_spd_release,
1310+
};
1311+
struct sk_buff *skb;
1312+
1313+
/*
1314+
* I'd love to avoid the clone here, but tcp_read_sock()
1315+
* ignores reference counts and unconditonally kills the sk_buff
1316+
* on return from the actor.
1317+
*/
1318+
skb = skb_clone(__skb, GFP_KERNEL);
1319+
if (unlikely(!skb))
1320+
return -ENOMEM;
1321+
1322+
/*
1323+
* __skb_splice_bits() only fails if the output has no room left,
1324+
* so no point in going over the frag_list for the error case.
1325+
*/
1326+
if (__skb_splice_bits(skb, &offset, &tlen, &spd))
1327+
goto done;
1328+
else if (!tlen)
1329+
goto done;
1330+
1331+
/*
1332+
* now see if we have a frag_list to map
1333+
*/
1334+
if (skb_shinfo(skb)->frag_list) {
1335+
struct sk_buff *list = skb_shinfo(skb)->frag_list;
1336+
1337+
for (; list && tlen; list = list->next) {
1338+
if (__skb_splice_bits(list, &offset, &tlen, &spd))
1339+
break;
1340+
}
1341+
}
1342+
1343+
done:
1344+
/*
1345+
* drop our reference to the clone, the pipe consumption will
1346+
* drop the rest.
1347+
*/
1348+
kfree_skb(skb);
1349+
1350+
if (spd.nr_pages) {
1351+
int ret;
1352+
1353+
/*
1354+
* Drop the socket lock, otherwise we have reverse
1355+
* locking dependencies between sk_lock and i_mutex
1356+
* here as compared to sendfile(). We enter here
1357+
* with the socket lock held, and splice_to_pipe() will
1358+
* grab the pipe inode lock. For sendfile() emulation,
1359+
* we call into ->sendpage() with the i_mutex lock held
1360+
* and networking will grab the socket lock.
1361+
*/
1362+
release_sock(__skb->sk);
1363+
ret = splice_to_pipe(pipe, &spd);
1364+
lock_sock(__skb->sk);
1365+
return ret;
1366+
}
1367+
1368+
return 0;
1369+
}
1370+
11251371
/**
11261372
* skb_store_bits - store bits from kernel buffer to skb
11271373
* @skb: destination buffer

net/ipv4/af_inet.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,7 @@ const struct proto_ops inet_stream_ops = {
838838
.recvmsg = sock_common_recvmsg,
839839
.mmap = sock_no_mmap,
840840
.sendpage = tcp_sendpage,
841+
.splice_read = tcp_splice_read,
841842
#ifdef CONFIG_COMPAT
842843
.compat_setsockopt = compat_sock_common_setsockopt,
843844
.compat_getsockopt = compat_sock_common_getsockopt,

0 commit comments

Comments
 (0)