Skip to content

Commit 309c446

Browse files
committed
Merge branch 'tcp-zero-copy-receive'
Eric Dumazet says: ==================== tcp: add zero copy receive This patch series adds mmap() support to TCP sockets for RX zero copy. While the tcp_mmap() patch itself is quite small (~100 LOC), optimal support for asynchronous mmap() required better SO_RCVLOWAT behavior, and a test program to demonstrate how mmap() on TCP sockets can be used. Note that mmap() (and associated munmap()) calls add more pressure on the per-process VM semaphore, so they might not show a benefit for processes with a high number of threads. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 10b19ae + 192dc40 commit 309c446

File tree

9 files changed

+608
-7
lines changed

9 files changed

+608
-7
lines changed

include/linux/net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ struct proto_ops {
197197
int offset, size_t size, int flags);
198198
int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg,
199199
size_t size);
200+
int (*set_rcvlowat)(struct sock *sk, int val);
200201
};
201202

202203
#define DECLARE_SOCKADDR(type, dst, src) \

include/net/tcp.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,10 @@ void tcp_set_keepalive(struct sock *sk, int val);
402402
void tcp_syn_ack_timeout(const struct request_sock *req);
403403
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
404404
int flags, int *addr_len);
405+
int tcp_set_rcvlowat(struct sock *sk, int val);
406+
void tcp_data_ready(struct sock *sk);
407+
int tcp_mmap(struct file *file, struct socket *sock,
408+
struct vm_area_struct *vma);
405409
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
406410
struct tcp_options_received *opt_rx,
407411
int estab, struct tcp_fastopen_cookie *foc);

net/core/sock.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -905,7 +905,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
905905
case SO_RCVLOWAT:
906906
if (val < 0)
907907
val = INT_MAX;
908-
sk->sk_rcvlowat = val ? : 1;
908+
if (sock->ops->set_rcvlowat)
909+
ret = sock->ops->set_rcvlowat(sk, val);
910+
else
911+
sk->sk_rcvlowat = val ? : 1;
909912
break;
910913

911914
case SO_RCVTIMEO:

net/ipv4/af_inet.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = {
994994
.getsockopt = sock_common_getsockopt,
995995
.sendmsg = inet_sendmsg,
996996
.recvmsg = inet_recvmsg,
997-
.mmap = sock_no_mmap,
997+
.mmap = tcp_mmap,
998998
.sendpage = inet_sendpage,
999999
.splice_read = tcp_splice_read,
10001000
.read_sock = tcp_read_sock,
@@ -1006,6 +1006,7 @@ const struct proto_ops inet_stream_ops = {
10061006
.compat_getsockopt = compat_sock_common_getsockopt,
10071007
.compat_ioctl = inet_compat_ioctl,
10081008
#endif
1009+
.set_rcvlowat = tcp_set_rcvlowat,
10091010
};
10101011
EXPORT_SYMBOL(inet_stream_ops);
10111012

net/ipv4/tcp.c

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1701,6 +1701,144 @@ int tcp_peek_len(struct socket *sock)
17011701
}
17021702
EXPORT_SYMBOL(tcp_peek_len);
17031703

1704+
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1705+
/* tcp_set_rcvlowat - store a new receive low-water mark for a TCP socket.
 * @sk:  socket being configured
 * @val: requested low-water mark; 0 is stored as 1
 *
 * Stores the mark, re-evaluates readiness through tcp_data_ready(), and,
 * unless SOCK_RCVBUF_LOCK is set in sk_userlocks, may grow sk_rcvbuf
 * (never shrink it) towards twice the requested value, capped by
 * sysctl_tcp_rmem[2].
 *
 * Always returns 0.
 */
int tcp_set_rcvlowat(struct sock *sk, int val)
1706+
{
1707+
/* GNU "?:" shorthand: keep val when non-zero, otherwise store 1. */
sk->sk_rcvlowat = val ? : 1;
1708+
1709+
/* Check if we need to signal EPOLLIN right now */
1710+
tcp_data_ready(sk);
1711+
1712+
/* With SOCK_RCVBUF_LOCK set, sk_rcvbuf is left untouched. */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1713+
return 0;
1714+
1715+
/* val comes from user space and might be close to INT_MAX */
1716+
/* Target twice the low-water mark; a negative result after the shift is
 * treated as overflow and replaced by INT_MAX.
 * NOTE(review): overflowing a signed left shift is undefined in ISO C;
 * this presumably relies on the build's wrapping semantics - confirm.
 */
val <<= 1;
1717+
if (val < 0)
1718+
val = INT_MAX;
1719+
1720+
/* Never ask for more than the sysctl_tcp_rmem[2] limit. */
val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
1721+
/* Grow only: an already larger sk_rcvbuf is kept as is. */
if (val > sk->sk_rcvbuf) {
1722+
sk->sk_rcvbuf = val;
1723+
/* window_clamp is recomputed from the new buffer size. */
tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1724+
}
1725+
return 0;
1726+
}
1727+
EXPORT_SYMBOL(tcp_set_rcvlowat);
1728+
1729+
/* When user wants to mmap X pages, we first need to perform the mapping
1730+
* before freeing any skbs in receive queue, otherwise user would be unable
1731+
* to fallback to standard recvmsg(). This happens if some data in the
1732+
* requested block is not exactly fitting in a page.
1733+
*
1734+
* We only support order-0 pages for the moment.
1735+
* mmap() on TCP is very strict, there is no point
1736+
* trying to accommodate with pathological layouts.
1737+
*/
1738+
/* tcp_mmap - map already-received TCP payload pages into a user vma.
 * @file: not referenced in this function
 * @sock: socket whose receive queue supplies the pages
 * @vma:  destination mapping; must start at page offset 0, cover at least
 *        one page and must not have VM_WRITE set
 *
 * Collects one page per PAGE_SIZE of the mapping from skb page fragments
 * starting at tp->copied_seq, inserts them into @vma, then advances
 * copied_seq by the mapped amount.
 *
 * Returns 0 on success, otherwise:
 *   -EINVAL   vm_pgoff is non-zero or the mapping has no pages; urgent data
 *             lies inside the requested range; or the queued data is not
 *             laid out as whole PAGE_SIZE fragments at page offset 0
 *             (data in the skb head, a frag list, a partially consumed
 *             fragment, or a short fragment)
 *   -EPERM    VM_WRITE is set on the vma
 *   -ENOTCONN the socket is in TCP_LISTEN state
 *   -EIO      fewer than size bytes are queued and SOCK_DONE is set
 *   -EAGAIN   fewer than size bytes are queued and SOCK_DONE is not set
 *   -ENOMEM   the temporary page pointer array could not be allocated
 *   or the error returned by vm_insert_page().
 */
int tcp_mmap(struct file *file, struct socket *sock,
1739+
struct vm_area_struct *vma)
1740+
{
1741+
unsigned long size = vma->vm_end - vma->vm_start;
1742+
unsigned int nr_pages = size >> PAGE_SHIFT;
1743+
struct page **pages_array = NULL;
1744+
u32 seq, len, offset, nr = 0;
1745+
struct sock *sk = sock->sk;
1746+
const skb_frag_t *frags;
1747+
struct tcp_sock *tp;
1748+
struct sk_buff *skb;
1749+
int ret;
1750+
1751+
/* Only mappings at page offset 0 with at least one page are accepted. */
if (vma->vm_pgoff || !nr_pages)
1752+
return -EINVAL;
1753+
1754+
/* Writable mappings are refused. */
if (vma->vm_flags & VM_WRITE)
1755+
return -EPERM;
1756+
/* TODO: Maybe the following is not needed if pages are COW */
1757+
vma->vm_flags &= ~VM_MAYWRITE;
1758+
1759+
/* Everything from here to the out label runs with the socket locked. */
lock_sock(sk);
1760+
1761+
ret = -ENOTCONN;
1762+
if (sk->sk_state == TCP_LISTEN)
1763+
goto out;
1764+
1765+
sock_rps_record_flow(sk);
1766+
1767+
/* The whole mapping must be backed by data that is already queued. */
if (tcp_inq(sk) < size) {
1768+
ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
1769+
goto out;
1770+
}
1771+
tp = tcp_sk(sk);
1772+
/* seq tracks how far the mapping has progressed in sequence space. */
seq = tp->copied_seq;
1773+
/* Abort if urgent data is in the area */
1774+
if (unlikely(tp->urg_data)) {
1775+
u32 urg_offset = tp->urg_seq - seq;
1776+
1777+
ret = -EINVAL;
1778+
if (urg_offset < size)
1779+
goto out;
1780+
}
1781+
ret = -ENOMEM;
1782+
/* Scratch array holding one struct page pointer per page to map. */
pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
1783+
GFP_KERNEL);
1784+
if (!pages_array)
1785+
goto out;
1786+
/* Presumably returns the skb containing seq and stores the byte offset
 * within that skb in offset - confirm against tcp_recv_skb().
 * NOTE(review): the result is used below without a NULL check; this looks
 * like it relies on the tcp_inq() test above - confirm.
 */
skb = tcp_recv_skb(sk, seq, &offset);
1787+
/* Every layout check from here on fails with -EINVAL. */
ret = -EINVAL;
1788+
skb_start:
1789+
/* We do not support anything not in page frags */
1790+
/* Make offset relative to the start of the paged data; a negative value
 * means the read position is still inside the skb head.
 */
offset -= skb_headlen(skb);
1791+
if ((int)offset < 0)
1792+
goto out;
1793+
if (skb_has_frag_list(skb))
1794+
goto out;
1795+
/* len = paged bytes of this skb that remain from offset onwards. */
len = skb->data_len - offset;
1796+
frags = skb_shinfo(skb)->frags;
1797+
/* Step over fragments that lie entirely before offset; bail out if offset
 * lands inside a fragment.
 */
while (offset) {
1798+
if (frags->size > offset)
1799+
goto out;
1800+
offset -= frags->size;
1801+
frags++;
1802+
}
1803+
/* Collect page pointers; nr persists across the goto skb_start jumps. */
while (nr < nr_pages) {
1804+
if (len) {
1805+
/* Each fragment must be exactly one page starting at page offset 0. */
if (len < PAGE_SIZE)
1806+
goto out;
1807+
if (frags->size != PAGE_SIZE || frags->page_offset)
1808+
goto out;
1809+
pages_array[nr++] = skb_frag_page(frags);
1810+
frags++;
1811+
len -= PAGE_SIZE;
1812+
seq += PAGE_SIZE;
1813+
continue;
1814+
}
1815+
/* Current skb is exhausted: move to the next one and re-run the checks.
 * NOTE(review): skb->next is dereferenced without a check - confirm that
 * the tcp_inq() test above guarantees another skb is queued.
 */
skb = skb->next;
1816+
offset = seq - TCP_SKB_CB(skb)->seq;
1817+
goto skb_start;
1818+
}
1819+
/* OK, we have a full set of pages ready to be inserted into vma */
1820+
/* NOTE(review): on a mid-loop failure the pages inserted so far are not
 * removed here - confirm the caller tears the vma down.
 */
for (nr = 0; nr < nr_pages; nr++) {
1821+
ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
1822+
pages_array[nr]);
1823+
if (ret)
1824+
goto out;
1825+
}
1826+
/* operation is complete, we can 'consume' all skbs */
1827+
tp->copied_seq = seq;
1828+
tcp_rcv_space_adjust(sk);
1829+
1830+
/* Clean up data we have read: This will do ACK frames. */
1831+
/* Return value is ignored; presumably called for its side effect of
 * releasing fully consumed skbs - confirm against tcp_recv_skb().
 */
tcp_recv_skb(sk, seq, &offset);
1832+
tcp_cleanup_rbuf(sk, size);
1833+
1834+
ret = 0;
1835+
out:
1836+
release_sock(sk);
1837+
/* pages_array is still NULL on the early exits taken before allocation. */
kvfree(pages_array);
1838+
return ret;
1839+
}
1840+
EXPORT_SYMBOL(tcp_mmap);
1841+
17041842
static void tcp_update_recv_tstamps(struct sk_buff *skb,
17051843
struct scm_timestamping *tss)
17061844
{

net/ipv4/tcp_input.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4576,6 +4576,17 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
45764576

45774577
}
45784578

4579+
/* tcp_data_ready - invoke sk->sk_data_ready() only when enough data is queued.
 * @sk: socket to evaluate
 *
 * The callback is suppressed while fewer than sk_rcvlowat bytes are
 * available, unless SOCK_DONE is set on the socket.
 */
void tcp_data_ready(struct sock *sk)
4580+
{
4581+
const struct tcp_sock *tp = tcp_sk(sk);
4582+
/* Bytes between the read position (copied_seq) and rcv_nxt. */
int avail = tp->rcv_nxt - tp->copied_seq;
4583+
4584+
/* Below the low-water mark and SOCK_DONE not set: stay quiet. */
if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
4585+
return;
4586+
4587+
sk->sk_data_ready(sk);
4588+
}
4589+
45794590
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
45804591
{
45814592
struct tcp_sock *tp = tcp_sk(sk);
@@ -4633,7 +4644,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
46334644
if (eaten > 0)
46344645
kfree_skb_partial(skb, fragstolen);
46354646
if (!sock_flag(sk, SOCK_DEAD))
4636-
sk->sk_data_ready(sk);
4647+
tcp_data_ready(sk);
46374648
return;
46384649
}
46394650

@@ -5026,9 +5037,12 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
50265037
/* More than one full frame received... */
50275038
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
50285039
/* ... and right edge of window advances far enough.
5029-
* (tcp_recvmsg() will send ACK otherwise). Or...
5040+
* (tcp_recvmsg() will send ACK otherwise).
5041+
* If application uses SO_RCVLOWAT, we want send ack now if
5042+
* we have not received enough bytes to satisfy the condition.
50305043
*/
5031-
__tcp_select_window(sk) >= tp->rcv_wnd) ||
5044+
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
5045+
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
50325046
/* We ACK each frame or... */
50335047
tcp_in_quickack_mode(sk) ||
50345048
/* We have out of order data. */
@@ -5431,7 +5445,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
54315445
no_ack:
54325446
if (eaten)
54335447
kfree_skb_partial(skb, fragstolen);
5434-
sk->sk_data_ready(sk);
5448+
tcp_data_ready(sk);
54355449
return;
54365450
}
54375451
}

net/ipv6/af_inet6.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = {
579579
.getsockopt = sock_common_getsockopt, /* ok */
580580
.sendmsg = inet_sendmsg, /* ok */
581581
.recvmsg = inet_recvmsg, /* ok */
582-
.mmap = sock_no_mmap,
582+
.mmap = tcp_mmap,
583583
.sendpage = inet_sendpage,
584584
.sendmsg_locked = tcp_sendmsg_locked,
585585
.sendpage_locked = tcp_sendpage_locked,
@@ -590,6 +590,7 @@ const struct proto_ops inet6_stream_ops = {
590590
.compat_setsockopt = compat_sock_common_setsockopt,
591591
.compat_getsockopt = compat_sock_common_getsockopt,
592592
#endif
593+
.set_rcvlowat = tcp_set_rcvlowat,
593594
};
594595

595596
const struct proto_ops inet6_dgram_ops = {

tools/testing/selftests/net/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetl
88
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh
99
TEST_GEN_FILES = socket
1010
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
11+
TEST_GEN_FILES += tcp_mmap
1112
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
1213
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
1314

1415
include ../lib.mk
1516

1617
$(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
18+
$(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread

0 commit comments

Comments
 (0)