Skip to content

Commit 4f693b5

Browse files
Eric Dumazet authored and davem330 committed
tcp: implement coalescing on backlog queue
In case GRO is not as efficient as it should be or disabled, we might have a user thread trapped in __release_sock() while softirq handler flood packets up to the point we have to drop. This patch balances work done from user thread and softirq, to give more chances to __release_sock() to complete its work before new packets are added the the backlog. This also helps if we receive many ACK packets, since GRO does not aggregate them. This patch brings ~60% throughput increase on a receiver without GRO, but the spectacular gain is really on 1000x release_sock() latency reduction I have measured. Signed-off-by: Eric Dumazet <[email protected]> Cc: Neal Cardwell <[email protected]> Cc: Yuchung Cheng <[email protected]> Acked-by: Neal Cardwell <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 85bdf7d commit 4f693b5

File tree

3 files changed

+88
-6
lines changed

3 files changed

+88
-6
lines changed

include/uapi/linux/snmp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ enum
243243
LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */
244244
LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */
245245
LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */
246+
LINUX_MIB_TCPBACKLOGCOALESCE, /* TCPBacklogCoalesce */
246247
LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */
247248
LINUX_MIB_TCPOFODROP, /* TCPOFODrop */
248249
LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */

net/ipv4/proc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
219219
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
220220
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
221221
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
222+
SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
222223
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
223224
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
224225
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),

net/ipv4/tcp_ipv4.c

Lines changed: 86 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
16191619
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
16201620
{
16211621
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1622-
1623-
/* Only socket owner can try to collapse/prune rx queues
1624-
* to reduce memory overhead, so add a little headroom here.
1625-
* Few sockets backlog are possibly concurrently non empty.
1626-
*/
1627-
limit += 64*1024;
1622+
struct skb_shared_info *shinfo;
1623+
const struct tcphdr *th;
1624+
struct tcphdr *thtail;
1625+
struct sk_buff *tail;
1626+
unsigned int hdrlen;
1627+
bool fragstolen;
1628+
u32 gso_segs;
1629+
int delta;
16281630

16291631
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
16301632
* we can fix skb->truesize to its real value to avoid future drops.
@@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
16361638

16371639
skb_dst_drop(skb);
16381640

1641+
if (unlikely(tcp_checksum_complete(skb))) {
1642+
bh_unlock_sock(sk);
1643+
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1644+
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1645+
return true;
1646+
}
1647+
1648+
/* Attempt coalescing to last skb in backlog, even if we are
1649+
* above the limits.
1650+
* This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1651+
*/
1652+
th = (const struct tcphdr *)skb->data;
1653+
hdrlen = th->doff * 4;
1654+
shinfo = skb_shinfo(skb);
1655+
1656+
if (!shinfo->gso_size)
1657+
shinfo->gso_size = skb->len - hdrlen;
1658+
1659+
if (!shinfo->gso_segs)
1660+
shinfo->gso_segs = 1;
1661+
1662+
tail = sk->sk_backlog.tail;
1663+
if (!tail)
1664+
goto no_coalesce;
1665+
thtail = (struct tcphdr *)tail->data;
1666+
1667+
if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1668+
TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1669+
((TCP_SKB_CB(tail)->tcp_flags |
1670+
TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
1671+
((TCP_SKB_CB(tail)->tcp_flags ^
1672+
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1673+
#ifdef CONFIG_TLS_DEVICE
1674+
tail->decrypted != skb->decrypted ||
1675+
#endif
1676+
thtail->doff != th->doff ||
1677+
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1678+
goto no_coalesce;
1679+
1680+
__skb_pull(skb, hdrlen);
1681+
if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1682+
thtail->window = th->window;
1683+
1684+
TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1685+
1686+
if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
1687+
TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1688+
1689+
TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1690+
1691+
if (TCP_SKB_CB(skb)->has_rxtstamp) {
1692+
TCP_SKB_CB(tail)->has_rxtstamp = true;
1693+
tail->tstamp = skb->tstamp;
1694+
skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1695+
}
1696+
1697+
/* Not as strict as GRO. We only need to carry mss max value */
1698+
skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
1699+
skb_shinfo(tail)->gso_size);
1700+
1701+
gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
1702+
skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
1703+
1704+
sk->sk_backlog.len += delta;
1705+
__NET_INC_STATS(sock_net(sk),
1706+
LINUX_MIB_TCPBACKLOGCOALESCE);
1707+
kfree_skb_partial(skb, fragstolen);
1708+
return false;
1709+
}
1710+
__skb_push(skb, hdrlen);
1711+
1712+
no_coalesce:
1713+
/* Only socket owner can try to collapse/prune rx queues
1714+
* to reduce memory overhead, so add a little headroom here.
1715+
* Few sockets backlog are possibly concurrently non empty.
1716+
*/
1717+
limit += 64*1024;
1718+
16391719
if (unlikely(sk_add_backlog(sk, skb, limit))) {
16401720
bh_unlock_sock(sk);
16411721
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);

0 commit comments

Comments
 (0)