
Commit 75c119a

Eric Dumazet authored and davem330 committed
tcp: implement rb-tree based retransmit queue
Using a linear list to store all skbs in write queue has been okay
for quite a while : O(N) is not too bad when N < 500.

Things get messy when N is of the order of 100,000 : Modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

40 ns per cache line miss means a full scan can use 4 ms,
blowing away CPU caches.

SACK processing often can use various hints to avoid parsing
whole retransmit queue. But with high packet losses and/or high
reordering, hints no longer work.

Sender has to process thousands of unfriendly SACKs, accumulating
a huge socket backlog, burning a cpu and massively dropping packets.

Using an rb-tree for retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time
to be more resistant and say no to quadratic behavior.

1) RTX queue is no longer part of the write queue : already sent skbs
are stored in one rb-tree.

2) Since reaching the head of write queue no longer needs
sk->sk_send_head, we added a union of sk_send_head and tcp_rtx_queue.

Tested:

On receiver :
netem on ingress : delay 150ms 200us loss 1
GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87
351.48
339.59
338.62
306.72
204.07
304.93
291.88
202.47
176.88
   2840

After patch:

1700.83
2207.98
2070.17
1544.26
2114.76
2124.89
1693.14
1080.91
2216.82
1299.94
  18053

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent f331981 commit 75c119a
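
A rough feel for why the rb-tree matters for SACK processing: with already-sent skbs keyed by their starting sequence number, the stack can seek to the segment covering a SACKed sequence in O(log N) instead of scanning a linear queue. The helper below is purely illustrative and is not part of this commit; it assumes the kernel-side TCP_SKB_CB(), after() and rb_to_skb() helpers.

/* Hypothetical sketch, not from this patch: binary search the rtx
 * rb-tree (ordered by TCP_SKB_CB(skb)->seq) for the first skb whose
 * data extends past 'seq'.  O(log N) versus an O(N) list walk.
 */
static struct sk_buff *rtx_queue_lookup(const struct sock *sk, u32 seq)
{
	struct rb_node *node = sk->tcp_rtx_queue.rb_node;
	struct sk_buff *best = NULL;

	while (node) {
		struct sk_buff *skb = rb_to_skb(node);

		if (after(TCP_SKB_CB(skb)->end_seq, seq)) {
			best = skb;		/* candidate; look left for an earlier match */
			node = node->rb_left;
		} else {
			node = node->rb_right;	/* entirely below seq; look right */
		}
	}
	return best;
}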

7 files changed (+245, -188 lines)

include/net/sock.h

Lines changed: 5 additions & 2 deletions
@@ -60,7 +60,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/cgroup-defs.h>
-
+#include <linux/rbtree.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
 #include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
 	int			sk_wmem_queued;
 	refcount_t		sk_wmem_alloc;
 	unsigned long		sk_tsq_flags;
-	struct sk_buff		*sk_send_head;
+	union {
+		struct sk_buff	*sk_send_head;
+		struct rb_root	tcp_rtx_queue;
+	};
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;

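On the union above: sk_send_head was a single pointer and struct rb_root is also a single pointer, so overlaying them keeps struct sock the same size. A minimal user-space sketch (hypothetical stand-in types, not kernel code) that makes the size argument concrete:

#include <stdio.h>

struct rb_node;                          /* opaque, as in <linux/rbtree.h> */
struct rb_root { struct rb_node *rb_node; };
struct sk_buff;                          /* opaque */

union send_head_or_rtx_queue {
	struct sk_buff *sk_send_head;    /* old meaning: next skb to send */
	struct rb_root tcp_rtx_queue;    /* new meaning: root of sent-skb rb-tree */
};

int main(void)
{
	/* Both members are one pointer wide, so the union adds no bytes. */
	printf("union: %zu bytes, pointer: %zu bytes\n",
	       sizeof(union send_head_or_rtx_queue), sizeof(struct sk_buff *));
	return 0;
}
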
include/net/tcp.h

Lines changed: 47 additions & 42 deletions
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+enum tcp_queue {
+	TCP_FRAG_IN_WRITE_QUEUE,
+	TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+		 struct sk_buff *skb, u32 len,
+		 unsigned int mss_now, gfp_t gfp);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
@@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
 
 void tcp_write_queue_purge(struct sock *sk);
 
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
+{
+	return skb_rb_first(&sk->tcp_rtx_queue);
+}
+
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
 {
 	return skb_peek(&sk->sk_write_queue);
@@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
 	return skb_queue_prev(&sk->sk_write_queue, skb);
 }
 
-#define tcp_for_write_queue(skb, sk)					\
-	skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk)				\
-	skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
 	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 static inline struct sk_buff *tcp_send_head(const struct sock *sk)
 {
-	return sk->sk_send_head;
+	return skb_peek(&sk->sk_write_queue);
 }
 
 static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
 	return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-	if (tcp_skb_is_last(sk, skb))
-		sk->sk_send_head = NULL;
-	else
-		sk->sk_send_head = tcp_write_queue_next(sk, skb);
+	return skb_queue_empty(&sk->sk_write_queue);
+}
+
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
+{
+	return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+	return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-	if (sk->sk_send_head == skb_unlinked) {
-		sk->sk_send_head = NULL;
+	if (tcp_write_queue_empty(sk))
 		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-	}
+
 	if (tcp_sk(sk)->highest_sack == skb_unlinked)
 		tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-	sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
 	__tcp_add_write_queue_tail(sk, skb);
 
 	/* Queue it, remembering where we must start sending. */
-	if (sk->sk_send_head == NULL) {
-		sk->sk_send_head = skb;
+	if (sk->sk_write_queue.next == skb) {
 		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 		if (tcp_sk(sk)->highest_sack == NULL)
@@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
 	__skb_queue_head(&sk->sk_write_queue, skb);
 }
 
-/* Insert buff after skb on the write queue of sk.  */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
-						struct sk_buff *buff,
-						struct sock *sk)
-{
-	__skb_queue_after(&sk->sk_write_queue, skb, buff);
-}
-
 /* Insert new before skb on the write queue of sk.  */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
 						  struct sk_buff *skb,
 						  struct sock *sk)
 {
 	__skb_queue_before(&sk->sk_write_queue, skb, new);
-
-	if (sk->sk_send_head == skb)
-		sk->sk_send_head = new;
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
-	list_del(&skb->tcp_tsorted_anchor);
-	tcp_skb_tsorted_anchor_cleanup(skb);
 	__skb_unlink(skb, &sk->sk_write_queue);
 }
 
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
 {
-	return skb_queue_empty(&sk->sk_write_queue);
+	tcp_skb_tsorted_anchor_cleanup(skb);
+	rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+	list_del(&skb->tcp_tsorted_anchor);
+	tcp_rtx_queue_unlink(skb, sk);
+	sk_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
 
 static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
 {
-	tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
-						tcp_write_queue_next(sk, skb);
+	struct sk_buff *next = skb_rb_next(skb);
+
+	tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
 
 static inline void tcp_highest_sack_reset(struct sock *sk)
 {
-	tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+	struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+	tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
 }
 
 /* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
-	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	const struct sk_buff *skb = tcp_rtx_queue_head(sk);
 	u32 rto = inet_csk(sk)->icsk_rto;
 	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
 

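tcp_rbtree_insert() is only declared in the include/net/tcp.h hunk above; its definition lives elsewhere in the patch, outside the hunks shown here. A sketch of what such an insert helper would look like, assuming skbs are keyed by TCP_SKB_CB(skb)->seq and using the standard <linux/rbtree.h> link/insert pattern:

/* Illustrative sketch of the declared helper; not copied from the patch.
 * Keeping skbs ordered by starting sequence number is what makes
 * sequence-based lookups in the retransmit queue O(log N).
 */
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct sk_buff *skb1;

	while (*p) {
		parent = *p;
		skb1 = rb_to_skb(parent);
		if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&skb->rbnode, parent, p);
	rb_insert_color(&skb->rbnode, root);
}
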
net/ipv4/tcp.c

Lines changed: 32 additions & 9 deletions
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->out_of_order_queue = RB_ROOT;
+	sk->tcp_rtx_queue = RB_ROOT;
 	tcp_init_xmit_timers(sk);
 	INIT_LIST_HEAD(&tp->tsq_node);
 	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 
-	if (!tcp_send_head(sk))
-		return;
-
 	skb = tcp_write_queue_tail(sk);
+	if (!skb)
+		return;
 	if (!(flags & MSG_MORE) || forced_push(tp))
 		tcp_mark_push(tp, skb);
 
@@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		if (!skb || (copy = size_goal - skb->len) <= 0 ||
 		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-						  skb_queue_empty(&sk->sk_write_queue));
+						  tcp_rtx_and_write_queues_empty(sk));
 			if (!skb)
 				goto wait_for_memory;
 
@@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+		skb = tcp_write_queue_tail(sk);
 		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
 		if (!uarg) {
 			err = -ENOBUFS;
@@ -1275,7 +1275,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		int max = size_goal;
 
 		skb = tcp_write_queue_tail(sk);
-		if (tcp_send_head(sk)) {
+		if (skb) {
 			if (skb->ip_summed == CHECKSUM_NONE)
 				max = mss_now;
 			copy = max - skb->len;
@@ -1295,7 +1295,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				process_backlog = false;
 				goto restart;
 			}
-			first_skb = skb_queue_empty(&sk->sk_write_queue);
+			first_skb = tcp_rtx_and_write_queues_empty(sk);
 			skb = sk_stream_alloc_skb(sk,
 						  select_size(sk, sg, first_skb),
 						  sk->sk_allocation,
@@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 
 	/* XXX -- need to support SO_PEEK_OFF */
 
+	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+		if (err)
+			return err;
+		copied += skb->len;
+	}
+
 	skb_queue_walk(&sk->sk_write_queue, skb) {
 		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
@@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state)
 	       TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		/* Since we are deleting whole queue, no need to
+		 * list_del(&skb->tcp_tsorted_anchor)
+		 */
+		tcp_rtx_queue_unlink(skb, sk);
+		sk_wmem_free_skb(sk, skb);
+	}
+}
+
 void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
@@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk)
 		tcp_skb_tsorted_anchor_cleanup(skb);
 		sk_wmem_free_skb(sk, skb);
 	}
+	tcp_rtx_queue_purge(sk);
 	INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
 	 * issue in __tcp_select_window()
 	 */
 	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-	tcp_init_send_head(sk);
 	memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
 	__sk_dst_reset(sk);
 	dst_release(sk->sk_rx_dst);

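The hunks above do not show the transmit path itself, but the flow implied by the new helpers is: a segment waits on the linear sk_write_queue until it is transmitted, and from then on lives in the tcp_rtx_queue rb-tree. An illustrative sketch (hypothetical function, not part of the hunks shown here) of that hand-off, using only helpers introduced by this patch:

/* Hypothetical sketch of the hand-off performed when new data is sent:
 * the skb leaves the linear write queue and is inserted into the
 * rb-tree retransmit queue, keyed by sequence number.
 */
static void hypothetical_on_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	/* skb was the head of sk->sk_write_queue, i.e. tcp_send_head(sk) */
	tcp_unlink_write_queue(skb, sk);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
}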