Skip to content

Commit 9f5afea

Browse files
Yaogong Wang authored and davem330 committed
tcp: use an RB tree for ooo receive queue
Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang <[email protected]> Signed-off-by: Eric Dumazet <[email protected]> Cc: Yuchung Cheng <[email protected]> Cc: Neal Cardwell <[email protected]> Cc: Ilpo Järvinen <[email protected]> Acked-By: Ilpo Järvinen <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 3b61075 commit 9f5afea

File tree

8 files changed

+218
-149
lines changed

8 files changed

+218
-149
lines changed

include/linux/skbuff.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
24022402
kfree_skb(skb);
24032403
}
24042404

2405+
void skb_rbtree_purge(struct rb_root *root);
2406+
24052407
void *netdev_alloc_frag(unsigned int fragsz);
24062408

24072409
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,

include/linux/tcp.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,10 +281,9 @@ struct tcp_sock {
281281
struct sk_buff* lost_skb_hint;
282282
struct sk_buff *retransmit_skb_hint;
283283

284-
/* OOO segments go in this list. Note that socket lock must be held,
285-
* as we do not use sk_buff_head lock.
286-
*/
287-
struct sk_buff_head out_of_order_queue;
284+
/* OOO segments go in this rbtree. Socket lock must be held. */
285+
struct rb_root out_of_order_queue;
286+
struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */
288287

289288
/* SACKs data, these 2 need to be together (see tcp_options_write) */
290289
struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */

include/net/tcp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,7 @@ static inline void tcp_fast_path_check(struct sock *sk)
640640
{
641641
struct tcp_sock *tp = tcp_sk(sk);
642642

643-
if (skb_queue_empty(&tp->out_of_order_queue) &&
643+
if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
644644
tp->rcv_wnd &&
645645
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
646646
!tp->urg_data)

net/core/skbuff.c

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2444,6 +2444,25 @@ void skb_queue_purge(struct sk_buff_head *list)
24442444
}
24452445
EXPORT_SYMBOL(skb_queue_purge);
24462446

2447+
/**
2448+
* skb_rbtree_purge - empty a skb rbtree
2449+
* @root: root of the rbtree to empty
2450+
*
2451+
* Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
2452+
* the list and one reference dropped. This function does not take
2453+
* any lock. Synchronization should be handled by the caller (e.g., TCP
2454+
* out-of-order queue is protected by the socket lock).
2455+
*/
2456+
void skb_rbtree_purge(struct rb_root *root)
2457+
{
2458+
struct sk_buff *skb, *next;
2459+
2460+
rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode)
2461+
kfree_skb(skb);
2462+
2463+
*root = RB_ROOT;
2464+
}
2465+
24472466
/**
24482467
* skb_queue_head - queue a buffer at the list head
24492468
* @list: list to use

net/ipv4/tcp.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ void tcp_init_sock(struct sock *sk)
380380
struct inet_connection_sock *icsk = inet_csk(sk);
381381
struct tcp_sock *tp = tcp_sk(sk);
382382

383-
__skb_queue_head_init(&tp->out_of_order_queue);
383+
tp->out_of_order_queue = RB_ROOT;
384384
tcp_init_xmit_timers(sk);
385385
tcp_prequeue_init(tp);
386386
INIT_LIST_HEAD(&tp->tsq_node);
@@ -2243,7 +2243,7 @@ int tcp_disconnect(struct sock *sk, int flags)
22432243
tcp_clear_xmit_timers(sk);
22442244
__skb_queue_purge(&sk->sk_receive_queue);
22452245
tcp_write_queue_purge(sk);
2246-
__skb_queue_purge(&tp->out_of_order_queue);
2246+
skb_rbtree_purge(&tp->out_of_order_queue);
22472247

22482248
inet->inet_dport = 0;
22492249

0 commit comments

Comments (0)