Commit fa0f527
Peter Oskolkov authored and davem330 committed
ip: use rb trees for IP frag queue.
Similar to TCP OOO RX queue, it makes sense to use rb trees to store
IP fragments, so that OOO fragments are inserted faster.

Tested:

- a follow-up patch contains a rather comprehensive ip defrag
  self-test (functional)
- ran neper `udp_stream -c -H <host> -F 100 -l 300 -T 20`:

      netstat --statistics
      Ip:
          282078937 total packets received
          0 forwarded
          0 incoming packets discarded
          946760 incoming packets delivered
          18743456 requests sent out
          101 fragments dropped after timeout
          282077129 reassemblies required
          944952 packets reassembled ok
          262734239 packet reassembles failed

  (The numbers/stats above are somewhat better re: reassemblies
  vs a kernel without this patchset. More comprehensive performance
  testing TBD).

Reported-by: Jann Horn <[email protected]>
Reported-by: Juha-Matti Tilli <[email protected]>
Suggested-by: Eric Dumazet <[email protected]>
Signed-off-by: Peter Oskolkov <[email protected]>
Signed-off-by: Eric Dumazet <[email protected]>
Cc: Florian Westphal <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent 385114d commit fa0f527
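To make the core idea concrete, here is a small user-space sketch (not kernel code) of the insertion discipline the commit adopts: fragments are keyed by their byte offset in a search tree, so an out-of-order fragment is placed with a logarithmic descent rather than an O(n) walk of a linked list, and any overlap is rejected outright (the kernel then drops the whole queue, per RFC 5722). The `struct frag` type and `frag_insert()` helper are invented for illustration, and a plain unbalanced BST stands in for the kernel's `struct rb_root`/`rb_node` machinery.

```c
/* User-space sketch of the insertion rule (NOT kernel code). */
#include <stdio.h>
#include <stdlib.h>

struct frag {
	unsigned int offset, end;	/* this fragment covers [offset, end) */
	struct frag *left, *right;
};

/* Returns 0 if [offset, end) was inserted, -1 on overlap (or OOM). */
static int frag_insert(struct frag **root, unsigned int offset,
		       unsigned int end)
{
	struct frag *node;

	while (*root) {
		struct frag *cur = *root;

		if (end <= cur->offset)
			root = &cur->left;	/* entirely before cur */
		else if (offset >= cur->end)
			root = &cur->right;	/* entirely after cur */
		else
			return -1;		/* overlap with cur */
	}
	node = calloc(1, sizeof(*node));
	if (!node)
		return -1;
	node->offset = offset;
	node->end = end;
	*root = node;
	return 0;
}

int main(void)
{
	struct frag *root = NULL;

	printf("%d\n", frag_insert(&root, 0, 1480));	/*  0: first fragment */
	printf("%d\n", frag_insert(&root, 2960, 4440));	/*  0: arrives out of order */
	printf("%d\n", frag_insert(&root, 1480, 2960));	/*  0: fills the gap */
	printf("%d\n", frag_insert(&root, 1000, 2000));	/* -1: overlap, queue dropped */
	return 0;
}
```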

6 files changed: +121 −91 lines

include/linux/skbuff.h

Lines changed: 6 additions & 3 deletions
```diff
@@ -676,13 +676,16 @@ struct sk_buff {
 				 * UDP receive path is one user.
 				 */
 				unsigned long		dev_scratch;
-				int			ip_defrag_offset;
 			};
 		};
-		struct rb_node		rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
 	};
-	struct sock		*sk;
+
+	union {
+		struct sock		*sk;
+		int			ip_defrag_offset;
+	};
 
 	union {
 		ktime_t		tstamp;
```
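A note on the `sk_buff` layout change above: `rbnode` shares storage with the `next`/`prev`/`dev` words, and `ip_defrag_offset` now shares storage with `sk`, so each field is only meaningful during the phase of defragmentation that owns it. The toy program below uses a deliberately simplified layout (`struct toy_skb` is invented, not the real `struct sk_buff`) to illustrate why the IPv4 code memsets `rbnode` and uses `barrier()` before touching `head->dev`:

```c
#include <stdio.h>
#include <string.h>

/* Simplified stand-ins; the real sk_buff unions are more involved. */
struct toy_rb_node { void *parent, *left, *right; };

struct toy_skb {
	union {
		struct toy_rb_node rbnode; /* valid while queued in the frag tree */
		struct {
			void *next;        /* valid as a plain list node */
			void *prev;
			void *dev;         /* valid once handed up the stack */
		};
	};
};

int main(void)
{
	struct toy_skb skb = {
		.rbnode = { (void *)0x1, (void *)0x2, (void *)0x3 },
	};

	/* dev aliases one of rbnode's pointers, so it reads as garbage: */
	printf("dev before cleanup: %p\n", skb.dev);

	/* What ip_expire() does before it starts using head->dev: */
	memset(&skb.rbnode, 0, sizeof(skb.rbnode));
	printf("dev after cleanup:  %p\n", skb.dev);
	return 0;
}
```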

include/net/inet_frag.h

Lines changed: 2 additions & 1 deletion
```diff
@@ -75,7 +75,8 @@ struct inet_frag_queue {
 	struct timer_list	timer;
 	spinlock_t		lock;
 	refcount_t		refcnt;
-	struct sk_buff		*fragments;
+	struct sk_buff		*fragments;  /* Used in IPv6. */
+	struct rb_root		rb_fragments; /* Used in IPv4. */
 	struct sk_buff		*fragments_tail;
 	ktime_t			stamp;
 	int			len;
```

net/ipv4/inet_fragment.c

Lines changed: 10 additions & 6 deletions
```diff
@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 	fp = q->fragments;
 	nf = q->net;
 	f = nf->f;
-	while (fp) {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
+	if (fp) {
+		do {
+			struct sk_buff *xp = fp->next;
+
+			sum_truesize += fp->truesize;
+			kfree_skb(fp);
+			fp = xp;
+		} while (fp);
+	} else {
+		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
 
```
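`skb_rbtree_purge()` (already used by the TCP out-of-order queue) frees every skb in an rbtree and returns the summed `truesize`, which is exactly what the accounting above needs. A user-space sketch of that contract, with a hypothetical `struct frag` and `frag_new()` helper standing in for `sk_buff`:

```c
#include <stdio.h>
#include <stdlib.h>

struct frag {
	unsigned int truesize;
	struct frag *left, *right;
};

static struct frag *frag_new(unsigned int truesize)
{
	struct frag *f = calloc(1, sizeof(*f));

	if (f)
		f->truesize = truesize;
	return f;
}

/* Free every node and return the summed truesize -- the contract
 * skb_rbtree_purge() provides to inet_frag_destroy()/ip_frag_reinit(),
 * letting the caller release the memory accounting in one call. */
static unsigned int frag_tree_purge(struct frag *node)
{
	unsigned int sum = 0;

	while (node) {
		struct frag *right = node->right;

		sum += frag_tree_purge(node->left);
		sum += node->truesize;
		free(node);
		node = right;
	}
	return sum;
}

int main(void)
{
	struct frag *root = frag_new(1000);

	root->left = frag_new(700);
	root->right = frag_new(1300);
	printf("purged %u bytes\n", frag_tree_purge(root));	/* 3000 */
	return 0;
}
```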

net/ipv4/ip_fragment.c

Lines changed: 101 additions & 81 deletions
```diff
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	const struct iphdr *iph;
-	struct sk_buff *head;
+	struct sk_buff *head = NULL;
 	struct net *net;
 	struct ipq *qp;
 	int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
 
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
-	head = qp->q.fragments;
-
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
 		goto out;
 
+	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
+	 * pull the head out of the tree in order to be able to
+	 * deal with head->dev.
+	 */
+	if (qp->q.fragments) {
+		head = qp->q.fragments;
+		qp->q.fragments = head->next;
+	} else {
+		head = skb_rb_first(&qp->q.rb_fragments);
+		if (!head)
+			goto out;
+		rb_erase(&head->rbnode, &qp->q.rb_fragments);
+		memset(&head->rbnode, 0, sizeof(head->rbnode));
+		barrier();
+	}
+	if (head == qp->q.fragments_tail)
+		qp->q.fragments_tail = NULL;
+
+	sub_frag_mem_limit(qp->q.net, head->truesize);
+
 	head->dev = dev_get_by_index_rcu(net, qp->iif);
 	if (!head->dev)
 		goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
 	    (skb_rtable(head)->rt_type != RTN_LOCAL))
 		goto out;
 
-	skb_get(head);
 	spin_unlock(&qp->q.lock);
 	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-	kfree_skb(head);
 	goto out_rcu_unlock;
 
 out:
 	spin_unlock(&qp->q.lock);
 out_rcu_unlock:
 	rcu_read_unlock();
+	if (head)
+		kfree_skb(head);
 	ipq_put(qp);
 }
 
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
 	end = atomic_inc_return(&peer->rid);
 	qp->rid = end;
 
-	rc = qp->q.fragments && (end - start) > max;
+	rc = qp->q.fragments_tail && (end - start) > max;
 
 	if (rc) {
 		struct net *net;
@@ -245,28 +262,21 @@ static int ip_frag_too_far(struct ipq *qp)
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-	struct sk_buff *fp;
 	unsigned int sum_truesize = 0;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
 		refcount_inc(&qp->q.refcnt);
 		return -ETIMEDOUT;
 	}
 
-	fp = qp->q.fragments;
-	do {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
-	} while (fp);
+	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
 	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
 	qp->q.meat = 0;
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
 	qp->ecn = 0;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
-	struct sk_buff *prev, *next;
+	struct rb_node **rbn, *parent;
+	struct sk_buff *skb1;
 	struct net_device *dev;
 	unsigned int fragsize;
 	int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (err)
 		goto err;
 
-	/* Find out which fragments are in front and at the back of us
-	 * in the chain of fragments so far.  We must know where to put
-	 * this fragment, right?
-	 */
-	prev = qp->q.fragments_tail;
-	if (!prev || prev->ip_defrag_offset < offset) {
-		next = NULL;
-		goto found;
-	}
-	prev = NULL;
-	for (next = qp->q.fragments; next != NULL; next = next->next) {
-		if (next->ip_defrag_offset >= offset)
-			break;	/* bingo! */
-		prev = next;
-	}
+	/* Note : skb->rbnode and skb->dev share the same location. */
+	dev = skb->dev;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
 
-found:
 	/* RFC5722, Section 4, amended by Errata ID : 3089
 	 *                          When reassembling an IPv6 datagram, if
 	 *   one or more its constituent fragments is determined to be an
 	 *   overlapping fragment, the entire datagram (and any constituent
 	 *   fragments) MUST be silently discarded.
 	 *
-	 * We do the same here for IPv4.
+	 * We do the same here for IPv4 (and increment an snmp counter).
 	 */
 
-	/* Is there an overlap with the previous fragment? */
-	if (prev &&
-	    (prev->ip_defrag_offset + prev->len) > offset)
-		goto discard_qp;
-
-	/* Is there an overlap with the next fragment? */
-	if (next && next->ip_defrag_offset < end)
-		goto discard_qp;
+	/* Find out where to put this fragment. */
+	skb1 = qp->q.fragments_tail;
+	if (!skb1) {
+		/* This is the first fragment we've received. */
+		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+		qp->q.fragments_tail = skb;
+	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+		/* This is the common/special case: skb goes to the end. */
+		/* Detect and discard overlaps. */
+		if (offset < (skb1->ip_defrag_offset + skb1->len))
+			goto discard_qp;
+		/* Insert after skb1. */
+		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+		qp->q.fragments_tail = skb;
+	} else {
+		/* Binary search. Note that skb can become the first fragment, but
+		 * not the last (covered above). */
+		rbn = &qp->q.rb_fragments.rb_node;
+		do {
+			parent = *rbn;
+			skb1 = rb_to_skb(parent);
+			if (end <= skb1->ip_defrag_offset)
+				rbn = &parent->rb_left;
+			else if (offset >= skb1->ip_defrag_offset + skb1->len)
+				rbn = &parent->rb_right;
+			else /* Found an overlap with skb1. */
+				goto discard_qp;
		} while (*rbn);
+		/* Here we have parent properly set, and rbn pointing to
+		 * one of its NULL left/right children. Insert skb. */
+		rb_link_node(&skb->rbnode, parent, rbn);
+	}
+	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
-	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
-	dev = skb->dev;
 	if (dev)
 		qp->iif = dev->ifindex;
-	/* Makes sure compiler wont do silly aliasing games */
-	barrier();
 	skb->ip_defrag_offset = offset;
 
-	/* Insert this fragment in the chain of fragments. */
-	skb->next = next;
-	if (!next)
-		qp->q.fragments_tail = skb;
-	if (prev)
-		prev->next = skb;
-	else
-		qp->q.fragments = skb;
-
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
@@ -414,7 +425,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, prev, dev);
+		err = ip_frag_reasm(qp, skb, dev);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -431,15 +442,15 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	return err;
 }
 
-
 /* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct net_device *dev)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
-	struct sk_buff *fp, *head = qp->q.fragments;
+	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+	struct sk_buff **nextp; /* To build frag_list. */
+	struct rb_node *rbn;
 	int len;
 	int ihlen;
 	int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 		goto out_fail;
 	}
 	/* Make the one we just received the head. */
-	if (prev) {
-		head = prev->next;
-		fp = skb_clone(head, GFP_ATOMIC);
+	if (head != skb) {
+		fp = skb_clone(skb, GFP_ATOMIC);
 		if (!fp)
 			goto out_nomem;
-
-		fp->next = head->next;
-		if (!fp->next)
+		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+		if (qp->q.fragments_tail == skb)
 			qp->q.fragments_tail = fp;
-		prev->next = fp;
-
-		skb_morph(head, qp->q.fragments);
-		head->next = qp->q.fragments->next;
-
-		consume_skb(qp->q.fragments);
-		qp->q.fragments = head;
+		skb_morph(skb, head);
+		rb_replace_node(&head->rbnode, &skb->rbnode,
+				&qp->q.rb_fragments);
+		consume_skb(head);
+		head = skb;
 	}
 
-	WARN_ON(!head);
 	WARN_ON(head->ip_defrag_offset != 0);
 
 	/* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 		clone = alloc_skb(0, GFP_ATOMIC);
 		if (!clone)
 			goto out_nomem;
-		clone->next = head->next;
-		head->next = clone;
 		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
 		skb_frag_list_init(head);
 		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
-		head->data_len -= clone->len;
-		head->len -= clone->len;
+		skb->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(qp->q.net, clone->truesize);
+		skb_shinfo(head)->frag_list = clone;
+		nextp = &clone->next;
+	} else {
+		nextp = &skb_shinfo(head)->frag_list;
 	}
 
-	skb_shinfo(head)->frag_list = head->next;
 	skb_push(head, head->data - skb_network_header(head));
 
-	for (fp=head->next; fp; fp = fp->next) {
+	/* Traverse the tree in order, to build frag_list. */
+	rbn = rb_next(&head->rbnode);
+	rb_erase(&head->rbnode, &qp->q.rb_fragments);
+	while (rbn) {
+		struct rb_node *rbnext = rb_next(rbn);
+		fp = rb_to_skb(rbn);
+		rb_erase(rbn, &qp->q.rb_fragments);
+		rbn = rbnext;
+		*nextp = fp;
+		nextp = &fp->next;
+		fp->prev = NULL;
+		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
 		head->data_len += fp->len;
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	}
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
+	*nextp = NULL;
 	head->next = NULL;
+	head->prev = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
 	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	return 0;
 
```
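The reassembly change above boils down to consuming the tree in offset order while chaining nodes into a singly linked list through a `nextp` tail pointer, which becomes `skb_shinfo(head)->frag_list`. Below is a stand-alone user-space rendering of the same pattern; the types and the `pop_min()` helper are hypothetical (the kernel uses `skb_rb_first()` plus `rb_erase()` on a real rbtree):

```c
#include <stdio.h>

struct frag {
	unsigned int offset;
	struct frag *next;		/* list linkage, filled in here */
	struct frag *left, *right;	/* tree linkage, destroyed here */
};

/* Pop the leftmost (smallest-offset) node, like skb_rb_first() + rb_erase(). */
static struct frag *pop_min(struct frag **root)
{
	struct frag *min;

	while (*root && (*root)->left)
		root = &(*root)->left;
	min = *root;
	if (!min)
		return NULL;
	*root = min->right;	/* splice the minimum out of the tree */
	return min;
}

/* Flatten the whole tree into an offset-ordered list via a tail pointer. */
static struct frag *tree_to_list(struct frag *root)
{
	struct frag *head = NULL, **nextp = &head;
	struct frag *fp;

	while ((fp = pop_min(&root)) != NULL) {
		fp->left = fp->right = NULL;	/* like the memset of fp->rbnode */
		*nextp = fp;			/* append at the current tail */
		nextp = &fp->next;
	}
	*nextp = NULL;	/* terminate, mirroring "*nextp = NULL;" in the patch */
	return head;
}

int main(void)
{
	/* Build a tiny tree by hand: 2960 at the root, 0 and 1480 to its left. */
	struct frag a = { .offset = 0 }, b = { .offset = 1480 }, c = { .offset = 2960 };
	struct frag *fp;

	c.left = &a;
	a.right = &b;

	for (fp = tree_to_list(&c); fp; fp = fp->next)
		printf("%u\n", fp->offset);	/* prints 0, 1480, 2960 */
	return 0;
}
```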

net/ipv6/netfilter/nf_conntrack_reasm.c

Lines changed: 1 addition & 0 deletions
```diff
@@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
 					  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 
 	return true;
```
