@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
 {
         struct inet_frag_queue *frag = from_timer(frag, t, timer);
         const struct iphdr *iph;
-        struct sk_buff *head;
+        struct sk_buff *head = NULL;
         struct net *net;
         struct ipq *qp;
         int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
         ipq_kill(qp);
         __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
-        head = qp->q.fragments;
-
         __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-        if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+        if (!(qp->q.flags & INET_FRAG_FIRST_IN))
                 goto out;
 
+        /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+         * pull the head out of the tree in order to be able to
+         * deal with head->dev.
+         */
+        if (qp->q.fragments) {
+                head = qp->q.fragments;
+                qp->q.fragments = head->next;
+        } else {
+                head = skb_rb_first(&qp->q.rb_fragments);
+                if (!head)
+                        goto out;
+                rb_erase(&head->rbnode, &qp->q.rb_fragments);
+                memset(&head->rbnode, 0, sizeof(head->rbnode));
+                barrier();
+        }
+        if (head == qp->q.fragments_tail)
+                qp->q.fragments_tail = NULL;
+
+        sub_frag_mem_limit(qp->q.net, head->truesize);
+
         head->dev = dev_get_by_index_rcu(net, qp->iif);
         if (!head->dev)
                 goto out;
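Why ip_expire() must pull head out before touching head->dev: in this era of struct sk_buff, the list pointers, the rbnode and the dev pointer all share storage through a union, so head->dev only becomes meaningful once the linkage bytes are dead. A minimal userspace model of that hazard follows; toy_skb is a hypothetical stand-in, not the real sk_buff layout.

    #include <assert.h>
    #include <string.h>

    struct toy_skb {
            union {
                    struct toy_skb *next;          /* valid while on a list */
                    struct { void *p[3]; } rbnode; /* valid while in a tree */
                    void *dev;                     /* valid once unlinked */
            };
            unsigned int truesize;
    };

    int main(void)
    {
            struct toy_skb head = { .truesize = 1500 };

            head.next = &head;      /* queued: the linkage bytes are live */
            /* Mirror of the rb_erase() + memset() + barrier() above:
             * kill the linkage before the same bytes are read as ->dev. */
            memset(&head.rbnode, 0, sizeof(head.rbnode));
            head.dev = NULL;        /* only now is ->dev usable */
            assert(head.dev == NULL && head.truesize == 1500);
            return 0;
    }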
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
             (skb_rtable(head)->rt_type != RTN_LOCAL))
                 goto out;
 
-        skb_get(head);
         spin_unlock(&qp->q.lock);
         icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-        kfree_skb(head);
         goto out_rcu_unlock;
 
 out:
         spin_unlock(&qp->q.lock);
 out_rcu_unlock:
         rcu_read_unlock();
+        if (head)
+                kfree_skb(head);
         ipq_put(qp);
 }
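The skb_get()/kfree_skb() pair around icmp_send() is gone because ip_expire() now owns head outright: the fragment was unlinked from the queue while qp->q.lock was held, so one kfree_skb() after both unlocks suffices. A rough pthread-based sketch of this detach-under-lock, free-after-unlock pattern (struct resource and the function names are hypothetical):

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct resource { int v; } *shared;

    static void consume_then_free(void)
    {
            struct resource *r;

            pthread_mutex_lock(&lock);
            r = shared;             /* detach under the lock */
            shared = NULL;
            pthread_mutex_unlock(&lock);

            if (r) {
                    /* ... use r outside the critical section, as
                     * icmp_send() is used in the hunk above ... */
                    free(r);        /* single free point, after unlock */
            }
    }

    int main(void)
    {
            shared = calloc(1, sizeof(*shared));
            consume_then_free();
            consume_then_free();    /* nothing left: r is NULL, no free */
            return 0;
    }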
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
         end = atomic_inc_return(&peer->rid);
         qp->rid = end;
 
-        rc = qp->q.fragments && (end - start) > max;
+        rc = qp->q.fragments_tail && (end - start) > max;
 
         if (rc) {
                 struct net *net;
@@ -245,28 +262,21 @@ static int ip_frag_too_far(struct ipq *qp)
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-        struct sk_buff *fp;
         unsigned int sum_truesize = 0;
 
         if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
                 refcount_inc(&qp->q.refcnt);
                 return -ETIMEDOUT;
         }
 
-        fp = qp->q.fragments;
-        do {
-                struct sk_buff *xp = fp->next;
-
-                sum_truesize += fp->truesize;
-                kfree_skb(fp);
-                fp = xp;
-        } while (fp);
+        sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
         sub_frag_mem_limit(qp->q.net, sum_truesize);
 
         qp->q.flags = 0;
         qp->q.len = 0;
         qp->q.meat = 0;
         qp->q.fragments = NULL;
+        qp->q.rb_fragments = RB_ROOT;
         qp->q.fragments_tail = NULL;
         qp->iif = 0;
         qp->ecn = 0;
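skb_rbtree_purge() replaces the open-coded list walk; a companion patch in this series makes it return the total truesize of the purged skbs, which is what lets the sub_frag_mem_limit() call stay a one-liner. Its body is roughly the following (a sketch of the 4.19-era helper in net/core/skbuff.c, reproduced from memory rather than from this commit):

    unsigned int skb_rbtree_purge(struct rb_root *root)
    {
            struct rb_node *p = rb_first(root);
            unsigned int sum = 0;

            while (p) {
                    struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);

                    p = rb_next(p);
                    rb_erase(&skb->rbnode, root);
                    sum += skb->truesize;
                    kfree_skb(skb);
            }
            return sum;
    }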
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
         struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
-        struct sk_buff *prev, *next;
+        struct rb_node **rbn, *parent;
+        struct sk_buff *skb1;
         struct net_device *dev;
         unsigned int fragsize;
         int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
         if (err)
                 goto err;
 
-        /* Find out which fragments are in front and at the back of us
-         * in the chain of fragments so far.  We must know where to put
-         * this fragment, right?
-         */
-        prev = qp->q.fragments_tail;
-        if (!prev || prev->ip_defrag_offset < offset) {
-                next = NULL;
-                goto found;
-        }
-        prev = NULL;
-        for (next = qp->q.fragments; next != NULL; next = next->next) {
-                if (next->ip_defrag_offset >= offset)
-                        break;  /* bingo! */
-                prev = next;
-        }
+        /* Note : skb->rbnode and skb->dev share the same location. */
+        dev = skb->dev;
+        /* Makes sure compiler wont do silly aliasing games */
+        barrier();
 
-found:
         /* RFC5722, Section 4, amended by Errata ID : 3089
          *                          When reassembling an IPv6 datagram, if
          *   one or more its constituent fragments is determined to be an
          *   overlapping fragment, the entire datagram (and any constituent
          *   fragments) MUST be silently discarded.
          *
-         * We do the same here for IPv4.
+         * We do the same here for IPv4 (and increment an snmp counter).
          */
 
-        /* Is there an overlap with the previous fragment? */
-        if (prev &&
-            (prev->ip_defrag_offset + prev->len) > offset)
-                goto discard_qp;
-
-        /* Is there an overlap with the next fragment? */
-        if (next && next->ip_defrag_offset < end)
-                goto discard_qp;
+        /* Find out where to put this fragment.  */
+        skb1 = qp->q.fragments_tail;
+        if (!skb1) {
+                /* This is the first fragment we've received. */
+                rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+                qp->q.fragments_tail = skb;
+        } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+                /* This is the common/special case: skb goes to the end. */
+                /* Detect and discard overlaps. */
+                if (offset < (skb1->ip_defrag_offset + skb1->len))
+                        goto discard_qp;
+                /* Insert after skb1. */
+                rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+                qp->q.fragments_tail = skb;
+        } else {
+                /* Binary search. Note that skb can become the first fragment, but
+                 * not the last (covered above). */
+                rbn = &qp->q.rb_fragments.rb_node;
+                do {
+                        parent = *rbn;
+                        skb1 = rb_to_skb(parent);
+                        if (end <= skb1->ip_defrag_offset)
+                                rbn = &parent->rb_left;
+                        else if (offset >= skb1->ip_defrag_offset + skb1->len)
+                                rbn = &parent->rb_right;
+                        else /* Found an overlap with skb1. */
+                                goto discard_qp;
+                } while (*rbn);
+                /* Here we have parent properly set, and rbn pointing to
+                 * one of its NULL left/right children. Insert skb. */
+                rb_link_node(&skb->rbnode, parent, rbn);
+        }
+        rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
-        /* Note : skb->ip_defrag_offset and skb->dev share the same location */
-        dev = skb->dev;
         if (dev)
                 qp->iif = dev->ifindex;
-        /* Makes sure compiler wont do silly aliasing games */
-        barrier();
         skb->ip_defrag_offset = offset;
 
-        /* Insert this fragment in the chain of fragments. */
-        skb->next = next;
-        if (!next)
-                qp->q.fragments_tail = skb;
-        if (prev)
-                prev->next = skb;
-        else
-                qp->q.fragments = skb;
-
         qp->q.stamp = skb->tstamp;
         qp->q.meat += skb->len;
         qp->ecn |= ecn;
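This three-way branch is the heart of the patch: an in-order sender still gets an O(1) tail append, while everything else takes an O(log n) descent that rejects any overlap, so the worst-case cost per fragment becomes logarithmic instead of linear. The same logic, modeled as standalone userspace C with a plain unbalanced BST standing in for the kernel rbtree (struct frag and frag_insert() are illustrative, not kernel API):

    #include <stdio.h>

    struct frag {
            int offset, end;        /* half-open interval [offset, end) */
            struct frag *left, *right;
    };

    /* Returns 0 on success, -1 on overlap (caller drops the queue). */
    static int frag_insert(struct frag **root, struct frag *f)
    {
            while (*root) {
                    struct frag *cur = *root;

                    if (f->end <= cur->offset)
                            root = &cur->left;      /* strictly before cur */
                    else if (f->offset >= cur->end)
                            root = &cur->right;     /* strictly after cur */
                    else
                            return -1;              /* overlap: discard */
            }
            *root = f;
            return 0;
    }

    int main(void)
    {
            struct frag *root = NULL;
            struct frag a = { 0, 1480 }, b = { 1480, 2960 }, c = { 1000, 2000 };

            printf("%d %d %d\n", frag_insert(&root, &a),
                   frag_insert(&root, &b), frag_insert(&root, &c));
            return 0;       /* prints "0 0 -1": c overlaps a and b */
    }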
@@ -414,7 +425,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
                 unsigned long orefdst = skb->_skb_refdst;
 
                 skb->_skb_refdst = 0UL;
-                err = ip_frag_reasm(qp, prev, dev);
+                err = ip_frag_reasm(qp, skb, dev);
                 skb->_skb_refdst = orefdst;
                 return err;
         }
@@ -431,15 +442,15 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
         return err;
 }
 
-
 /* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
                          struct net_device *dev)
 {
         struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
         struct iphdr *iph;
-        struct sk_buff *fp, *head = qp->q.fragments;
+        struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+        struct sk_buff **nextp; /* To build frag_list. */
+        struct rb_node *rbn;
         int len;
         int ihlen;
         int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                 goto out_fail;
         }
         /* Make the one we just received the head. */
-        if (prev) {
-                head = prev->next;
-                fp = skb_clone(head, GFP_ATOMIC);
+        if (head != skb) {
+                fp = skb_clone(skb, GFP_ATOMIC);
                 if (!fp)
                         goto out_nomem;
-
-                fp->next = head->next;
-                if (!fp->next)
+                rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+                if (qp->q.fragments_tail == skb)
                         qp->q.fragments_tail = fp;
-                prev->next = fp;
-
-                skb_morph(head, qp->q.fragments);
-                head->next = qp->q.fragments->next;
-
-                consume_skb(qp->q.fragments);
-                qp->q.fragments = head;
+                skb_morph(skb, head);
+                rb_replace_node(&head->rbnode, &skb->rbnode,
+                                &qp->q.rb_fragments);
+                consume_skb(head);
+                head = skb;
         }
 
-        WARN_ON(!head);
         WARN_ON(head->ip_defrag_offset != 0);
 
         /* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
                 clone = alloc_skb(0, GFP_ATOMIC);
                 if (!clone)
                         goto out_nomem;
-                clone->next = head->next;
-                head->next = clone;
                 skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
                 skb_frag_list_init(head);
                 for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
                         plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
                 clone->len = clone->data_len = head->data_len - plen;
-                head->data_len -= clone->len;
-                head->len -= clone->len;
+                skb->truesize += clone->truesize;
                 clone->csum = 0;
                 clone->ip_summed = head->ip_summed;
                 add_frag_mem_limit(qp->q.net, clone->truesize);
+                skb_shinfo(head)->frag_list = clone;
+                nextp = &clone->next;
+        } else {
+                nextp = &skb_shinfo(head)->frag_list;
         }
 
-        skb_shinfo(head)->frag_list = head->next;
         skb_push(head, head->data - skb_network_header(head));
 
-        for (fp = head->next; fp; fp = fp->next) {
+        /* Traverse the tree in order, to build frag_list. */
+        rbn = rb_next(&head->rbnode);
+        rb_erase(&head->rbnode, &qp->q.rb_fragments);
+        while (rbn) {
+                struct rb_node *rbnext = rb_next(rbn);
+
+                fp = rb_to_skb(rbn);
+                rb_erase(rbn, &qp->q.rb_fragments);
+                rbn = rbnext;
+                *nextp = fp;
+                nextp = &fp->next;
+                fp->prev = NULL;
+                memset(&fp->rbnode, 0, sizeof(fp->rbnode));
                 head->data_len += fp->len;
                 head->len += fp->len;
                 if (head->ip_summed != fp->ip_summed)
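The while loop converts the sorted tree into the flat frag_list the rest of the stack expects, using the *nextp = fp; nextp = &fp->next; tail-pointer idiom so nodes are appended in offset order without ever rewalking the list. Continuing the userspace BST model from the earlier sketch (the ->next field is hypothetical, added just to build the list):

    #include <stddef.h>

    struct frag {
            int offset, end;
            struct frag *left, *right;
            struct frag *next;      /* list linkage filled in below */
    };

    /* Append the subtree's nodes at *nextp in ascending offset order
     * and return the new tail position (the last node's &->next).
     * Recursive twin of the rb_next()/rb_erase() loop above. */
    static struct frag **flatten(struct frag *node, struct frag **nextp)
    {
            if (!node)
                    return nextp;
            nextp = flatten(node->left, nextp);
            *nextp = node;
            nextp = &node->next;
            node->next = NULL;      /* overwritten if a successor exists */
            return flatten(node->right, nextp);
    }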
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
         }
         sub_frag_mem_limit(qp->q.net, head->truesize);
 
+        *nextp = NULL;
         head->next = NULL;
+        head->prev = NULL;
         head->dev = dev;
         head->tstamp = qp->q.stamp;
         IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 
         __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
         qp->q.fragments = NULL;
+        qp->q.rb_fragments = RB_ROOT;
         qp->q.fragments_tail = NULL;
         return 0;