@@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
 	return !nft_rbtree_interval_end(rbe);
 }
 
-static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
-			     const struct nft_rbtree_elem *interval)
+static int nft_rbtree_cmp(const struct nft_set *set,
+			  const struct nft_rbtree_elem *e1,
+			  const struct nft_rbtree_elem *e2)
 {
-	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
+	return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
+		      set->klen);
 }
 
 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
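
The hunk above replaces the boolean nft_rbtree_equal() helper with nft_rbtree_cmp(), a memcmp-style three-way comparator, so one helper can drive equality tests, tree descent and ordered walks alike. A minimal user-space sketch of the calling convention (struct elem and klen are illustrative stand-ins, not the kernel types):

	#include <stddef.h>
	#include <string.h>

	/* illustrative stand-in for the element's key extension */
	struct elem {
		unsigned char key[16];
	};

	/* three-way comparator in the style of nft_rbtree_cmp(): <0, 0, >0 */
	static int elem_cmp(const struct elem *e1, const struct elem *e2,
			    size_t klen)
	{
		return memcmp(e1->key, e2->key, klen);
	}

Equality then reads !elem_cmp(e1, e2, klen), which is exactly how the lookup and insert hunks below use the kernel helper.
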
@@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 	const struct nft_rbtree_elem *rbe, *interval = NULL;
 	u8 genmask = nft_genmask_cur(net);
 	const struct rb_node *parent;
-	const void *this;
 	int d;
 
 	parent = rcu_dereference_raw(priv->root.rb_node);
@@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
 
-		this = nft_set_ext_key(&rbe->ext);
-		d = memcmp(this, key, set->klen);
+		d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
 		if (d < 0) {
 			parent = rcu_dereference_raw(parent->rb_left);
 			if (interval &&
-			    nft_rbtree_equal(set, this, interval) &&
+			    !nft_rbtree_cmp(set, rbe, interval) &&
 			    nft_rbtree_interval_end(rbe) &&
 			    nft_rbtree_interval_start(interval))
 				continue;
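
The lookup descends the tree with the same comparator: each interval is stored as two boundary elements, a start and an end, ordered by key, and in essence a key matches when the closest boundary at or below it is a start element. A self-contained sketch of that rule, with a sorted array standing in for the rb-tree and half-open [start, end) semantics assumed for illustration:

	#include <stddef.h>
	#include <string.h>

	struct boundary {
		unsigned char key[4];
		int is_end;	/* start or end of an interval */
	};

	/* Find the rightmost boundary with key <= the lookup key; the key
	 * is inside some interval iff that boundary is a start element.
	 * 'b' must be sorted by key in ascending order. */
	static const struct boundary *interval_lookup(const struct boundary *b,
						      size_t n, const void *key,
						      size_t klen)
	{
		const struct boundary *best = NULL;
		size_t lo = 0, hi = n;

		while (lo < hi) {
			size_t mid = lo + (hi - lo) / 2;

			if (memcmp(b[mid].key, key, klen) <= 0) {
				best = &b[mid];	/* boundary at or below key */
				lo = mid + 1;
			} else {
				hi = mid;
			}
		}
		return best && !best->is_end ? best : NULL;
	}
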
@@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
 	return rbe;
 }
 
+static int nft_rbtree_gc_elem(const struct nft_set *__set,
+			      struct nft_rbtree *priv,
+			      struct nft_rbtree_elem *rbe)
+{
+	struct nft_set *set = (struct nft_set *)__set;
+	struct rb_node *prev = rb_prev(&rbe->node);
+	struct nft_rbtree_elem *rbe_prev;
+	struct nft_set_gc_batch *gcb;
+
+	gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
+	if (!gcb)
+		return -ENOMEM;
+
+	/* search for expired end interval coming before this element. */
+	do {
+		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+		if (nft_rbtree_interval_end(rbe_prev))
+			break;
+
+		prev = rb_prev(prev);
+	} while (prev != NULL);
+
+	rb_erase(&rbe_prev->node, &priv->root);
+	rb_erase(&rbe->node, &priv->root);
+	atomic_sub(2, &set->nelems);
+
+	nft_set_gc_batch_add(gcb, rbe);
+	nft_set_gc_batch_complete(gcb);
+
+	return 0;
+}
+
+static bool nft_rbtree_update_first(const struct nft_set *set,
+				    struct nft_rbtree_elem *rbe,
+				    struct rb_node *first)
+{
+	struct nft_rbtree_elem *first_elem;
+
+	first_elem = rb_entry(first, struct nft_rbtree_elem, node);
+	/* this element is closest to where the new element is to be inserted:
+	 * update the first element for the node list path.
+	 */
+	if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
+		return true;
+
+	return false;
+}
+
 
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
 			       struct nft_set_ext **ext)
 {
-	bool overlap = false, dup_end_left = false, dup_end_right = false;
+	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+	struct rb_node *node, *parent, **p, *first = NULL;
 	struct nft_rbtree *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_next(net);
-	struct nft_rbtree_elem *rbe;
-	struct rb_node *parent, **p;
-	int d;
+	int d, err;
 
-	/* Detect overlaps as we descend the tree. Set the flag in these cases:
-	 *
-	 * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
-	 * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
-	 * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
-	 *
-	 * and clear it later on, as we eventually reach the points indicated by
-	 * '?' above, in the cases described below. We'll always meet these
-	 * later, locally, due to tree ordering, and overlaps for the intervals
-	 * that are the closest together are always evaluated last.
-	 *
-	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
-	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
-	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
-	 *            '--' no nodes falling in this range
-	 * b4.          >|_ _   !  (insert start before existing start)
-	 *
-	 * Case a3. resolves to b3.:
-	 * - if the inserted start element is the leftmost, because the '0'
-	 *   element in the tree serves as end element
-	 * - otherwise, if an existing end is found immediately to the left. If
-	 *   there are existing nodes in between, we need to further descend the
-	 *   tree before we can conclude the new start isn't causing an overlap
-	 *
-	 * or to b4., which, preceded by a3., means we already traversed one or
-	 * more existing intervals entirely, from the right.
-	 *
-	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
-	 * in that order.
-	 *
-	 * The flag is also cleared in two special cases:
-	 *
-	 * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
-	 * b6. |__ _ >|!__ _ _   (insert end right after existing start)
-	 *
-	 * which always happen as last step and imply that no further
-	 * overlapping is possible.
-	 *
-	 * Another special case comes from the fact that start elements matching
-	 * an already existing start element are allowed: insertion is not
-	 * performed but we return -EEXIST in that case, and the error will be
-	 * cleared by the caller if NLM_F_EXCL is not present in the request.
-	 * This way, request for insertion of an exact overlap isn't reported as
-	 * error to userspace if not desired.
-	 *
-	 * However, if the existing start matches a pre-existing start, but the
-	 * end element doesn't match the corresponding pre-existing end element,
-	 * we need to report a partial overlap. This is a local condition that
-	 * can be noticed without need for a tracking flag, by checking for a
-	 * local duplicated end for a corresponding start, from left and right,
-	 * separately.
+	/* Descend the tree to search for an existing element greater than the
+	 * key value to insert. This is the first element to walk the ordered
+	 * elements and find possible overlaps.
 	 */
-
 	parent = NULL;
 	p = &priv->root.rb_node;
 	while (*p != NULL) {
 		parent = *p;
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-		d = memcmp(nft_set_ext_key(&rbe->ext),
-			   nft_set_ext_key(&new->ext),
-			   set->klen);
+		d = nft_rbtree_cmp(set, rbe, new);
+
 		if (d < 0) {
 			p = &parent->rb_left;
-
-			if (nft_rbtree_interval_start(new)) {
-				if (nft_rbtree_interval_end(rbe) &&
-				    nft_set_elem_active(&rbe->ext, genmask) &&
-				    !nft_set_elem_expired(&rbe->ext) && !*p)
-					overlap = false;
-			} else {
-				if (dup_end_left && !*p)
-					return -ENOTEMPTY;
-
-				overlap = nft_rbtree_interval_end(rbe) &&
-					  nft_set_elem_active(&rbe->ext,
-							      genmask) &&
-					  !nft_set_elem_expired(&rbe->ext);
-
-				if (overlap) {
-					dup_end_right = true;
-					continue;
-				}
-			}
 		} else if (d > 0) {
-			p = &parent->rb_right;
+			if (!first ||
+			    nft_rbtree_update_first(set, rbe, first))
+				first = &rbe->node;
 
-			if (nft_rbtree_interval_end(new)) {
-				if (dup_end_right && !*p)
-					return -ENOTEMPTY;
-
-				overlap = nft_rbtree_interval_end(rbe) &&
-					  nft_set_elem_active(&rbe->ext,
-							      genmask) &&
-					  !nft_set_elem_expired(&rbe->ext);
-
-				if (overlap) {
-					dup_end_left = true;
-					continue;
-				}
-			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
-				   !nft_set_elem_expired(&rbe->ext)) {
-				overlap = nft_rbtree_interval_end(rbe);
-			}
+			p = &parent->rb_right;
 		} else {
-			if (nft_rbtree_interval_end(rbe) &&
-			    nft_rbtree_interval_start(new)) {
+			if (nft_rbtree_interval_end(rbe))
 				p = &parent->rb_left;
-
-				if (nft_set_elem_active(&rbe->ext, genmask) &&
-				    !nft_set_elem_expired(&rbe->ext))
-					overlap = false;
-			} else if (nft_rbtree_interval_start(rbe) &&
-				   nft_rbtree_interval_end(new)) {
+			else
 				p = &parent->rb_right;
+		}
+	}
+
+	if (!first)
+		first = rb_first(&priv->root);
+
+	/* Detect overlap by going through the list of valid tree nodes.
+	 * Values stored in the tree are in reversed order, starting from
+	 * highest to lowest value.
+	 */
+	for (node = first; node != NULL; node = rb_next(node)) {
+		rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+		if (!nft_set_elem_active(&rbe->ext, genmask))
+			continue;
 
-				if (nft_set_elem_active(&rbe->ext, genmask) &&
-				    !nft_set_elem_expired(&rbe->ext))
-					overlap = false;
-			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
-				   !nft_set_elem_expired(&rbe->ext)) {
-				*ext = &rbe->ext;
-				return -EEXIST;
-			} else {
-				overlap = false;
-				if (nft_rbtree_interval_end(rbe))
-					p = &parent->rb_left;
-				else
-					p = &parent->rb_right;
+		/* perform garbage collection to avoid bogus overlap reports. */
+		if (nft_set_elem_expired(&rbe->ext)) {
+			err = nft_rbtree_gc_elem(set, priv, rbe);
+			if (err < 0)
+				return err;
+
+			continue;
+		}
+
+		d = nft_rbtree_cmp(set, rbe, new);
+		if (d == 0) {
+			/* Matching end element: no need to look for an
+			 * overlapping greater or equal element.
+			 */
+			if (nft_rbtree_interval_end(rbe)) {
+				rbe_le = rbe;
+				break;
+			}
+
+			/* first element that is greater or equal to key value. */
+			if (!rbe_ge) {
+				rbe_ge = rbe;
+				continue;
+			}
+
+			/* this is a closer more or equal element, update it. */
+			if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
+				rbe_ge = rbe;
+				continue;
 			}
+
+			/* element is equal to key value, make sure flags are
+			 * the same, an existing more or equal start element
+			 * must not be replaced by a more or equal end element.
+			 */
+			if ((nft_rbtree_interval_start(new) &&
+			     nft_rbtree_interval_start(rbe_ge)) ||
+			    (nft_rbtree_interval_end(new) &&
+			     nft_rbtree_interval_end(rbe_ge))) {
+				rbe_ge = rbe;
+				continue;
+			}
+		} else if (d > 0) {
+			/* annotate element greater than the new element. */
+			rbe_ge = rbe;
+			continue;
+		} else if (d < 0) {
+			/* annotate element less than the new element. */
+			rbe_le = rbe;
+			break;
 		}
+	}
 
-		dup_end_left = dup_end_right = false;
+	/* - new start element matching existing start element: full overlap
+	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+	 */
+	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
+	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
+		*ext = &rbe_ge->ext;
+		return -EEXIST;
+	}
+
+	/* - new end element matching existing end element: full overlap
+	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
+	 */
+	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
+	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+		*ext = &rbe_le->ext;
+		return -EEXIST;
 	}
 
-	if (overlap)
+	/* - new start element with existing closest, less or equal key value
+	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
+	 *   Anonymous sets allow for two consecutive start elements since they
+	 *   are constant, skip them to avoid bogus overlap reports.
+	 */
+	if (!nft_set_is_anonymous(set) && rbe_le &&
+	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
+		return -ENOTEMPTY;
+
+	/* - new end element with existing closest, less or equal key value
+	 *   being an end element: partial overlap, reported as -ENOTEMPTY.
+	 */
+	if (rbe_le &&
+	    nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
 		return -ENOTEMPTY;
 
+	/* - new end element with existing closest, greater or equal key value
+	 *   being an end element: partial overlap, reported as -ENOTEMPTY.
+	 */
+	if (rbe_ge &&
+	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
+		return -ENOTEMPTY;
+
+	/* Accepted element: pick insertion point depending on key value */
+	parent = NULL;
+	p = &priv->root.rb_node;
+	while (*p != NULL) {
+		parent = *p;
+		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+		d = nft_rbtree_cmp(set, rbe, new);
+
+		if (d < 0)
+			p = &parent->rb_left;
+		else if (d > 0)
+			p = &parent->rb_right;
+		else if (nft_rbtree_interval_end(rbe))
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+
 	rb_link_node_rcu(&new->node, parent, p);
 	rb_insert_color(&new->node, &priv->root);
 	return 0;
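
Taken together, the rewritten __nft_rbtree_insert() drops the flag-based descent in favour of a two-phase scheme: descend once to pick a starting node, walk the ordered elements while garbage-collecting expired entries and remembering the closest active boundary at or below the new key (rbe_le) and at or above it (rbe_ge), then derive the verdict from those two neighbours: -EEXIST for a full overlap (cleared by the caller unless NLM_F_EXCL is set) and -ENOTEMPTY for a partial one. A condensed user-space sketch of that final classification, assuming the neighbours were already found (struct elem and its fields are illustrative stand-ins, and the anonymous-set exemption is omitted):

	#include <errno.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct elem {
		int key;	/* stand-in for the set key */
		bool is_end;	/* interval start or end boundary */
	};

	/* 'le'/'ge' are the closest existing boundaries at or below / at or
	 * above the new element, or NULL when no such neighbour exists. */
	static int classify_overlap(const struct elem *new,
				    const struct elem *le,
				    const struct elem *ge)
	{
		/* same key and same start/end flag: full overlap */
		if (ge && ge->key == new->key && ge->is_end == new->is_end)
			return -EEXIST;
		if (le && le->key == new->key && le->is_end == new->is_end)
			return -EEXIST;

		/* two starts or two ends in a row: partial overlap */
		if (le && !le->is_end && !new->is_end)
			return -ENOTEMPTY;
		if (le && le->is_end && new->is_end)
			return -ENOTEMPTY;
		if (ge && ge->is_end && new->is_end)
			return -ENOTEMPTY;

		return 0;	/* no overlap: safe to insert */
	}

The kernel version additionally skips the consecutive-starts check for anonymous sets, which are constant and may legitimately carry two adjacent start elements, and it performs the expired-element garbage collection (nft_rbtree_gc_elem()) inline during the walk so stale boundaries cannot trigger bogus overlap reports.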