Skip to content

Commit c9e6978

Browse files
committed
netfilter: nft_set_rbtree: Switch to node list walk for overlap detection
...instead of a tree descent, which became overly complicated in an attempt to cover cases where expired or inactive elements would affect comparisons with the new element being inserted. Further, it turned out that it's probably impossible to cover all those cases, as inactive nodes might entirely hide subtrees consisting of a complete interval plus a node that makes the current insertion not overlap. To speed up the overlap check, descend the tree to find a greater element that is closer to the key value to insert. Then walk down the node list for overlap detection. Starting the overlap check from rb_first() unconditionally is slow: it takes 10 times longer due to the full linear traversal of the list. Moreover, perform garbage collection of expired elements when walking down the node list to avoid bogus overlap reports. For the insertion operation itself, this essentially reverts back to the implementation before commit 7c84d41 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion"), except that cases of complete overlap are already handled in the overlap detection phase itself, which slightly simplifies the loop to find the insertion point. Based on initial patch from Stefano Brivio, including text from the original patch description too. Fixes: 7c84d41 ("netfilter: nft_set_rbtree: Detect partial overlaps on insertion") Reviewed-by: Stefano Brivio <[email protected]> Signed-off-by: Pablo Neira Ayuso <[email protected]>
1 parent 71ab9c3 commit c9e6978

File tree

1 file changed

+189
-127
lines changed

1 file changed

+189
-127
lines changed

net/netfilter/nft_set_rbtree.c

Lines changed: 189 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,12 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
3838
return !nft_rbtree_interval_end(rbe);
3939
}
4040

41-
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
42-
const struct nft_rbtree_elem *interval)
41+
static int nft_rbtree_cmp(const struct nft_set *set,
42+
const struct nft_rbtree_elem *e1,
43+
const struct nft_rbtree_elem *e2)
4344
{
44-
return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
45+
return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
46+
set->klen);
4547
}
4648

4749
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@@ -52,7 +54,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
5254
const struct nft_rbtree_elem *rbe, *interval = NULL;
5355
u8 genmask = nft_genmask_cur(net);
5456
const struct rb_node *parent;
55-
const void *this;
5657
int d;
5758

5859
parent = rcu_dereference_raw(priv->root.rb_node);
@@ -62,12 +63,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
6263

6364
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
6465

65-
this = nft_set_ext_key(&rbe->ext);
66-
d = memcmp(this, key, set->klen);
66+
d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
6767
if (d < 0) {
6868
parent = rcu_dereference_raw(parent->rb_left);
6969
if (interval &&
70-
nft_rbtree_equal(set, this, interval) &&
70+
!nft_rbtree_cmp(set, rbe, interval) &&
7171
nft_rbtree_interval_end(rbe) &&
7272
nft_rbtree_interval_start(interval))
7373
continue;
@@ -215,154 +215,216 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
215215
return rbe;
216216
}
217217

218+
static int nft_rbtree_gc_elem(const struct nft_set *__set,
219+
struct nft_rbtree *priv,
220+
struct nft_rbtree_elem *rbe)
221+
{
222+
struct nft_set *set = (struct nft_set *)__set;
223+
struct rb_node *prev = rb_prev(&rbe->node);
224+
struct nft_rbtree_elem *rbe_prev;
225+
struct nft_set_gc_batch *gcb;
226+
227+
gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
228+
if (!gcb)
229+
return -ENOMEM;
230+
231+
/* search for expired end interval coming before this element. */
232+
do {
233+
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
234+
if (nft_rbtree_interval_end(rbe_prev))
235+
break;
236+
237+
prev = rb_prev(prev);
238+
} while (prev != NULL);
239+
240+
rb_erase(&rbe_prev->node, &priv->root);
241+
rb_erase(&rbe->node, &priv->root);
242+
atomic_sub(2, &set->nelems);
243+
244+
nft_set_gc_batch_add(gcb, rbe);
245+
nft_set_gc_batch_complete(gcb);
246+
247+
return 0;
248+
}
249+
250+
static bool nft_rbtree_update_first(const struct nft_set *set,
251+
struct nft_rbtree_elem *rbe,
252+
struct rb_node *first)
253+
{
254+
struct nft_rbtree_elem *first_elem;
255+
256+
first_elem = rb_entry(first, struct nft_rbtree_elem, node);
257+
/* this element is closest to where the new element is to be inserted:
258+
* update the first element for the node list path.
259+
*/
260+
if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
261+
return true;
262+
263+
return false;
264+
}
265+
218266
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
219267
struct nft_rbtree_elem *new,
220268
struct nft_set_ext **ext)
221269
{
222-
bool overlap = false, dup_end_left = false, dup_end_right = false;
270+
struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
271+
struct rb_node *node, *parent, **p, *first = NULL;
223272
struct nft_rbtree *priv = nft_set_priv(set);
224273
u8 genmask = nft_genmask_next(net);
225-
struct nft_rbtree_elem *rbe;
226-
struct rb_node *parent, **p;
227-
int d;
274+
int d, err;
228275

229-
/* Detect overlaps as we descend the tree. Set the flag in these cases:
230-
*
231-
* a1. _ _ __>| ?_ _ __| (insert end before existing end)
232-
* a2. _ _ ___| ?_ _ _>| (insert end after existing end)
233-
* a3. _ _ ___? >|_ _ __| (insert start before existing end)
234-
*
235-
* and clear it later on, as we eventually reach the points indicated by
236-
* '?' above, in the cases described below. We'll always meet these
237-
* later, locally, due to tree ordering, and overlaps for the intervals
238-
* that are the closest together are always evaluated last.
239-
*
240-
* b1. _ _ __>| !_ _ __| (insert end before existing start)
241-
* b2. _ _ ___| !_ _ _>| (insert end after existing start)
242-
* b3. _ _ ___! >|_ _ __| (insert start after existing end, as a leaf)
243-
* '--' no nodes falling in this range
244-
* b4. >|_ _ ! (insert start before existing start)
245-
*
246-
* Case a3. resolves to b3.:
247-
* - if the inserted start element is the leftmost, because the '0'
248-
* element in the tree serves as end element
249-
* - otherwise, if an existing end is found immediately to the left. If
250-
* there are existing nodes in between, we need to further descend the
251-
* tree before we can conclude the new start isn't causing an overlap
252-
*
253-
* or to b4., which, preceded by a3., means we already traversed one or
254-
* more existing intervals entirely, from the right.
255-
*
256-
* For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
257-
* in that order.
258-
*
259-
* The flag is also cleared in two special cases:
260-
*
261-
* b5. |__ _ _!|<_ _ _ (insert start right before existing end)
262-
* b6. |__ _ >|!__ _ _ (insert end right after existing start)
263-
*
264-
* which always happen as last step and imply that no further
265-
* overlapping is possible.
266-
*
267-
* Another special case comes from the fact that start elements matching
268-
* an already existing start element are allowed: insertion is not
269-
* performed but we return -EEXIST in that case, and the error will be
270-
* cleared by the caller if NLM_F_EXCL is not present in the request.
271-
* This way, request for insertion of an exact overlap isn't reported as
272-
* error to userspace if not desired.
273-
*
274-
* However, if the existing start matches a pre-existing start, but the
275-
* end element doesn't match the corresponding pre-existing end element,
276-
* we need to report a partial overlap. This is a local condition that
277-
* can be noticed without need for a tracking flag, by checking for a
278-
* local duplicated end for a corresponding start, from left and right,
279-
* separately.
276+
/* Descend the tree to search for an existing element greater than the
277+
* key value to insert that is greater than the new element. This is the
278+
* first element to walk the ordered elements to find possible overlap.
280279
*/
281-
282280
parent = NULL;
283281
p = &priv->root.rb_node;
284282
while (*p != NULL) {
285283
parent = *p;
286284
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
287-
d = memcmp(nft_set_ext_key(&rbe->ext),
288-
nft_set_ext_key(&new->ext),
289-
set->klen);
285+
d = nft_rbtree_cmp(set, rbe, new);
286+
290287
if (d < 0) {
291288
p = &parent->rb_left;
292-
293-
if (nft_rbtree_interval_start(new)) {
294-
if (nft_rbtree_interval_end(rbe) &&
295-
nft_set_elem_active(&rbe->ext, genmask) &&
296-
!nft_set_elem_expired(&rbe->ext) && !*p)
297-
overlap = false;
298-
} else {
299-
if (dup_end_left && !*p)
300-
return -ENOTEMPTY;
301-
302-
overlap = nft_rbtree_interval_end(rbe) &&
303-
nft_set_elem_active(&rbe->ext,
304-
genmask) &&
305-
!nft_set_elem_expired(&rbe->ext);
306-
307-
if (overlap) {
308-
dup_end_right = true;
309-
continue;
310-
}
311-
}
312289
} else if (d > 0) {
313-
p = &parent->rb_right;
290+
if (!first ||
291+
nft_rbtree_update_first(set, rbe, first))
292+
first = &rbe->node;
314293

315-
if (nft_rbtree_interval_end(new)) {
316-
if (dup_end_right && !*p)
317-
return -ENOTEMPTY;
318-
319-
overlap = nft_rbtree_interval_end(rbe) &&
320-
nft_set_elem_active(&rbe->ext,
321-
genmask) &&
322-
!nft_set_elem_expired(&rbe->ext);
323-
324-
if (overlap) {
325-
dup_end_left = true;
326-
continue;
327-
}
328-
} else if (nft_set_elem_active(&rbe->ext, genmask) &&
329-
!nft_set_elem_expired(&rbe->ext)) {
330-
overlap = nft_rbtree_interval_end(rbe);
331-
}
294+
p = &parent->rb_right;
332295
} else {
333-
if (nft_rbtree_interval_end(rbe) &&
334-
nft_rbtree_interval_start(new)) {
296+
if (nft_rbtree_interval_end(rbe))
335297
p = &parent->rb_left;
336-
337-
if (nft_set_elem_active(&rbe->ext, genmask) &&
338-
!nft_set_elem_expired(&rbe->ext))
339-
overlap = false;
340-
} else if (nft_rbtree_interval_start(rbe) &&
341-
nft_rbtree_interval_end(new)) {
298+
else
342299
p = &parent->rb_right;
300+
}
301+
}
302+
303+
if (!first)
304+
first = rb_first(&priv->root);
305+
306+
/* Detect overlap by going through the list of valid tree nodes.
307+
* Values stored in the tree are in reversed order, starting from
308+
* highest to lowest value.
309+
*/
310+
for (node = first; node != NULL; node = rb_next(node)) {
311+
rbe = rb_entry(node, struct nft_rbtree_elem, node);
312+
313+
if (!nft_set_elem_active(&rbe->ext, genmask))
314+
continue;
343315

344-
if (nft_set_elem_active(&rbe->ext, genmask) &&
345-
!nft_set_elem_expired(&rbe->ext))
346-
overlap = false;
347-
} else if (nft_set_elem_active(&rbe->ext, genmask) &&
348-
!nft_set_elem_expired(&rbe->ext)) {
349-
*ext = &rbe->ext;
350-
return -EEXIST;
351-
} else {
352-
overlap = false;
353-
if (nft_rbtree_interval_end(rbe))
354-
p = &parent->rb_left;
355-
else
356-
p = &parent->rb_right;
316+
/* perform garbage collection to avoid bogus overlap reports. */
317+
if (nft_set_elem_expired(&rbe->ext)) {
318+
err = nft_rbtree_gc_elem(set, priv, rbe);
319+
if (err < 0)
320+
return err;
321+
322+
continue;
323+
}
324+
325+
d = nft_rbtree_cmp(set, rbe, new);
326+
if (d == 0) {
327+
/* Matching end element: no need to look for an
328+
* overlapping greater or equal element.
329+
*/
330+
if (nft_rbtree_interval_end(rbe)) {
331+
rbe_le = rbe;
332+
break;
333+
}
334+
335+
/* first element that is greater or equal to key value. */
336+
if (!rbe_ge) {
337+
rbe_ge = rbe;
338+
continue;
339+
}
340+
341+
/* this is a closer more or equal element, update it. */
342+
if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
343+
rbe_ge = rbe;
344+
continue;
357345
}
346+
347+
/* element is equal to key value, make sure flags are
348+
* the same, an existing more or equal start element
349+
* must not be replaced by more or equal end element.
350+
*/
351+
if ((nft_rbtree_interval_start(new) &&
352+
nft_rbtree_interval_start(rbe_ge)) ||
353+
(nft_rbtree_interval_end(new) &&
354+
nft_rbtree_interval_end(rbe_ge))) {
355+
rbe_ge = rbe;
356+
continue;
357+
}
358+
} else if (d > 0) {
359+
/* annotate element greater than the new element. */
360+
rbe_ge = rbe;
361+
continue;
362+
} else if (d < 0) {
363+
/* annotate element less than the new element. */
364+
rbe_le = rbe;
365+
break;
358366
}
367+
}
359368

360-
dup_end_left = dup_end_right = false;
369+
/* - new start element matching existing start element: full overlap
370+
* reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
371+
*/
372+
if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
373+
nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
374+
*ext = &rbe_ge->ext;
375+
return -EEXIST;
376+
}
377+
378+
/* - new end element matching existing end element: full overlap
379+
* reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
380+
*/
381+
if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
382+
nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
383+
*ext = &rbe_le->ext;
384+
return -EEXIST;
361385
}
362386

363-
if (overlap)
387+
/* - new start element with existing closest, less or equal key value
388+
* being a start element: partial overlap, reported as -ENOTEMPTY.
389+
* Anonymous sets allow for two consecutive start element since they
390+
* are constant, skip them to avoid bogus overlap reports.
391+
*/
392+
if (!nft_set_is_anonymous(set) && rbe_le &&
393+
nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
394+
return -ENOTEMPTY;
395+
396+
/* - new end element with existing closest, less or equal key value
397+
* being a end element: partial overlap, reported as -ENOTEMPTY.
398+
*/
399+
if (rbe_le &&
400+
nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
364401
return -ENOTEMPTY;
365402

403+
/* - new end element with existing closest, greater or equal key value
404+
* being an end element: partial overlap, reported as -ENOTEMPTY
405+
*/
406+
if (rbe_ge &&
407+
nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
408+
return -ENOTEMPTY;
409+
410+
/* Accepted element: pick insertion point depending on key value */
411+
parent = NULL;
412+
p = &priv->root.rb_node;
413+
while (*p != NULL) {
414+
parent = *p;
415+
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
416+
d = nft_rbtree_cmp(set, rbe, new);
417+
418+
if (d < 0)
419+
p = &parent->rb_left;
420+
else if (d > 0)
421+
p = &parent->rb_right;
422+
else if (nft_rbtree_interval_end(rbe))
423+
p = &parent->rb_left;
424+
else
425+
p = &parent->rb_right;
426+
}
427+
366428
rb_link_node_rcu(&new->node, parent, p);
367429
rb_insert_color(&new->node, &priv->root);
368430
return 0;

0 commit comments

Comments
 (0)