
Commit 03e5fd0

Liping Zhang authored and ummakynes committed
netfilter: nft_set_rbtree: use per-set rwlock to improve the scalability
Karel Rericha reported that in his test case, ICMP packets going through boxes normally had about 5ms latency. But when running nft to list the sets with interval flags, latency would go up to 30-100ms. This was observed with router throughput from 600Mbps to 2Gbps.

This happens because a single global spinlock protects all rbtree sets, so "dumping sets" inevitably races with "key lookup". But both of these are actually _readers_, so it is safe to convert the spinlock to an rwlock and avoid contention between them. Also use a per-set rwlock, since each set is independent.

Reported-by: Karel Rericha <[email protected]>
Tested-by: Karel Rericha <[email protected]>
Signed-off-by: Liping Zhang <[email protected]>
Signed-off-by: Pablo Neira Ayuso <[email protected]>
1 parent 2cb4bbd commit 03e5fd0
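
For context on why the conversion helps: a reader-writer lock admits any number of concurrent readers, while writers get exclusive access, and both the packet-path key lookup and the netlink set dump only read the tree. The following is a minimal userspace sketch of that pattern, using POSIX pthread_rwlock_t as a stand-in for the kernel's rwlock_t and read_lock_bh()/write_lock_bh(); all names here (demo_set, demo_lookup, demo_insert) are hypothetical and not part of the patch.

/*
 * Userspace sketch of the per-set rwlock pattern adopted by this
 * commit.  pthread_rwlock_t stands in for the kernel's rwlock_t;
 * demo_set/demo_lookup/demo_insert are illustrative names only.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_set {
        pthread_rwlock_t lock;  /* one lock per set, not one global lock */
        int placeholder;        /* stands in for the rbtree root */
};

/* Lookup and dump are both readers: any number of threads may hold
 * the read side at once, so they no longer serialize against each
 * other the way they did under a single spinlock. */
static bool demo_lookup(struct demo_set *s, int key)
{
        bool found;

        pthread_rwlock_rdlock(&s->lock);
        found = (s->placeholder == key);  /* tree walk would go here */
        pthread_rwlock_unlock(&s->lock);
        return found;
}

/* Insert and remove mutate the tree, so they take the write side,
 * excluding both readers and other writers. */
static void demo_insert(struct demo_set *s, int key)
{
        pthread_rwlock_wrlock(&s->lock);
        s->placeholder = key;             /* tree insert would go here */
        pthread_rwlock_unlock(&s->lock);
}

int main(void)
{
        struct demo_set s = { .lock = PTHREAD_RWLOCK_INITIALIZER };

        demo_insert(&s, 42);
        printf("lookup(42) -> %d\n", demo_lookup(&s, 42));
        return 0;
}

Because the lock is embedded in each set rather than being global, lookups on one set also stop contending with dumps or updates of unrelated sets. The kernel diff below keeps the _bh lock variants so that bottom halves stay disabled while the lock is held, since the lookup path runs from packet-processing softirq context.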

1 file changed: 16 additions (+), 15 deletions (-)

net/netfilter/nft_set_rbtree.c

@@ -18,9 +18,8 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
 
-static DEFINE_SPINLOCK(nft_rbtree_lock);
-
 struct nft_rbtree {
+	rwlock_t		lock;
 	struct rb_root		root;
 };
 
@@ -44,14 +43,14 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
 static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
 			      const u32 *key, const struct nft_set_ext **ext)
 {
-	const struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree *priv = nft_set_priv(set);
 	const struct nft_rbtree_elem *rbe, *interval = NULL;
 	u8 genmask = nft_genmask_cur(net);
 	const struct rb_node *parent;
 	const void *this;
 	int d;
 
-	spin_lock_bh(&nft_rbtree_lock);
+	read_lock_bh(&priv->lock);
 	parent = priv->root.rb_node;
 	while (parent != NULL) {
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
@@ -75,7 +74,7 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
 		}
 		if (nft_rbtree_interval_end(rbe))
 			goto out;
-		spin_unlock_bh(&nft_rbtree_lock);
+		read_unlock_bh(&priv->lock);
 
 		*ext = &rbe->ext;
 		return true;
@@ -85,12 +84,12 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
 	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
 	    nft_set_elem_active(&interval->ext, genmask) &&
 	    !nft_rbtree_interval_end(interval)) {
-		spin_unlock_bh(&nft_rbtree_lock);
+		read_unlock_bh(&priv->lock);
 		*ext = &interval->ext;
 		return true;
 	}
 out:
-	spin_unlock_bh(&nft_rbtree_lock);
+	read_unlock_bh(&priv->lock);
 	return false;
 }
 
@@ -140,12 +139,13 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			     const struct nft_set_elem *elem,
 			     struct nft_set_ext **ext)
 {
+	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe = elem->priv;
 	int err;
 
-	spin_lock_bh(&nft_rbtree_lock);
+	write_lock_bh(&priv->lock);
 	err = __nft_rbtree_insert(net, set, rbe, ext);
-	spin_unlock_bh(&nft_rbtree_lock);
+	write_unlock_bh(&priv->lock);
 
 	return err;
 }
@@ -157,9 +157,9 @@ static void nft_rbtree_remove(const struct net *net,
 	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe = elem->priv;
 
-	spin_lock_bh(&nft_rbtree_lock);
+	write_lock_bh(&priv->lock);
 	rb_erase(&rbe->node, &priv->root);
-	spin_unlock_bh(&nft_rbtree_lock);
+	write_unlock_bh(&priv->lock);
 }
 
 static void nft_rbtree_activate(const struct net *net,
@@ -224,12 +224,12 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 			    struct nft_set *set,
 			    struct nft_set_iter *iter)
 {
-	const struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe;
 	struct nft_set_elem elem;
 	struct rb_node *node;
 
-	spin_lock_bh(&nft_rbtree_lock);
+	read_lock_bh(&priv->lock);
 	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
 
@@ -242,13 +242,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 
 		iter->err = iter->fn(ctx, set, iter, &elem);
 		if (iter->err < 0) {
-			spin_unlock_bh(&nft_rbtree_lock);
+			read_unlock_bh(&priv->lock);
 			return;
 		}
 cont:
 		iter->count++;
 	}
-	spin_unlock_bh(&nft_rbtree_lock);
+	read_unlock_bh(&priv->lock);
 }
 
 static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[])
@@ -262,6 +262,7 @@ static int nft_rbtree_init(const struct nft_set *set,
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 
+	rwlock_init(&priv->lock);
 	priv->root = RB_ROOT;
 	return 0;
 }
