Skip to content

Commit 0cbc06b

Browse files
Florian Westphalummakynes
authored andcommitted
netfilter: nf_tables: remove synchronize_rcu in commit phase
synchronize_rcu() is expensive. The commit phase currently enforces an unconditional synchronize_rcu() after incrementing the generation counter. This is to make sure that a packet always sees a consistent chain, either nft_do_chain is still using old generation (it will skip the newly added rules), or the new one (it will skip old ones that might still be linked into the list). We could just remove the synchronize_rcu(), it would not cause a crash but it could cause us to evaluate a rule that was removed and new rule for the same packet, instead of either-or. To resolve this, add rule pointer array holding two generations, the current one and the future generation. In commit phase, allocate the rule blob and populate it with the rules that will be active in the new generation. Then, make this rule blob public, replacing the old generation pointer. Then the generation counter can be incremented. nft_do_chain() will either continue to use the current generation (in case loop was invoked right before increment), or the new one. Suggested-by: Pablo Neira Ayuso <[email protected]> Signed-off-by: Florian Westphal <[email protected]> Signed-off-by: Pablo Neira Ayuso <[email protected]>
1 parent 0030879 commit 0cbc06b

File tree

3 files changed

+215
-18
lines changed

3 files changed

+215
-18
lines changed

include/net/netfilter/nf_tables.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -858,6 +858,8 @@ enum nft_chain_flags {
858858
* @name: name of the chain
859859
*/
860860
struct nft_chain {
861+
struct nft_rule *__rcu *rules_gen_0;
862+
struct nft_rule *__rcu *rules_gen_1;
861863
struct list_head rules;
862864
struct list_head list;
863865
struct nft_table *table;
@@ -867,6 +869,9 @@ struct nft_chain {
867869
u8 flags:6,
868870
genmask:2;
869871
char *name;
872+
873+
/* Only used during control plane commit phase: */
874+
struct nft_rule **rules_next;
870875
};
871876

872877
enum nft_chain_types {

net/netfilter/nf_tables_api.c

Lines changed: 197 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1237,12 +1237,29 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
12371237
rcu_assign_pointer(chain->stats, newstats);
12381238
}
12391239

1240+
static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
1241+
{
1242+
struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0);
1243+
struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1);
1244+
1245+
if (g0 != g1)
1246+
kvfree(g1);
1247+
kvfree(g0);
1248+
1249+
/* should be NULL either via abort or via successful commit */
1250+
WARN_ON_ONCE(chain->rules_next);
1251+
kvfree(chain->rules_next);
1252+
}
1253+
12401254
static void nf_tables_chain_destroy(struct nft_ctx *ctx)
12411255
{
12421256
struct nft_chain *chain = ctx->chain;
12431257

12441258
BUG_ON(chain->use > 0);
12451259

1260+
/* no concurrent access possible anymore */
1261+
nf_tables_chain_free_chain_rules(chain);
1262+
12461263
if (nft_is_base_chain(chain)) {
12471264
struct nft_base_chain *basechain = nft_base_chain(chain);
12481265

@@ -1335,6 +1352,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
13351352
module_put(hook->type->owner);
13361353
}
13371354

1355+
struct nft_rules_old {
1356+
struct rcu_head h;
1357+
struct nft_rule **start;
1358+
};
1359+
1360+
static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain,
1361+
unsigned int alloc)
1362+
{
1363+
if (alloc > INT_MAX)
1364+
return NULL;
1365+
1366+
alloc += 1; /* NULL, ends rules */
1367+
if (sizeof(struct nft_rule *) > INT_MAX / alloc)
1368+
return NULL;
1369+
1370+
alloc *= sizeof(struct nft_rule *);
1371+
alloc += sizeof(struct nft_rules_old);
1372+
1373+
return kvmalloc(alloc, GFP_KERNEL);
1374+
}
1375+
13381376
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
13391377
u8 policy, bool create)
13401378
{
@@ -1344,6 +1382,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
13441382
struct nft_stats __percpu *stats;
13451383
struct net *net = ctx->net;
13461384
struct nft_chain *chain;
1385+
struct nft_rule **rules;
13471386
int err;
13481387

13491388
if (table->use == UINT_MAX)
@@ -1406,6 +1445,16 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
14061445
goto err1;
14071446
}
14081447

1448+
rules = nf_tables_chain_alloc_rules(chain, 0);
1449+
if (!rules) {
1450+
err = -ENOMEM;
1451+
goto err1;
1452+
}
1453+
1454+
*rules = NULL;
1455+
rcu_assign_pointer(chain->rules_gen_0, rules);
1456+
rcu_assign_pointer(chain->rules_gen_1, rules);
1457+
14091458
err = nf_tables_register_hook(net, table, chain);
14101459
if (err < 0)
14111460
goto err1;
@@ -5850,21 +5899,162 @@ static void nf_tables_commit_release(struct net *net)
58505899
}
58515900
}
58525901

5902+
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
5903+
{
5904+
struct nft_rule *rule;
5905+
unsigned int alloc = 0;
5906+
int i;
5907+
5908+
/* already handled or inactive chain? */
5909+
if (chain->rules_next || !nft_is_active_next(net, chain))
5910+
return 0;
5911+
5912+
rule = list_entry(&chain->rules, struct nft_rule, list);
5913+
i = 0;
5914+
5915+
list_for_each_entry_continue(rule, &chain->rules, list) {
5916+
if (nft_is_active_next(net, rule))
5917+
alloc++;
5918+
}
5919+
5920+
chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc);
5921+
if (!chain->rules_next)
5922+
return -ENOMEM;
5923+
5924+
list_for_each_entry_continue(rule, &chain->rules, list) {
5925+
if (nft_is_active_next(net, rule))
5926+
chain->rules_next[i++] = rule;
5927+
}
5928+
5929+
chain->rules_next[i] = NULL;
5930+
return 0;
5931+
}
5932+
5933+
static void nf_tables_commit_chain_prepare_cancel(struct net *net)
5934+
{
5935+
struct nft_trans *trans, *next;
5936+
5937+
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
5938+
struct nft_chain *chain = trans->ctx.chain;
5939+
5940+
if (trans->msg_type == NFT_MSG_NEWRULE ||
5941+
trans->msg_type == NFT_MSG_DELRULE) {
5942+
kvfree(chain->rules_next);
5943+
chain->rules_next = NULL;
5944+
}
5945+
}
5946+
}
5947+
5948+
static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h)
5949+
{
5950+
struct nft_rules_old *o = container_of(h, struct nft_rules_old, h);
5951+
5952+
kvfree(o->start);
5953+
}
5954+
5955+
static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules)
5956+
{
5957+
struct nft_rule **r = rules;
5958+
struct nft_rules_old *old;
5959+
5960+
while (*r)
5961+
r++;
5962+
5963+
r++; /* rcu_head is after end marker */
5964+
old = (void *) r;
5965+
old->start = rules;
5966+
5967+
call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old);
5968+
}
5969+
5970+
static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain)
5971+
{
5972+
struct nft_rule **g0, **g1;
5973+
bool next_genbit;
5974+
5975+
next_genbit = nft_gencursor_next(net);
5976+
5977+
g0 = rcu_dereference_protected(chain->rules_gen_0,
5978+
lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
5979+
g1 = rcu_dereference_protected(chain->rules_gen_1,
5980+
lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
5981+
5982+
/* No changes to this chain? */
5983+
if (chain->rules_next == NULL) {
5984+
/* chain had no change in last or next generation */
5985+
if (g0 == g1)
5986+
return;
5987+
/*
5988+
* chain had no change in this generation; make sure next
5989+
* one uses same rules as current generation.
5990+
*/
5991+
if (next_genbit) {
5992+
rcu_assign_pointer(chain->rules_gen_1, g0);
5993+
nf_tables_commit_chain_free_rules_old(g1);
5994+
} else {
5995+
rcu_assign_pointer(chain->rules_gen_0, g1);
5996+
nf_tables_commit_chain_free_rules_old(g0);
5997+
}
5998+
5999+
return;
6000+
}
6001+
6002+
if (next_genbit)
6003+
rcu_assign_pointer(chain->rules_gen_1, chain->rules_next);
6004+
else
6005+
rcu_assign_pointer(chain->rules_gen_0, chain->rules_next);
6006+
6007+
chain->rules_next = NULL;
6008+
6009+
if (g0 == g1)
6010+
return;
6011+
6012+
if (next_genbit)
6013+
nf_tables_commit_chain_free_rules_old(g1);
6014+
else
6015+
nf_tables_commit_chain_free_rules_old(g0);
6016+
}
6017+
58536018
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
58546019
{
58556020
struct nft_trans *trans, *next;
58566021
struct nft_trans_elem *te;
6022+
struct nft_chain *chain;
6023+
struct nft_table *table;
58576024

5858-
/* Bump generation counter, invalidate any dump in progress */
5859-
while (++net->nft.base_seq == 0);
6025+
/* 1. Allocate space for next generation rules_gen_X[] */
6026+
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
6027+
int ret;
58606028

5861-
/* A new generation has just started */
5862-
net->nft.gencursor = nft_gencursor_next(net);
6029+
if (trans->msg_type == NFT_MSG_NEWRULE ||
6030+
trans->msg_type == NFT_MSG_DELRULE) {
6031+
chain = trans->ctx.chain;
6032+
6033+
ret = nf_tables_commit_chain_prepare(net, chain);
6034+
if (ret < 0) {
6035+
nf_tables_commit_chain_prepare_cancel(net);
6036+
return ret;
6037+
}
6038+
}
6039+
}
58636040

5864-
/* Make sure all packets have left the previous generation before
5865-
* purging old rules.
6041+
/* step 2. Make rules_gen_X visible to packet path */
6042+
list_for_each_entry(table, &net->nft.tables, list) {
6043+
list_for_each_entry(chain, &table->chains, list) {
6044+
if (!nft_is_active_next(net, chain))
6045+
continue;
6046+
nf_tables_commit_chain_active(net, chain);
6047+
}
6048+
}
6049+
6050+
/*
6051+
* Bump generation counter, invalidate any dump in progress.
6052+
* Cannot fail after this point.
58666053
*/
5867-
synchronize_rcu();
6054+
while (++net->nft.base_seq == 0);
6055+
6056+
/* step 3. Start new generation, rules_gen_X now in use. */
6057+
net->nft.gencursor = nft_gencursor_next(net);
58686058

58696059
list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
58706060
switch (trans->msg_type) {

net/netfilter/nf_tables_core.c

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -133,35 +133,37 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
133133

134134
struct nft_jumpstack {
135135
const struct nft_chain *chain;
136-
const struct nft_rule *rule;
136+
struct nft_rule *const *rules;
137137
};
138138

139139
unsigned int
140140
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
141141
{
142142
const struct nft_chain *chain = priv, *basechain = chain;
143143
const struct net *net = nft_net(pkt);
144+
struct nft_rule *const *rules;
144145
const struct nft_rule *rule;
145146
const struct nft_expr *expr, *last;
146147
struct nft_regs regs;
147148
unsigned int stackptr = 0;
148149
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
149-
unsigned int gencursor = nft_genmask_cur(net);
150+
bool genbit = READ_ONCE(net->nft.gencursor);
150151
struct nft_traceinfo info;
151152

152153
info.trace = false;
153154
if (static_branch_unlikely(&nft_trace_enabled))
154155
nft_trace_init(&info, pkt, &regs.verdict, basechain);
155156
do_chain:
156-
rule = list_entry(&chain->rules, struct nft_rule, list);
157+
if (genbit)
158+
rules = rcu_dereference(chain->rules_gen_1);
159+
else
160+
rules = rcu_dereference(chain->rules_gen_0);
161+
157162
next_rule:
163+
rule = *rules;
158164
regs.verdict.code = NFT_CONTINUE;
159-
list_for_each_entry_continue_rcu(rule, &chain->rules, list) {
160-
161-
/* This rule is not active, skip. */
162-
if (unlikely(rule->genmask & gencursor))
163-
continue;
164-
165+
for (; *rules ; rules++) {
166+
rule = *rules;
165167
nft_rule_for_each_expr(expr, last, rule) {
166168
if (expr->ops == &nft_cmp_fast_ops)
167169
nft_cmp_fast_eval(expr, &regs);
@@ -199,7 +201,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
199201
case NFT_JUMP:
200202
BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE);
201203
jumpstack[stackptr].chain = chain;
202-
jumpstack[stackptr].rule = rule;
204+
jumpstack[stackptr].rules = rules + 1;
203205
stackptr++;
204206
/* fall through */
205207
case NFT_GOTO:
@@ -221,7 +223,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
221223
if (stackptr > 0) {
222224
stackptr--;
223225
chain = jumpstack[stackptr].chain;
224-
rule = jumpstack[stackptr].rule;
226+
rules = jumpstack[stackptr].rules;
225227
goto next_rule;
226228
}
227229

0 commit comments

Comments
 (0)