
Commit 7845447

Stephen Hemminger authored and kaber committed
netfilter: iptables: lock free counters
The reader/writer lock in ip_tables is acquired in the critical path of processing packets and is one of the reasons that merely loading iptables can cause a 20% performance loss. The rwlock serves two functions:

1) It prevents changes to table state (xt_replace) while the table is in use. This is now handled by doing RCU on the xt_table: when a table is replaced, the new table(s) are put in place and the old table(s) are freed after an RCU grace period.

2) It provides synchronization when accessing the counter values. This is now handled by swapping in new table_info entries for each cpu, then summing the old values and putting the result back onto one cpu. On a busy system this may cause sampling to occur at different times on each cpu, but no packet/byte counts are lost in the process.

Signed-off-by: Stephen Hemminger <[email protected]>

Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here). BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago).

Acked-by: Eric Dumazet <[email protected]>
Signed-off-by: Patrick McHardy <[email protected]>
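[Editor's note] The core of change (2) is a swap-and-sum snapshot: clone the table with zeroed counters, publish the clone so the packet path starts writing there, wait until no cpu can still be writing to the old copy, then sum the retired per-cpu counters and fold the total back onto one cpu. Below is that algorithm reduced to plain, runnable userspace C; the RCU publish and grace period (xt_table_entry_swap_rcu() and synchronize_net() in the kernel) are elided, and NR_CPUS/NENTRIES are illustrative stand-ins, not kernel values.

/* swap-and-sum counter snapshot, reduced to plain userspace C */
#include <stdio.h>
#include <string.h>

#define NR_CPUS  4
#define NENTRIES 2

struct xt_counter { unsigned long long pcnt, bcnt; };

static struct xt_counter live[NR_CPUS][NENTRIES]; /* hot path writes here */

int main(void)
{
        struct xt_counter old[NR_CPUS][NENTRIES];
        struct xt_counter snap[NENTRIES] = {{0}};
        int cpu, e;

        /* pretend each cpu counted some traffic */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                for (e = 0; e < NENTRIES; e++)
                        live[cpu][e] = (struct xt_counter){ 10u + cpu, 1500u * (cpu + 1) };

        /* 1) swap zeroed copies into place, keep the retired ones
         *    (kernel: clone_counters + xt_table_entry_swap_rcu + synchronize_net) */
        memcpy(old, live, sizeof(old));
        memset(live, 0, sizeof(live));

        /* 2) get_counters: sum the retired per-cpu copies into one snapshot */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                for (e = 0; e < NENTRIES; e++) {
                        snap[e].pcnt += old[cpu][e].pcnt;
                        snap[e].bcnt += old[cpu][e].bcnt;
                }

        /* 3) put_counters: fold the totals back onto one cpu,
         *    so no packet/byte counts are lost */
        for (e = 0; e < NENTRIES; e++) {
                live[0][e].pcnt += snap[e].pcnt;
                live[0][e].bcnt += snap[e].bcnt;
                printf("entry %d: %llu pkts, %llu bytes\n",
                       e, snap[e].pcnt, snap[e].bcnt);
        }
        return 0;
}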
1 parent 323dbf9 commit 7845447

5 files changed: +284 -102 lines


include/linux/netfilter/x_tables.h

Lines changed: 4 additions & 2 deletions

@@ -353,7 +353,7 @@ struct xt_table
        unsigned int valid_hooks;

        /* Lock for the curtain */
-       rwlock_t lock;
+       struct mutex lock;

        /* Man behind the curtain... */
        struct xt_table_info *private;
@@ -385,7 +385,7 @@ struct xt_table_info

        /* ipt_entry tables: one per CPU */
        /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-       char *entries[1];
+       void *entries[1];
 };

 #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
@@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);

 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
+extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
+                                    struct xt_table_info *new);

 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
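[Editor's note] xt_table_entry_swap_rcu() is only declared here; its definition is presumably in net/netfilter/x_tables.c, one of the two changed files not shown in this excerpt. Conceptually it exchanges the per-cpu entries pointers of two xt_table_info structures, publishing each replacement with rcu_assign_pointer(), which is also why entries[] above becomes void *: the RCU pointer helpers then apply without casts. A sketch of its shape, not the verbatim kernel code:

void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
                             struct xt_table_info *newinfo)
{
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *p = oldinfo->entries[cpu];

                /* a reader under rcu_read_lock() sees one copy or the other */
                rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
                newinfo->entries[cpu] = p;
        }
}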

net/ipv4/netfilter/arp_tables.c

Lines changed: 88 additions & 27 deletions

@@ -261,9 +261,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
        indev = in ? in->name : nulldevname;
        outdev = out ? out->name : nulldevname;

-       read_lock_bh(&table->lock);
-       private = table->private;
-       table_base = (void *)private->entries[smp_processor_id()];
+       rcu_read_lock();
+       private = rcu_dereference(table->private);
+       table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
        e = get_entry(table_base, private->hook_entry[hook]);
        back = get_entry(table_base, private->underflow[hook]);

@@ -335,7 +336,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
                        e = (void *)e + e->next_offset;
                }
        } while (!hotdrop);
-       read_unlock_bh(&table->lock);
+
+       rcu_read_unlock();

        if (hotdrop)
                return NF_DROP;
@@ -738,11 +740,65 @@ static void get_counters(const struct xt_table_info *t,
        }
 }

-static inline struct xt_counters *alloc_counters(struct xt_table *table)
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+                     const struct xt_counters addme[],
+                     unsigned int *i)
+{
+       ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+       (*i)++;
+       return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+                         const struct xt_counters counters[])
+{
+       unsigned int i, cpu;
+
+       local_bh_disable();
+       cpu = smp_processor_id();
+       i = 0;
+       ARPT_ENTRY_ITERATE(t->entries[cpu],
+                          t->size,
+                          add_counter_to_entry,
+                          counters,
+                          &i);
+       local_bh_enable();
+}
+
+static inline int
+zero_entry_counter(struct arpt_entry *e, void *arg)
+{
+       e->counters.bcnt = 0;
+       e->counters.pcnt = 0;
+       return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+       unsigned int cpu;
+       const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+       memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+       for_each_possible_cpu(cpu) {
+               memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+               ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+                                  zero_entry_counter, NULL);
+       }
+}
+
+static struct xt_counters *alloc_counters(struct xt_table *table)
 {
        unsigned int countersize;
        struct xt_counters *counters;
-       const struct xt_table_info *private = table->private;
+       struct xt_table_info *private = table->private;
+       struct xt_table_info *info;

        /* We need atomic snapshot of counters: rest doesn't change
         * (other than comefrom, which userspace doesn't care
@@ -752,14 +808,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table)
        counters = vmalloc_node(countersize, numa_node_id());

        if (counters == NULL)
-               return ERR_PTR(-ENOMEM);
+               goto nomem;
+
+       info = xt_alloc_table_info(private->size);
+       if (!info)
+               goto free_counters;

-       /* First, sum counters... */
-       write_lock_bh(&table->lock);
-       get_counters(private, counters);
-       write_unlock_bh(&table->lock);
+       clone_counters(info, private);
+
+       mutex_lock(&table->lock);
+       xt_table_entry_swap_rcu(private, info);
+       synchronize_net();      /* Wait until smoke has cleared */
+
+       get_counters(info, counters);
+       put_counters(private, counters);
+       mutex_unlock(&table->lock);
+
+       xt_free_table_info(info);

        return counters;
+
+ free_counters:
+       vfree(counters);
+ nomem:
+       return ERR_PTR(-ENOMEM);
 }

 static int copy_entries_to_user(unsigned int total_size,
@@ -1099,20 +1171,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
        return ret;
 }

-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
-                                       const struct xt_counters addme[],
-                                       unsigned int *i)
-{
-
-       ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-       (*i)++;
-       return 0;
-}
-
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
                            int compat)
 {
@@ -1172,13 +1230,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
                goto free;
        }

-       write_lock_bh(&t->lock);
+       mutex_lock(&t->lock);
        private = t->private;
        if (private->number != num_counters) {
                ret = -EINVAL;
                goto unlock_up_free;
        }

+       preempt_disable();
        i = 0;
        /* Choose the copy that is on our node */
        loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1187,8 +1246,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
                           add_counter_to_entry,
                           paddc,
                           &i);
+       preempt_enable();
 unlock_up_free:
-       write_unlock_bh(&t->lock);
+       mutex_unlock(&t->lock);
+
        xt_table_unlock(t);
        module_put(t->me);
 free:
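[Editor's note] One detail worth flagging in the do_add_counters() hunks above: write_lock_bh() used to both serialize writers and implicitly keep the task on one cpu, but a mutex can sleep and offers no such pinning, so smp_processor_id() alone would no longer be stable while walking this cpu's copy of the table. The explicit preempt_disable()/preempt_enable() pair restores that guarantee for the short counter walk. The resulting locking shape, shown in isolation (a fragment, not compilable on its own):

        mutex_lock(&t->lock);           /* serialize against replace and snapshot */
        preempt_disable();              /* keep smp_processor_id() stable */
        loc_cpu_entry = private->entries[smp_processor_id()];
        /* ... add the user-supplied counters into this cpu's copy ... */
        preempt_enable();
        mutex_unlock(&t->lock);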

net/ipv4/netfilter/ip_tables.c

Lines changed: 87 additions & 33 deletions

@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
        mtpar.family = tgpar.family = NFPROTO_IPV4;
        tgpar.hooknum = hook;

-       read_lock_bh(&table->lock);
        IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-       private = table->private;
-       table_base = (void *)private->entries[smp_processor_id()];
+
+       rcu_read_lock();
+       private = rcu_dereference(table->private);
+       table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
        e = get_entry(table_base, private->hook_entry[hook]);

        /* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
                }
        } while (!hotdrop);

-       read_unlock_bh(&table->lock);
+       rcu_read_unlock();

 #ifdef DEBUG_ALLOW_ALL
        return NF_ACCEPT;
@@ -924,13 +926,68 @@ get_counters(const struct xt_table_info *t,
                          counters,
                          &i);
        }
+
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+                     const struct xt_counters addme[],
+                     unsigned int *i)
+{
+       ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+       (*i)++;
+       return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+                         const struct xt_counters counters[])
+{
+       unsigned int i, cpu;
+
+       local_bh_disable();
+       cpu = smp_processor_id();
+       i = 0;
+       IPT_ENTRY_ITERATE(t->entries[cpu],
+                         t->size,
+                         add_counter_to_entry,
+                         counters,
+                         &i);
+       local_bh_enable();
+}
+
+
+static inline int
+zero_entry_counter(struct ipt_entry *e, void *arg)
+{
+       e->counters.bcnt = 0;
+       e->counters.pcnt = 0;
+       return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+       unsigned int cpu;
+       const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+       memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+       for_each_possible_cpu(cpu) {
+               memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+               IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+                                 zero_entry_counter, NULL);
+       }
 }

 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
        unsigned int countersize;
        struct xt_counters *counters;
-       const struct xt_table_info *private = table->private;
+       struct xt_table_info *private = table->private;
+       struct xt_table_info *info;

        /* We need atomic snapshot of counters: rest doesn't change
           (other than comefrom, which userspace doesn't care
@@ -939,14 +996,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
        counters = vmalloc_node(countersize, numa_node_id());

        if (counters == NULL)
-               return ERR_PTR(-ENOMEM);
+               goto nomem;

-       /* First, sum counters... */
-       write_lock_bh(&table->lock);
-       get_counters(private, counters);
-       write_unlock_bh(&table->lock);
+       info = xt_alloc_table_info(private->size);
+       if (!info)
+               goto free_counters;
+
+       clone_counters(info, private);
+
+       mutex_lock(&table->lock);
+       xt_table_entry_swap_rcu(private, info);
+       synchronize_net();      /* Wait until smoke has cleared */
+
+       get_counters(info, counters);
+       put_counters(private, counters);
+       mutex_unlock(&table->lock);
+
+       xt_free_table_info(info);

        return counters;
+
+ free_counters:
+       vfree(counters);
+ nomem:
+       return ERR_PTR(-ENOMEM);
 }

 static int
@@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
        return ret;
 }

-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-                     const struct xt_counters addme[],
-                     unsigned int *i)
-{
-#if 0
-       duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-                *i,
-                (long unsigned int)e->counters.pcnt,
-                (long unsigned int)e->counters.bcnt,
-                (long unsigned int)addme[*i].pcnt,
-                (long unsigned int)addme[*i].bcnt);
-#endif
-
-       ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-       (*i)++;
-       return 0;
-}

 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
                goto free;
        }

-       write_lock_bh(&t->lock);
+       mutex_lock(&t->lock);
        private = t->private;
        if (private->number != num_counters) {
                ret = -EINVAL;
                goto unlock_up_free;
        }

+       preempt_disable();
        i = 0;
        /* Choose the copy that is on our node */
        loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
                          add_counter_to_entry,
                          paddc,
                          &i);
+       preempt_enable();
 unlock_up_free:
-       write_unlock_bh(&t->lock);
+       mutex_unlock(&t->lock);
        xt_table_unlock(t);
        module_put(t->me);
 free:
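[Editor's note] The packet-path change is the same shape in both files: the read_lock_bh()/read_unlock_bh() pair becomes an RCU read-side critical section, so the hot path no longer dirties a shared lock cacheline. Note that both table->private and the per-cpu entries pointer are read through rcu_dereference(), since either can now be swapped out from under a reader, whether by xt_replace() or by a counter snapshot. The reader pattern in isolation (a fragment):

        rcu_read_lock();
        private = rcu_dereference(table->private);
        table_base = rcu_dereference(private->entries[smp_processor_id()]);

        /* ... traverse this cpu's copy of the ruleset ... */

        rcu_read_unlock();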
