* [RFC PATCH] netfilter: remove the duplicate tables
@ 2010-11-18 14:39 Changli Gao
  2010-11-18 15:43 ` Eric Dumazet
From: Changli Gao @ 2010-11-18 14:39 UTC
  To: Patrick McHardy
  Cc: David S. Miller, Eric Dumazet, netfilter-devel, netdev, Changli Gao

As only the xt_counters are private to each CPU, we don't need to
maintain a whole copy of the table for each CPU.

In kernel space, we reuse the memory of ipt_entry.counters to store a
pointer to a percpu xt_counters. When iptables runs, it only updates
the counters on its own CPU.

On non-SMP platforms, no change is made.

Only the iptables code is converted. Thanks for reviewing.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 include/linux/netfilter/x_tables.h |   47 ++++++++++++++++--
 net/ipv4/netfilter/ip_tables.c     |   94 ++++++++++++++-----------------------
 net/netfilter/x_tables.c           |   33 ++++--------
 3 files changed, 90 insertions(+), 84 deletions(-)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec0..593fe04 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -103,7 +103,6 @@ struct _xt_align {
 /* Error verdict. */
 #define XT_ERROR_TARGET "ERROR"
 
-#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
 struct xt_counters {
@@ -404,13 +403,9 @@ struct xt_table_info {
 	unsigned int stacksize;
 	unsigned int __percpu *stackptr;
 	void ***jumpstack;
-	/* ipt_entry tables: one per CPU */
-	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	void *entries[1];
+	void *entries;
 };
 
-#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
-			  + nr_cpu_ids * sizeof(char *))
 extern int xt_register_target(struct xt_target *target);
 extern void xt_unregister_target(struct xt_target *target);
 extern int xt_register_targets(struct xt_target *target, unsigned int n);
@@ -550,6 +545,46 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 extern struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *);
 extern void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *);
 
+#ifdef CONFIG_SMP
+#define xt_counters_kern(c) (*(struct xt_counters __percpu **)(c))
+#endif
+
+static inline int xt_counters_kern_init(struct xt_counters *c)
+{
+#ifdef CONFIG_SMP
+	xt_counters_kern(c) = alloc_percpu(struct xt_counters);
+	if (xt_counters_kern(c) == NULL)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+static inline void xt_counters_kern_destroy(struct xt_counters *c)
+{
+#ifdef CONFIG_SMP
+	free_percpu(xt_counters_kern(c));
+#endif
+}
+
+static inline struct xt_counters *xt_counters_kern_cpu(struct xt_counters *c,
+						       int cpu)
+{
+#ifdef CONFIG_SMP
+	return per_cpu_ptr(xt_counters_kern(c), cpu);
+#else
+	return c;
+#endif
+}
+
+static inline struct xt_counters *xt_counters_kern_this(struct xt_counters *c)
+{
+#ifdef CONFIG_SMP
+	return this_cpu_ptr(xt_counters_kern(c));
+#else
+	return c;
+#endif
+}
+
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d63..6a94ce0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -270,7 +270,7 @@ static void trace_packet(const struct sk_buff *skb,
 	const struct ipt_entry *iter;
 	unsigned int rulenum = 0;
 
-	table_base = private->entries[smp_processor_id()];
+	table_base = private->entries;
 	root = get_entry(table_base, private->hook_entry[hook]);
 
 	hookname = chainname = hooknames[hook];
@@ -334,7 +334,7 @@ ipt_do_table(struct sk_buff *skb,
 	xt_info_rdlock_bh();
 	private = table->private;
 	cpu        = smp_processor_id();
-	table_base = private->entries[cpu];
+	table_base = private->entries;
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
 	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
@@ -364,7 +364,7 @@ ipt_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, skb->len, 1);
+		ADD_COUNTER(*xt_counters_kern_this(&e->counters), skb->len, 1);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -785,6 +785,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
 	par.family   = NFPROTO_IPV4;
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
+	xt_counters_kern_destroy(&e->counters);
 	module_put(par.target->me);
 }
 
@@ -868,12 +869,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -884,42 +879,21 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU.
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
+	struct xt_counters *counter;
 
+	memset(counters, 0, sizeof(struct xt_counters) * t->number);
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
 		i = 0;
 		local_bh_disable();
 		xt_info_wrlock(cpu);
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+		xt_entry_foreach(iter, t->entries, t->size) {
+			counter = xt_counters_kern_cpu(&iter->counters, cpu);
+			ADD_COUNTER(counters[i], counter->bcnt, counter->pcnt);
 			++i; /* macro does multi eval of i */
 		}
 		xt_info_wrunlock(cpu);
 		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -962,7 +936,7 @@ copy_entries_to_user(unsigned int total_size,
 	 * This choice is lazy (because current thread is
 	 * allowed to migrate to another cpu)
 	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
@@ -1076,10 +1050,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1199,9 +1173,14 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct xt_table *t;
 	struct xt_table_info *oldinfo;
 	struct xt_counters *counters;
-	void *loc_cpu_old_entry;
 	struct ipt_entry *iter;
 
+	xt_entry_foreach(iter, newinfo->entries, newinfo->size) {
+		ret = xt_counters_kern_init(&iter->counters);
+		if (ret < 0)
+			return ret;
+	}
+
 	ret = 0;
 	counters = vmalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
@@ -1242,8 +1221,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+	xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
 		cleanup_entry(iter, net);
 
 	xt_free_table_info(oldinfo);
@@ -1283,8 +1261,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1325,7 +1302,6 @@ do_add_counters(struct net *net, const void __user *user,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	void *loc_cpu_entry;
 	struct ipt_entry *iter;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
@@ -1382,10 +1358,10 @@ do_add_counters(struct net *net, const void __user *user,
 	i = 0;
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	loc_cpu_entry = private->entries[curcpu];
 	xt_info_wrlock(curcpu);
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+	xt_entry_foreach(iter, private->entries, private->size) {
+		ADD_COUNTER(*xt_counters_kern_this(&iter->counters),
+			    paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_info_wrunlock(curcpu);
@@ -1728,7 +1704,7 @@ translate_compat_table(struct net *net,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1780,11 +1756,6 @@ translate_compat_table(struct net *net,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1828,7 +1799,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 		return -ENOMEM;
 
 	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1911,7 +1882,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	 * This choice is lazy (because current thread is
 	 * allowed to migrate to another cpu)
 	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	pos = userptr;
 	size = total_size;
 	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
@@ -2081,6 +2052,7 @@ struct xt_table *ipt_register_table(struct net *net,
 	struct xt_table_info bootstrap = {0};
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
+	struct ipt_entry *iter;
 
 	newinfo = xt_alloc_table_info(repl->size);
 	if (!newinfo) {
@@ -2089,13 +2061,19 @@ struct xt_table *ipt_register_table(struct net *net,
 	}
 
 	/* choose the copy on our node/cpu, but dont care about preemption */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
 	if (ret != 0)
 		goto out_free;
 
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) {
+		ret = xt_counters_kern_init(&iter->counters);
+		if (ret < 0)
+			goto out_free;
+	}
+
 	new_table = xt_register_table(net, table, &bootstrap, newinfo);
 	if (IS_ERR(new_table)) {
 		ret = PTR_ERR(new_table);
@@ -2105,6 +2083,8 @@ struct xt_table *ipt_register_table(struct net *net,
 	return new_table;
 
 out_free:
+	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+		xt_counters_kern_destroy(&iter->counters);
 	xt_free_table_info(newinfo);
 out:
 	return ERR_PTR(ret);
@@ -2120,7 +2100,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter, net);
 	if (private->number > private->initial_entries)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8046350..8053cbe 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -639,31 +639,24 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
 struct xt_table_info *xt_alloc_table_info(unsigned int size)
 {
 	struct xt_table_info *newinfo;
-	int cpu;
 
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
 
-	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
+	newinfo = kzalloc(sizeof(*newinfo), GFP_KERNEL);
 	if (!newinfo)
 		return NULL;
 
 	newinfo->size = size;
 
-	for_each_possible_cpu(cpu) {
-		if (size <= PAGE_SIZE)
-			newinfo->entries[cpu] = kmalloc_node(size,
-							GFP_KERNEL,
-							cpu_to_node(cpu));
-		else
-			newinfo->entries[cpu] = vmalloc_node(size,
-							cpu_to_node(cpu));
-
-		if (newinfo->entries[cpu] == NULL) {
-			xt_free_table_info(newinfo);
-			return NULL;
-		}
+	if (size <= PAGE_SIZE)
+		newinfo->entries = kmalloc(size, GFP_KERNEL);
+	else
+		newinfo->entries = vmalloc(size);
+	if (newinfo->entries == NULL) {
+		xt_free_table_info(newinfo);
+		return NULL;
 	}
 
 	return newinfo;
@@ -674,12 +667,10 @@ void xt_free_table_info(struct xt_table_info *info)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
-	}
+	if (info->size <= PAGE_SIZE)
+		kfree(info->entries);
+	else
+		vfree(info->entries);
 
 	if (info->jumpstack != NULL) {
 		if (sizeof(void *) * info->stacksize > PAGE_SIZE) {


* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-18 14:39 [RFC PATCH] netfilter: remove the duplicate tables Changli Gao
@ 2010-11-18 15:43 ` Eric Dumazet
  2010-11-18 23:36   ` Changli Gao
  2010-11-19 11:15   ` Jan Engelhardt
From: Eric Dumazet @ 2010-11-18 15:43 UTC
  To: Changli Gao; +Cc: Patrick McHardy, David S. Miller, netfilter-devel, netdev

On Thursday, 18 November 2010 at 22:39 +0800, Changli Gao wrote:
> As only the xt_counters are private to each CPU, we don't need to
> maintain a whole copy of the table for each CPU.
> 
> In kernel space, we reuse the memory of ipt_entry.counters to store a
> pointer to a percpu xt_counters. When iptables runs, it only updates
> the counters on its own CPU.
> 
> On non-SMP platforms, no change is made.
> 
> Only the iptables code is converted. Thanks for reviewing.
> 

Changli

I already answered you that (difficult) work was in progress, yet you
still post a patch that needs our review and time? This is crazy.

I am tempted to stop here. Oh well...

Your way of allocating a percpu counter for each counter is a pure TLB
and cache line blower (up to two cache lines per counter), not counting
the time needed to load a new table with 10,000 entries. Some people
still use scripts with hundreds of calls to iptables.

percpu_alloc() is not meant to be used thousands of times per second.
It does not scale.

You consume 16 bytes per counter in the main table, while a 4-byte
index should be enough on an SMP build. Most firewalls I know use at
most two or four CPUs.

They care about speed, not really about iptables duplicating the table
on each CPU. By the way, not using NUMA can definitely hurt firewalls
with many rules, unless you make sure the main table is vmalloc()ed
with node distribution, not on a single node. Even with this, latencies
can suffer.

Allocating one contiguous percpu var for all counters is a must.

The problem is: percpu alloc doesn't allow big allocations.

#define PCPU_MIN_UNIT_SIZE      PFN_ALIGN(32 << 10)

So the max allocation is 32 Kbytes, that's only 2048 'xt_counters'
(32768 / 16 bytes each).
-> we cannot really use pcpu-alloc, but a kmalloc_node() or
vmalloc_node() per cpu
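
For illustration, a minimal sketch of that scheme (xt_counters_alloc()
and its layout are made up for this sketch, not an existing API): one
contiguous, node-local block of counters per possible CPU.

/* Allocate nr_counters zeroed xt_counters for one CPU, on that CPU's
 * home node; fall back to vmalloc_node() for large tables.
 */
static void *xt_counters_alloc(unsigned int nr_counters, int cpu)
{
        size_t sz = nr_counters * sizeof(struct xt_counters);
        void *p;

        if (sz <= PAGE_SIZE)
                return kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
        p = vmalloc_node(sz, cpu_to_node(cpu));
        if (p != NULL)
                memset(p, 0, sz);
        return p;
}

A table load would then call this once per possible CPU instead of
doing one alloc_percpu() per rule.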




* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-18 15:43 ` Eric Dumazet
@ 2010-11-18 23:36   ` Changli Gao
  2010-11-19  6:24     ` Eric Dumazet
  2010-11-19 11:15   ` Jan Engelhardt
From: Changli Gao @ 2010-11-18 23:36 UTC
  To: Eric Dumazet; +Cc: Patrick McHardy, David S. Miller, netfilter-devel, netdev

On Thu, Nov 18, 2010 at 11:43 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thursday, 18 November 2010 at 22:39 +0800, Changli Gao wrote:
>> As only the xt_counters are private to each CPU, we don't need to
>> maintain a whole copy of the table for each CPU.
>>
>> In kernel space, we reuse the memory of ipt_entry.counters to store a
>> pointer to a percpu xt_counters. When iptables runs, it only updates
>> the counters on its own CPU.
>>
>> On non-SMP platforms, no change is made.
>>
>> Only the iptables code is converted. Thanks for reviewing.
>>
>
> Changli
>
> I already answered you that (difficult) work was in progress, yet you
> still post a patch that needs our review and time? This is crazy.

Sorry for the interruption, and thanks for the review. I posted this
patch to check whether I was going in the right direction.

>
> I am tempted to stop here. Oh well...
>
> Your way of allocating a percpu counter for each counter is a pure TLB
> and cache line blower (up to two cache lines per counter), not counting
> the time needed to load a new table with 10,000 entries. Some people
> still use scripts with hundreds of calls to iptables.
>
> percpu_alloc() is not meant to be used thousands of times per second.
> It does not scale.
>
> You consume 16 bytes per counter in the main table, while a 4-byte
> index should be enough on an SMP build. Most firewalls I know use at
> most two or four CPUs.

I think we can't change the structure of ipt_entry, as it is exposed
to userspace as an ABI. Although there is no need to keep the same
structure in kernel space, converting it is a big job. :)

>
> They care about speed, not really about iptables duplicating the table
> on each CPU. By the way, not using NUMA can definitely hurt firewalls
> with many rules, unless you make sure the main table is vmalloc()ed
> with node distribution, not on a single node. Even with this, latencies
> can suffer.

This is what I worry about. I think only testing can verify it.

>
> Allocating one contiguous percpu var for all counters is a must.
>
> The problem is: percpu alloc doesn't allow big allocations.
>
> #define PCPU_MIN_UNIT_SIZE      PFN_ALIGN(32 << 10)
>
> So the max allocation is 32 Kbytes, that's only 2048 'xt_counters'
> (32768 / 16 bytes each).
> -> we cannot really use pcpu-alloc, but a kmalloc_node() or
> vmalloc_node() per cpu
>

Thanks. I'll try.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-18 23:36   ` Changli Gao
@ 2010-11-19  6:24     ` Eric Dumazet
From: Eric Dumazet @ 2010-11-19  6:24 UTC
  To: Changli Gao; +Cc: Patrick McHardy, David S. Miller, netfilter-devel, netdev

On Friday, 19 November 2010 at 07:36 +0800, Changli Gao wrote:
> On Thu, Nov 18, 2010 at 11:43 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > You consume 16 bytes per counter in the main table, while a 4-byte
> > index should be enough on an SMP build. Most firewalls I know use at
> > most two or four CPUs.
> 
> I think we can't change the structure of ipt_entry, as it is exposed
> to userspace as an ABI. Although there is no need to keep the same
> structure in kernel space, converting it is a big job. :)
> 

We already do that for COMPAT. It is not a big deal to always use a
converter and make it depend on whether userland is 32- or 64-bit.
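
To sketch the idea (struct kipt_entry, per_cpu_counters and
kipt_count() are hypothetical names, not existing kernel code): the
converter would build a kernel-internal entry that keeps only a 4-byte
index into one contiguous per-cpu counter array.

/* Hypothetical kernel-internal entry, built from the userspace
 * ipt_entry at table load time.
 */
struct kipt_entry {
        struct ipt_ip   ip;
        unsigned int    target_offset;
        unsigned int    next_offset;
        u32             counter_idx;    /* 4 bytes instead of a pointer */
};

/* Packet path: bump this CPU's copy of the rule's counter. */
static void kipt_count(const struct kipt_entry *e,
                       const struct sk_buff *skb,
                       struct xt_counters **per_cpu_counters)
{
        struct xt_counters *ctrs = per_cpu_counters[smp_processor_id()];

        ADD_COUNTER(ctrs[e->counter_idx], skb->len, 1);
}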




* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-18 15:43 ` Eric Dumazet
  2010-11-18 23:36   ` Changli Gao
@ 2010-11-19 11:15   ` Jan Engelhardt
  2010-11-19 11:29     ` Eric Dumazet
From: Jan Engelhardt @ 2010-11-19 11:15 UTC
  To: Eric Dumazet
  Cc: Changli Gao, Patrick McHardy, David S. Miller, netfilter-devel, netdev

On Thursday 2010-11-18 16:43, Eric Dumazet wrote:

>On Thursday, 18 November 2010 at 22:39 +0800, Changli Gao wrote:
>> As only the xt_counters are private to each CPU, we don't need to
>> maintain a whole copy of the table for each CPU.
>> 
>> In kernel space, we reuse the memory of ipt_entry.counters to store a
>> pointer to a percpu xt_counters. When iptables runs, it only updates
>> the counters on its own CPU.
>> 
>> On non-SMP platforms, no change is made.
>> 
>> Only the iptables code is converted. Thanks for reviewing.
>> 
>
>Changli
>
>I already answered you that (difficult) work was in progress

Was it? Quoting Patrick from 24h prior to this post:

|so patches to get rid of the table duplication are highly welcome.

>yet you still post a patch that needs our review and time? This is crazy.

You do not need to do it, but I will happily look at this.
Of course my observations are the same as yours:

>Your way of allocating a percpu counter for each counter is a pure TLB
>and cache line blower (up to two cache lines per counter), not counting
>the time needed to load a new table with 10,000 entries. Some people
>still use scripts with hundreds of calls to iptables.

The two are statistically independent though. Even for a loaded
ruleset, the TLB/DC miss accumulation will be disastrous - as I've
found with linked-list rules/small allocs.

>Allocating one contiguous percpu var for all counters is a must.
>
>The problem is: percpu alloc doesn't allow big allocations.
>
>#define PCPU_MIN_UNIT_SIZE      PFN_ALIGN(32 << 10)
>
>So the max allocation is 32 Kbytes, that's only 2048 'xt_counters'
>(32768 / 16 bytes each).
>-> we cannot really use pcpu-alloc, but a kmalloc_node() or
>vmalloc_node() per cpu

.. as is already done for jumpstack ;-)

* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-19 11:15   ` Jan Engelhardt
@ 2010-11-19 11:29     ` Eric Dumazet
  2010-11-19 12:48       ` Changli Gao
From: Eric Dumazet @ 2010-11-19 11:29 UTC
  To: Jan Engelhardt
  Cc: Changli Gao, Patrick McHardy, David S. Miller, netfilter-devel,
	netdev, Stephen Hemminger

On Friday, 19 November 2010 at 12:15 +0100, Jan Engelhardt wrote:

> Was it? Quoting Patrick from 24h prior to this post:
> 
> |so patches to get rid of the table duplication are highly welcome.
> 
> >yet you still post a patch that needs our review and time? This is crazy.
> 
> You do not need to do it, but I will happily look at this.
> Of course my observations are the same as yours:
> 
> >Your way of allocating a percpu counter for each counter is a pure TLB
> >and cache line blower (up to two cache lines per counter), not counting
> >the time needed to load a new table with 10,000 entries. Some people
> >still use scripts with hundreds of calls to iptables.
> 
> The two are statistically independent though. Even for a loaded
> ruleset, the TLB/DC miss accumulation will be disastrous - as I've
> found with linked-list rules/small allocs.
> 
> >Allocating one contiguous percpu var for all counters is a must.
> >
> >The problem is: percpu alloc doesn't allow big allocations.
> >
> >#define PCPU_MIN_UNIT_SIZE      PFN_ALIGN(32 << 10)
> >
> >So the max allocation is 32 Kbytes, that's only 2048 'xt_counters'
> >(32768 / 16 bytes each).
> >-> we cannot really use pcpu-alloc, but a kmalloc_node() or
> >vmalloc_node() per cpu
> 
> .. as is already done for jumpstack ;-)

IMHO, the real problem is not the table duplication. We know that adding
a level of indirection is going to hurt a lot because of cache misses.

It's the atomic op (spinlock) done for every packet entering every
filter, plus the conditional branch we do because of possible recursion.

A per-cpu variable and a spinlock... it's really expensive.

Stephen tried an RCU conversion some time ago, but it was aborted
because of RCU drawbacks (too much memory possibly waiting to be freed
after a grace period). Maybe the RCU infrastructure is now ready for
another try.

We should do what we did for the u64 stats counters in the network
stack, using the u64_stats_sync.h infrastructure. No more
synchronization between the threads running through the rules and the
one gathering counters. Better latencies in particular.
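
A minimal sketch of that scheme (struct xt_counter_pcpu, xt_count()
and xt_fetch() are illustrative names; only the u64_stats_*()
primitives are the real u64_stats_sync.h API):

struct xt_counter_pcpu {
        u64                     bcnt;   /* byte count */
        u64                     pcnt;   /* packet count */
        struct u64_stats_sync   syncp;
};

/* Writer, packet path, always on its own per-cpu copy. */
static void xt_count(struct xt_counter_pcpu *c, unsigned int len)
{
        u64_stats_update_begin(&c->syncp);
        c->bcnt += len;
        c->pcnt++;
        u64_stats_update_end(&c->syncp);
}

/* Reader, e.g. iptables -L, may run on any CPU. */
static void xt_fetch(const struct xt_counter_pcpu *c, u64 *bytes,
                     u64 *pkts)
{
        unsigned int start;

        do {
                start = u64_stats_fetch_begin(&c->syncp);
                *bytes = c->bcnt;
                *pkts  = c->pcnt;
        } while (u64_stats_fetch_retry(&c->syncp, start));
}

On 64-bit the sync object compiles away entirely; on 32-bit SMP it is
a seqcount, so the reader never blocks the packet path.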




* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-19 11:29     ` Eric Dumazet
@ 2010-11-19 12:48       ` Changli Gao
  2010-11-19 12:57         ` Jan Engelhardt
From: Changli Gao @ 2010-11-19 12:48 UTC
  To: Eric Dumazet
  Cc: Jan Engelhardt, Patrick McHardy, David S. Miller,
	netfilter-devel, netdev, Stephen Hemminger

On Fri, Nov 19, 2010 at 7:29 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> IMHO, the real problem is not the table duplication. We know that adding
> a level of indirection is going to hurt a lot because of cache misses.
>

Nowadays multi-core CPUs are common, and the cores in a CPU share the
lowest-level cache. Duplicated tables use more RAM and may put more
pressure on that lowest-level cache.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-19 12:48       ` Changli Gao
@ 2010-11-19 12:57         ` Jan Engelhardt
  2010-11-19 13:03           ` Changli Gao
From: Jan Engelhardt @ 2010-11-19 12:57 UTC
  To: Changli Gao
  Cc: Eric Dumazet, Patrick McHardy, David S. Miller, netfilter-devel,
	netdev, Stephen Hemminger

On Friday 2010-11-19 13:48, Changli Gao wrote:

>On Fri, Nov 19, 2010 at 7:29 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>> IMHO, the real problem is not the table duplication. We know that adding
>> a level of indirection is going to hurt a lot because of cache misses.
>>
>
>Nowadays multi-core CPUs are common, and the cores in a CPU share the
>lowest-level cache. Duplicated tables use more RAM and may put more
>pressure on that lowest-level cache.

There probably should be at most one copy per NUMA node. Maybe less, 
depending on what the benchmarks will say.

I started on some patch to reduce the ruleset from #cpu to #numa_nodes,
but then stopped when I ran into the obvious fact that it would require
locking the counters, because xtables can actually run on more than one
core within a given NUMA node. Once the counters are decoupled from the
ruleset, reducing the ruleset copies would become easier.
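
For illustration, a rough sketch of per-node copies (the node_entries[]
field and the helper names are hypothetical, not existing code):

/* One table copy per NUMA node instead of one per CPU. */
static int xt_alloc_node_entries(struct xt_table_info *info,
                                 unsigned int size)
{
        int node;

        for_each_online_node(node) {
                info->node_entries[node] = vmalloc_node(size, node);
                if (info->node_entries[node] == NULL)
                        return -ENOMEM;
        }
        return 0;
}

/* Packet path picks the copy local to this CPU's node. */
static void *xt_node_table(const struct xt_table_info *info)
{
        return info->node_entries[numa_node_id()];
}

The counters would then have to live outside these shared copies,
since several cores share one node's copy.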


* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-19 12:57         ` Jan Engelhardt
@ 2010-11-19 13:03           ` Changli Gao
  2010-11-19 13:11             ` Jan Engelhardt
From: Changli Gao @ 2010-11-19 13:03 UTC
  To: Jan Engelhardt
  Cc: Eric Dumazet, Patrick McHardy, David S. Miller, netfilter-devel,
	netdev, Stephen Hemminger

On Fri, Nov 19, 2010 at 8:57 PM, Jan Engelhardt <jengelh@medozas.de> wrote:
> There probably should be at most one copy per NUMA node. Maybe less,
> depending on what the benchmarks will say.
>
> I started on some patch to reduce the ruleset from #cpu to #numa_nodes,

It is a great idea! :)


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)


* Re: [RFC PATCH] netfilter: remove the duplicate tables
  2010-11-19 13:03           ` Changli Gao
@ 2010-11-19 13:11             ` Jan Engelhardt
From: Jan Engelhardt @ 2010-11-19 13:11 UTC
  To: Changli Gao
  Cc: Eric Dumazet, Patrick McHardy, David S. Miller, netfilter-devel,
	netdev, Stephen Hemminger

On Friday 2010-11-19 14:03, Changli Gao wrote:

>On Fri, Nov 19, 2010 at 8:57 PM, Jan Engelhardt <jengelh@medozas.de> wrote:
>> There probably should be at most one copy per NUMA node. Maybe less,
>> depending on what the benchmarks will say.
>>
>> I started on some patch to reduce the ruleset from #cpu to #numa_nodes,
>
>It is a great idea! :)

Feel free to do it. (I don't have the patch anymore. It was just a few 
lines anyway.)
