* iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
@ 2009-04-10  9:15 Jeff Chua
  2009-04-10 16:52 ` Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: Jeff Chua @ 2009-04-10  9:15 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Jan Engelhardt, Patrick McHardy, David S. Miller,
	Roman Mindalev, Linus Torvalds, Linux Kernel



Adding 200 records in iptables took 6.0 sec in 2.6.30-rc1 compared to 
0.2 sec in 2.6.29. I've bisected it down to this commit.

There are a few patches on top of the original patch. When I reverted the 
original commit (with the rcu_read() to rcu_read_bh() changes folded in so 
the revert applies cleanly), the inserts speed back up to 0.2 sec.

I'm loading all the firewall rules during boot-up, and this 6-second delay 
is really not very nice to wait for.

That is a 30x slowdown, far more than a 20% degradation.
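
For reference, a loop of roughly this shape should reproduce the slowdown 
(the chain name and addresses here are illustrative only):

        # illustrative only: create a scratch chain and time 200 single-rule inserts
        iptables -N block 2>/dev/null
        time for ((i = 1; i <= 200; i++))
        do
                iptables -A block -s 10.0.0.$i -j ACCEPT
        done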

     Author: Stephen Hemminger <shemminger@vyatta.com>
     Date:   Fri Feb 20 10:35:32 2009 +0100

     netfilter: iptables: lock free counters

     The reader/writer lock in ip_tables is acquired in the critical path
     of processing packets and is one of the reasons just loading
     iptables can cause a 20% performance loss. The rwlock serves two
     functions:



Attached is the original commit + _bh changes to make it revert cleanly in 
2.6.30-rc1.

I will be happy if someone has a proper patch for me to test further.

Thanks,
Jeff.


commit 784544739a25c30637397ace5489eeb6e15d7d49
Author: Stephen Hemminger <shemminger@vyatta.com>
Date:   Fri Feb 20 10:35:32 2009 +0100

     netfilter: iptables: lock free counters

     The reader/writer lock in ip_tables is acquired in the critical path of
     processing packets and is one of the reasons just loading iptables can cause
     a 20% performance loss. The rwlock serves two functions:

     1) it prevents changes to table state (xt_replace) while the table is
        in use. This is now handled by doing RCU on the xt_table. When a
        table is replaced, the new table(s) are put in place and the old
        table(s) are freed after an RCU grace period.

     2) it provides synchronization when accessing the counter values.
        This is now handled by swapping in new table_info entries for each cpu
        then summing the old values, and putting the result back onto one
        cpu.  On a busy system it may cause sampling to occur at different
        times on each cpu, but no packet/byte counts are lost in the process.

     Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

     Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here)
     BTW, my new "tbench 8" result is 2450 MB/s, (it was 2150 MB/s not so long ago)

     Acked-by: Eric Dumazet <dada1@cosmosbay.com>
     Signed-off-by: Patrick McHardy <kaber@trash.net>


diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 9fac88f..e8e08d0 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -353,7 +353,7 @@ struct xt_table
  	unsigned int valid_hooks;

  	/* Lock for the curtain */
-	rwlock_t lock;
+	struct mutex lock;

  	/* Man behind the curtain... */
  	struct xt_table_info *private;
@@ -385,7 +385,7 @@ struct xt_table_info

  	/* ipt_entry tables: one per CPU */
  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	char *entries[1];
+	void *entries[1];
  };

  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
@@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);

  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
  extern void xt_free_table_info(struct xt_table_info *info);
+extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
+				    struct xt_table_info *new);

  #ifdef CONFIG_COMPAT
  #include <net/compat.h>
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b5db463..64a7c6c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -261,9 +261,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
  	indev = in ? in->name : nulldevname;
  	outdev = out ? out->name : nulldevname;

-	read_lock_bh(&table->lock);
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
  	e = get_entry(table_base, private->hook_entry[hook]);
  	back = get_entry(table_base, private->underflow[hook]);

@@ -335,7 +336,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
  			e = (void *)e + e->next_offset;
  		}
  	} while (!hotdrop);
-	read_unlock_bh(&table->lock);
+
+	rcu_read_unlock_bh();

  	if (hotdrop)
  		return NF_DROP;
@@ -738,11 +740,65 @@ static void get_counters(const struct xt_table_info *t,
  	}
  }

-static inline struct xt_counters *alloc_counters(struct xt_table *table)
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	ARPT_ENTRY_ITERATE(t->entries[cpu],
+			  t->size,
+			  add_counter_to_entry,
+			  counters,
+			  &i);
+	local_bh_enable();
+}
+
+static inline int
+zero_entry_counter(struct arpt_entry *e, void *arg)
+{
+	e->counters.bcnt = 0;
+	e->counters.pcnt = 0;
+	return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+	unsigned int cpu;
+	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	for_each_possible_cpu(cpu) {
+		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+				  zero_entry_counter, NULL);
+	}
+}
+
+static struct xt_counters *alloc_counters(struct xt_table *table)
  {
  	unsigned int countersize;
  	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *info;

  	/* We need atomic snapshot of counters: rest doesn't change
  	 * (other than comefrom, which userspace doesn't care
@@ -752,14 +808,30 @@ static inline struct xt_counters *alloc_counters(struct xt_table *table)
  	counters = vmalloc_node(countersize, numa_node_id());

  	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	info = xt_alloc_table_info(private->size);
+	if (!info)
+		goto free_counters;

-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	clone_counters(info, private);
+
+	mutex_lock(&table->lock);
+	xt_table_entry_swap_rcu(private, info);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(info, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(info);

  	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
  }

  static int copy_entries_to_user(unsigned int total_size,
@@ -1099,20 +1171,6 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
  	return ret;
  }

-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
-				       const struct xt_counters addme[],
-				       unsigned int *i)
-{
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
  static int do_add_counters(struct net *net, void __user *user, unsigned int len,
  			   int compat)
  {
@@ -1172,13 +1230,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
  		goto free;
  	}

-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
  	private = t->private;
  	if (private->number != num_counters) {
  		ret = -EINVAL;
  		goto unlock_up_free;
  	}

+	preempt_disable();
  	i = 0;
  	/* Choose the copy that is on our node */
  	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1187,8 +1246,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
  			   add_counter_to_entry,
  			   paddc,
  			   &i);
+	preempt_enable();
   unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
+
  	xt_table_unlock(t);
  	module_put(t->me);
   free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ef8b6ca..08cde5b 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
  	tgpar.hooknum = hook;

-	read_lock_bh(&table->lock);
  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
  	e = get_entry(table_base, private->hook_entry[hook]);

  	/* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
  		}
  	} while (!hotdrop);

-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();

  #ifdef DEBUG_ALLOW_ALL
  	return NF_ACCEPT;
@@ -924,13 +926,68 @@ get_counters(const struct xt_table_info *t,
  				  counters,
  				  &i);
  	}
+
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IPT_ENTRY_ITERATE(t->entries[cpu],
+			  t->size,
+			  add_counter_to_entry,
+			  counters,
+			  &i);
+	local_bh_enable();
+}
+
+
+static inline int
+zero_entry_counter(struct ipt_entry *e, void *arg)
+{
+	e->counters.bcnt = 0;
+	e->counters.pcnt = 0;
+	return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+	unsigned int cpu;
+	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	for_each_possible_cpu(cpu) {
+		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+				  zero_entry_counter, NULL);
+	}
  }

  static struct xt_counters * alloc_counters(struct xt_table *table)
  {
  	unsigned int countersize;
  	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *info;

  	/* We need atomic snapshot of counters: rest doesn't change
  	   (other than comefrom, which userspace doesn't care
@@ -939,14 +996,30 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
  	counters = vmalloc_node(countersize, numa_node_id());

  	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;

-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	info = xt_alloc_table_info(private->size);
+	if (!info)
+		goto free_counters;
+
+	clone_counters(info, private);
+
+	mutex_lock(&table->lock);
+	xt_table_entry_swap_rcu(private, info);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(info, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(info);

  	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
  }

  static int
@@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
  	return ret;
  }

-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}

  static int
  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
  		goto free;
  	}

-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
  	private = t->private;
  	if (private->number != num_counters) {
  		ret = -EINVAL;
  		goto unlock_up_free;
  	}

+	preempt_disable();
  	i = 0;
  	/* Choose the copy that is on our node */
  	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
  			  add_counter_to_entry,
  			  paddc,
  			  &i);
+	preempt_enable();
   unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
  	xt_table_unlock(t);
  	module_put(t->me);
   free:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d64594b..34af7bb 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -382,10 +382,12 @@ ip6t_do_table(struct sk_buff *skb,
  	mtpar.family  = tgpar.family = NFPROTO_IPV6;
  	tgpar.hooknum = hook;

-	read_lock_bh(&table->lock);
  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
  	e = get_entry(table_base, private->hook_entry[hook]);

  	/* For return from builtin chain */
@@ -483,7 +485,7 @@ ip6t_do_table(struct sk_buff *skb,
  #ifdef CONFIG_NETFILTER_DEBUG
  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
  #endif
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();

  #ifdef DEBUG_ALLOW_ALL
  	return NF_ACCEPT;
@@ -964,11 +966,64 @@ get_counters(const struct xt_table_info *t,
  	}
  }

+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IP6T_ENTRY_ITERATE(t->entries[cpu],
+			   t->size,
+			   add_counter_to_entry,
+			   counters,
+			   &i);
+	local_bh_enable();
+}
+
+static inline int
+zero_entry_counter(struct ip6t_entry *e, void *arg)
+{
+	e->counters.bcnt = 0;
+	e->counters.pcnt = 0;
+	return 0;
+}
+
+static void
+clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
+{
+	unsigned int cpu;
+	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
+
+	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
+	for_each_possible_cpu(cpu) {
+		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
+		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
+				   zero_entry_counter, NULL);
+	}
+}
+
  static struct xt_counters *alloc_counters(struct xt_table *table)
  {
  	unsigned int countersize;
  	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *info;

  	/* We need atomic snapshot of counters: rest doesn't change
  	   (other than comefrom, which userspace doesn't care
@@ -977,16 +1032,30 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
  	counters = vmalloc_node(countersize, numa_node_id());

  	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	info = xt_alloc_table_info(private->size);
+	if (!info)
+		goto free_counters;
+
+	clone_counters(info, private);
+
+	mutex_lock(&table->lock);
+	xt_table_entry_swap_rcu(private, info);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(info, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);

-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
  	xt_free_table_info(info);

  	return counters;

+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
  }

  static int
@@ -1351,28 +1420,6 @@ do_replace(struct net *net, void __user *user, unsigned int len)
  	return ret;
  }

-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
  static int
  do_add_counters(struct net *net, void __user *user, unsigned int len,
  		int compat)
@@ -1433,13 +1480,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
  		goto free;
  	}

-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
  	private = t->private;
  	if (private->number != num_counters) {
  		ret = -EINVAL;
  		goto unlock_up_free;
  	}

+	preempt_disable();
  	i = 0;
  	/* Choose the copy that is on our node */
  	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1448,8 +1496,9 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
  			  add_counter_to_entry,
  			  paddc,
  			  &i);
+	preempt_enable();
   unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
  	xt_table_unlock(t);
  	module_put(t->me);
   free:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index bfbf521..bfcac92 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,6 +625,20 @@ void xt_free_table_info(struct xt_table_info *info)
  }
  EXPORT_SYMBOL(xt_free_table_info);

+void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
+			     struct xt_table_info *newinfo)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *p = oldinfo->entries[cpu];
+		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
+		newinfo->entries[cpu] = p;
+	}
+
+}
+EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
+
  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
  				    const char *name)
@@ -671,21 +685,22 @@ xt_replace_table(struct xt_table *table,
  	struct xt_table_info *oldinfo, *private;

  	/* Do the substitution. */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
  	private = table->private;
  	/* Check inside lock: is the old number correct? */
  	if (num_counters != private->number) {
  		duprintf("num_counters != table->private->number (%u/%u)\n",
  			 num_counters, private->number);
-		write_unlock_bh(&table->lock);
+		mutex_unlock(&table->lock);
  		*error = -EAGAIN;
  		return NULL;
  	}
  	oldinfo = private;
-	table->private = newinfo;
+	rcu_assign_pointer(table->private, newinfo);
  	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);

+	synchronize_net();
  	return oldinfo;
  }
  EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -719,7 +734,8 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,

  	/* Simplifies replace_table code. */
  	table->private = bootstrap;
-	rwlock_init(&table->lock);
+	mutex_init(&table->lock);
+
  	if (!xt_replace_table(table, 0, newinfo, &ret))
  		goto unlock;



* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-10  9:15 iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Jeff Chua
@ 2009-04-10 16:52 ` Stephen Hemminger
  2009-04-11  1:07   ` Jeff Chua
  2009-04-11  1:25   ` David Miller
  0 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-10 16:52 UTC (permalink / raw)
  To: Jeff Chua
  Cc: Eric Dumazet, Jan Engelhardt, Patrick McHardy, David S. Miller,
	Roman Mindalev, Linus Torvalds, Linux Kernel

On Fri, 10 Apr 2009 17:15:52 +0800 (SGT)
Jeff Chua <jeff.chua.linux@gmail.com> wrote:

> 
> 
> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> 0.2sec in 2.6.29. I've bisected down this commit.
> 
> There are a few patches on top of the original patch. When I reverted the 
> original commit + changing rcu_read() to rcu_read_bh(), it speeds up the 
> inserts back to .2sec again.
> 
> I'm loading all the firewall rules during boot-up and this 6 secs slowness 
> is really not very nice to wait for.

The performance benefit during operation is more important. The load
time is fixable. The problem is probably generic to any set of rules,
but could you post some info about your configuration (like the rule
set), and the system configuration (# of cpu's, config etc).


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-10 16:52 ` Stephen Hemminger
@ 2009-04-11  1:07   ` Jeff Chua
  2009-04-11  1:25   ` David Miller
  1 sibling, 0 replies; 254+ messages in thread
From: Jeff Chua @ 2009-04-11  1:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Jan Engelhardt, Patrick McHardy, David S. Miller,
	Roman Mindalev, Linus Torvalds, Linux Kernel

On Sat, Apr 11, 2009 at 12:52 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> The performance benefit during operation is more important. The load
> time is fixable. The problem is probably generic to any set of rules,
> but could you post some info about your configuration (like the rule
> set), and the system configuration (# of cpu's, config etc).

I've about 150 different IPs like ...
        iptables -A block -s 155.161.173.128/26 -j ACCEPT
        iptables -A block -s 155.161.194.128/26 -j ACCEPT

So, to make it easy for testing, you can do a loop like this ...
        for((i = 1; i < 100; i++))
        do
                iptables -A block -s 10.0.0.$i -j ACCEPT
        done

I'm running a ThinkPad X61: dual-core T9300, 2.5 GHz, 4 GB RAM, 256 GB SSD.
There was no load; I wasn't running anything else, and X was not running.

Thanks,
Jeff.


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-10 16:52 ` Stephen Hemminger
  2009-04-11  1:07   ` Jeff Chua
@ 2009-04-11  1:25   ` David Miller
  2009-04-11  1:39     ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Linus Torvalds
  1 sibling, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-11  1:25 UTC (permalink / raw)
  To: shemminger
  Cc: jeff.chua.linux, dada1, jengelh, kaber, r000n, torvalds,
	linux-kernel, netfilter-devel, netdev

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 10 Apr 2009 09:52:46 -0700

[ CC:'ing netfilter-devel and netdev... ]

> On Fri, 10 Apr 2009 17:15:52 +0800 (SGT)
> Jeff Chua <jeff.chua.linux@gmail.com> wrote:
> 
>> 
>> 
>> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
>> 0.2sec in 2.6.29. I've bisected down this commit.
>> 
>> There are a few patches on top of the original patch. When I reverted the 
>> original commit + changing rcu_read() to rcu_read_bh(), it speeds up the 
>> inserts back to .2sec again.
>> 
>> I'm loading all the firewall rules during boot-up and this 6 secs slowness 
>> is really not very nice to wait for.
> 
> The performance benefit during operation is more important. The load
> time is fixable. The problem is probably generic to any set of rules,
> but could you post some info about your configuration (like the rule
> set), and the system configuration (# of cpu's, config etc).


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  1:25   ` David Miller
@ 2009-04-11  1:39     ` Linus Torvalds
  2009-04-11  4:15       ` Paul E. McKenney
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-11  1:39 UTC (permalink / raw)
  To: David Miller, Paul E. McKenney, Ingo Molnar, Lai Jiangshan
  Cc: shemminger, jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev



On Fri, 10 Apr 2009, David Miller wrote:
> 
> [ CC:'ing netfilter-devel and netdev... ]

I wonder if we should bring in the RCU people too, for them to tell you 
that the networking people are being silly, and should not synchronize 
with the very heavy-handed

	synchronize_net()

but instead of doing synchronization (which is probably why adding a few 
hundred rules then takes several seconds - each synchronizes and that 
takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
list for later freeing.
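
In code, that suggestion amounts to something like the sketch below (helper 
names are hypothetical; embedding the rcu_head directly in struct 
xt_table_info, as discussed further down the thread, would avoid the extra 
allocation):

/*
 * Sketch of the deferred-free idea: instead of blocking in
 * synchronize_net() before xt_free_table_info(), queue the old table
 * and free it from an RCU callback once all readers are done.
 */
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/netdevice.h>
#include <linux/netfilter/x_tables.h>

struct old_table_holder {
	struct xt_table_info *info;
	struct rcu_head rcu;
};

static void old_table_rcu_free(struct rcu_head *head)
{
	struct old_table_holder *h =
		container_of(head, struct old_table_holder, rcu);

	xt_free_table_info(h->info);
	kfree(h);
}

static void xt_table_info_defer_free(struct xt_table_info *oldinfo)
{
	struct old_table_holder *h = kmalloc(sizeof(*h), GFP_KERNEL);

	if (!h) {
		/* Out of memory: fall back to the old blocking behaviour. */
		synchronize_net();
		xt_free_table_info(oldinfo);
		return;
	}

	h->info = oldinfo;
	call_rcu(&h->rcu, old_table_rcu_free);	/* returns immediately */
}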

Or whatever. Paul? synchronize_net() just calls synchronize_rcu(), and 
with that knowledge and a simple

	git show 784544739a25c30637397ace5489eeb6e15d7d49

I bet you can already tell people how to fix their performance issue.

		Linus

---
> > On Fri, 10 Apr 2009 17:15:52 +0800 (SGT)
> > Jeff Chua <jeff.chua.linux@gmail.com> wrote:
> >> 
> >> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> >> 0.2sec in 2.6.29. I've bisected down this commit.
> >> 
> >> There are a few patches on top of the original patch. When I reverted the 
> >> original commit + changing rcu_read() to rcu_read_bh(), it speeds up the 
> >> inserts back to .2sec again.
> >> 
> >> I'm loading all the firewall rules during boot-up and this 6 secs slowness 
> >> is really not very nice to wait for.
> > 
> > The performance benefit during operation is more important. The load
> > time is fixable. The problem is probably generic to any set of rules,
> > but could you post some info about your configuration (like the rule
> > set), and the system configuration (# of cpu's, config etc).
> 


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  1:39     ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Linus Torvalds
@ 2009-04-11  4:15       ` Paul E. McKenney
  2009-04-11  5:14         ` Jan Engelhardt
                           ` (3 more replies)
  0 siblings, 4 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-11  4:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Ingo Molnar, Lai Jiangshan, shemminger,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Fri, Apr 10, 2009 at 06:39:18PM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 10 Apr 2009, David Miller wrote:
> > 
> > [ CC:'ing netfilter-devel and netdev... ]
> 
> I wonder if we should bring in the RCU people too, for them to tell you 
> that the networking people are beign silly, and should not synchronize 
> with the very heavy-handed
> 
> 	synchronize_net()
> 
> but instead of doing synchronization (which is probably why adding a few 
> hundred rules then takes several seconds - each synchronizes and that 
> takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
> list for later freeing.
> 
> Or whatever. Paul? synchronize_net() just calls synchronize_rcu(), and 
> with that knowledge and a simple
> 
> 	git show 784544739a25c30637397ace5489eeb6e15d7d49
> 
> I bet you can already tell people how to fix their performance issue.

Well, I am certainly happy to demonstrate my ignorance of the networking
code by throwing out a few suggestions.

So, Dave and Steve, you might want to get out your barf bag before
reading further.  You have been warned!  ;-)

1.	Assuming that the synchronize_net() is intended to guarantee
	that the new rules will be in effect before returning to
	user space:

	a.	Split this functionality, so that there is a new
		user-space primitive that installs a new rule, but
		without waiting.  They provide an additional user-space
		primitive that waits for the rules to take effect.
		Then, when loading a long list of rules, load them
		using the non-waiting primitive, and wait at the end.

	b.	As above, but provide a flag that says whether or not
		to wait.  Same general effect.

	But I am not seeing the direct connection between this patch
	and netfilter, so...

2.	For the xt_replace_table() case, it would be necessary to add an
	rcu_head to the xt_table_info, and replace each caller's direct
	calls to xt_free_table_info() with call_rcu().

	Now this has an issue in that the caller wants to return the
	final counter values.  My assumption is that these values do
	not in fact need to be exact.  If I am wrong about that, then
	my suggestion would lose the counts from late readers.
	I must defer to the networking guys as to whether this is
	acceptable or not.  If not, more head-scratching would be
	required.  (But it looks to me that the rule is being trashed,
	so who cares about the extra counts?)

	In addition, a malicious user might be able to force this to
	happen extremely frequently, running the system out of memory.
	One way to fix this is to invoke synchronize_net() one out of
	20 times or some such.

3.	For the alloc_counters() case, the comments indicate that we
	really truly do want an atomic sampling of the counters.
	The counters are 64-bit entities, which is a bit inconvenient.
	Though people using this functionality are no doubt quite happy
	to never have to worry about overflow, I hasten to add!

	I will nevertheless suggest the following egregious hack to
	get a consistent sample of one counter for some other CPU:

	a.	Disable interrupts
	b.	Atomically exchange the bottom 32 bits of the
		counter with the value zero.
	c.	Atomically exchange the top 32 bits of the counter
		with the value zero.
	d.	Concatenate the values obtained in (b) and (c), which
		is the snapshot value.
	e.	Re-enable interrupts.  Yes, for each counter.  Do it
		for the honor of the -rt patchset.  ;-)

		Disabling interrupts should make it impossible for
		the low-order 32 bits of the counter to overflow before
		we get around to zeroing the upper 32 bits.  Yes, this
		is horribly paranoid, but please keep in mind that even
		my level of paranoia is not always sufficient to keep
		RCU working correctly.  :-/

		Architectures with 64-bit atomics can simply do a 64-bit
		exchange (or cmpxchg(), for that matter).

	Now we still have the possibility that the other CPU is still
	hammering away on the counter that we just zeroed from a
	long-running RCU read-side critical section.

	So, we also need to add an rcu_head somewhere, perhaps reuse
	the one in xt_table_info, create a second one, or squirrel one
	away somewhere else.  As long as there is a way to get to the
	old counter values.  And a flag to indicate that the rcu_head
	is in use.  It is socially irresponsible to pass a given
	rcu_head to call_rcu() before it has been invoked after the
	previous time it was passed to call_rcu().  But you guys all
	knew that already.

	We replace the synchronize_net() with call_rcu(), more or less.
	The call_rcu() probably needs to be under the lock -- or at the
	very least, setting the flag saying that it is in use needs to
	be under the lock.

	The RCU callback function traverses the old counters one last
	time, adding their values to the new set of counters.  No
	atomic exchange tricks are required this time, since all the
	RCU readers that could possibly have held a reference to the
	old set of counters must now be done.  We now clear the flag,
	allowing the next counter snapshot to proceed.
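
	Rendered as code, steps (a)-(e) for a single counter might look
	like the sketch below (the structure and function names are made
	up; the real per-rule counters are the bcnt/pcnt fields of
	struct xt_counters, and the update side would need matching
	atomics):

#include <linux/types.h>
#include <linux/irqflags.h>
#include <asm/system.h>		/* xchg() */

/* Hypothetical split 64-bit counter as seen from a 32-bit machine. */
struct split_counter {
	u32 lo;
	u32 hi;
};

static u64 snapshot_and_zero(struct split_counter *c)
{
	unsigned long flags;
	u32 lo, hi;
	u64 val;

	local_irq_save(flags);			/* step a */
	lo = xchg(&c->lo, 0);			/* step b */
	hi = xchg(&c->hi, 0);			/* step c */
	val = ((u64)hi << 32) | lo;		/* step d */
	local_irq_restore(flags);		/* step e */

	return val;
}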

OK, OK, Dave and Steve, I should have suggested that you get two
barf bags.  Maybe three.  ;-)

Additional caveat: coward that I am, I looked only at the IPv4 code.
There might well be additional complications in the arp and IPv6 code.

However, I do believe that something like this might actually work.

Thoughts?

						Thanx, Paul

> 		Linus
> 
> ---
> > > On Fri, 10 Apr 2009 17:15:52 +0800 (SGT)
> > > Jeff Chua <jeff.chua.linux@gmail.com> wrote:
> > >> 
> > >> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> > >> 0.2sec in 2.6.29. I've bisected down this commit.
> > >> 
> > >> There are a few patches on top of the original patch. When I reverted the 
> > >> original commit + changing rcu_read() to rcu_read_bh(), it speeds up the 
> > >> inserts back to .2sec again.
> > >> 
> > >> I'm loading all the firewall rules during boot-up and this 6 secs slowness 
> > >> is really not very nice to wait for.
> > > 
> > > The performance benefit during operation is more important. The load
> > > time is fixable. The problem is probably generic to any set of rules,
> > > but could you post some info about your configuration (like the rule
> > > set), and the system configuration (# of cpu's, config etc).


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  4:15       ` Paul E. McKenney
@ 2009-04-11  5:14         ` Jan Engelhardt
  2009-04-11  5:42           ` Paul E. McKenney
                             ` (3 more replies)
  2009-04-11  7:08         ` Ingo Molnar
                           ` (2 subsequent siblings)
  3 siblings, 4 replies; 254+ messages in thread
From: Jan Engelhardt @ 2009-04-11  5:14 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	shemminger, jeff.chua.linux, dada1, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev


On Saturday 2009-04-11 06:15, Paul E. McKenney wrote:
>On Fri, Apr 10, 2009 at 06:39:18PM -0700, Linus Torvalds wrote:
>>An unhappy user reported:
>>>>> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
>>>>> 0.2sec in 2.6.29. I've bisected down this commit.
>>>>> 784544739a25c30637397ace5489eeb6e15d7d49
>> 
>> I wonder if we should bring in the RCU people too, for them to tell you 
>> that the networking people are beign silly, and should not synchronize 
>> with the very heavy-handed
>> 
>> 	synchronize_net()
>> 
>> but instead of doing synchronization (which is probably why adding a few 
>> hundred rules then takes several seconds - each synchronizes and that 
>> takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
>> list for later freeing.

iptables works in whole tables. Userspace submits a table, checkentry is 
called for all rules in the new table, things are swapped, then destroy 
is called for all rules in the old table. By that logic (which existed
since dawn I think), only the swap operation needs to be locked.

Jeff Chua wrote:
>So, to make it easy for testing, you can do a loop like this ...
>        for((i = 1; i < 100; i++))
>        do
>                iptables -A block -s 10.0.0.$i -j ACCEPT
>        done

The fact that `iptables -A` is called a hundred times means you are 
doing 100 table replacements -- instead of one. And calling
synchronize_net at least 100 times.

"Wanna use iptables-restore?"

>1.	Assuming that the synchronize_net() is intended to guarantee
>	that the new rules will be in effect before returning to
>	user space:

As I read the new code, it seems that synchronize_net is only
used when copying the rules from the kernel to userspace,
not when updating them from userspace:

IPT_SO_GET_ENTRIES -> get_entries -> copy_entries_to_user -> 
alloc_counters -> synchronize_net.

>3.	For the alloc_counters() case, the comments indicate that we
>	really truly do want an atomic sampling of the counters.
>	The counters are 64-bit entities, which is a bit inconvenient.
>	Though people using this functionality are no doubt quite happy
>	to never have to worry about overflow, I hasten to add!
>
>	I will nevertheless suggest the following egregious hack to
>	get a consistent sample of one counter for some other CPU:
>       [...]

Would a seqlock suffice, as it does for the 64-bit jiffies?


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  5:14         ` Jan Engelhardt
@ 2009-04-11  5:42           ` Paul E. McKenney
  2009-04-11  6:00           ` David Miller
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-11  5:42 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	shemminger, jeff.chua.linux, dada1, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, Apr 11, 2009 at 07:14:50AM +0200, Jan Engelhardt wrote:
> 
> On Saturday 2009-04-11 06:15, Paul E. McKenney wrote:
> >On Fri, Apr 10, 2009 at 06:39:18PM -0700, Linus Torvalds wrote:
> >>An unhappy user reported:
> >>>>> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> >>>>> 0.2sec in 2.6.29. I've bisected down this commit.
> >>>>> 784544739a25c30637397ace5489eeb6e15d7d49
> >> 
> >> I wonder if we should bring in the RCU people too, for them to tell you 
> >> that the networking people are beign silly, and should not synchronize 
> >> with the very heavy-handed
> >> 
> >> 	synchronize_net()
> >> 
> >> but instead of doing synchronization (which is probably why adding a few 
> >> hundred rules then takes several seconds - each synchronizes and that 
> >> takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
> >> list for later freeing.
> 
> iptables works in whole tables. Userspace submits a table, checkentry is 
> called for all rules in the new table, things are swapped, then destroy 
> is called for all rules in the old table. By that logic (which existed
> since dawn I think), only the swap operation needs to be locked.
> 
> Jeff Chua wrote:
> >So, to make it easy for testing, you can do a loop like this ...
> >        for((i = 1; i < 100; i++))
> >        do
> >                iptables -A block -s 10.0.0.$i -j ACCEPT
> >        done
> 
> The fact that `iptables -A` is called a hundred times means you are 
> doing 100 table replacements -- instead of one. And calling
> synchronize_net at least a 100 times.
> 
> "Wanna use iptables-restore?"
> 
> >1.	Assuming that the synchronize_net() is intended to guarantee
> >	that the new rules will be in effect before returning to
> >	user space:
> 
> As I read the new code, it seems that synchronize_net is only
> used on copying the rules from kernel into userspace;
> not when updating them from userspace:
> 
> IPT_SO_GET_ENTRIES -> get_entries -> copy_entries_to_user -> 
> alloc_counters -> synchronize_net.

OK.

> >3.	For the alloc_counters() case, the comments indicate that we
> >	really truly do want an atomic sampling of the counters.
> >	The counters are 64-bit entities, which is a bit inconvenient.
> >	Though people using this functionality are no doubt quite happy
> >	to never have to worry about overflow, I hasten to add!
> >
> >	I will nevertheless suggest the following egregious hack to
> >	get a consistent sample of one counter for some other CPU:
> >       [...]
> 
> Would a seqlock suffice, as it does for the 64-bit jiffies?

The 64-bit jiffies counter is not updated often, so write-acquiring a
seqlock on each update is OK.  From what I understand, these counters
are updated quite often (once per packet transmission or reception?),
so write-acquiring on each update would be quite painful.
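
For reference, the pattern being weighed looks roughly like this (a sketch
of the stock seqlock API, not a proposal for the per-packet path):

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(cnt_lock);
static u64 cnt;

/* Writer: this is what would run for every packet -- the painful part. */
static void cnt_add(u64 delta)
{
	write_seqlock(&cnt_lock);
	cnt += delta;
	write_sequnlock(&cnt_lock);
}

/* Reader: rare; retries until it sees a consistent 64-bit value. */
static u64 cnt_read(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqbegin(&cnt_lock);
		val = cnt;
	} while (read_seqretry(&cnt_lock, seq));

	return val;
}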

Or did you have something else in mind here?

							Thanx, Paul


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  5:14         ` Jan Engelhardt
  2009-04-11  5:42           ` Paul E. McKenney
@ 2009-04-11  6:00           ` David Miller
  2009-04-11 18:12               ` Kyle Moffett
  2009-04-12 16:38             ` Jan Engelhardt
  2009-04-11 15:07           ` Stephen Hemminger
  2009-04-11 17:51           ` Linus Torvalds
  3 siblings, 2 replies; 254+ messages in thread
From: David Miller @ 2009-04-11  6:00 UTC (permalink / raw)
  To: jengelh
  Cc: paulmck, torvalds, mingo, laijs, shemminger, jeff.chua.linux,
	dada1, kaber, r000n, linux-kernel, netfilter-devel, netdev

From: Jan Engelhardt <jengelh@medozas.de>
Date: Sat, 11 Apr 2009 07:14:50 +0200 (CEST)

> The fact that `iptables -A` is called a hundred times means you are 
> doing 100 table replacements -- instead of one. And calling
> synchronize_net at least a 100 times.
> 
> "Wanna use iptables-restore?"

I want to derail this line of thinking as fast as possible.

This is not an acceptable response to this problem.  We made something
fundamentally slower by several orders of magnitude.

Therefore, saying "Don't insert your firewall rules like that." is not
a valid response for this regression.

We really have to fix it or revert.


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  4:15       ` Paul E. McKenney
  2009-04-11  5:14         ` Jan Engelhardt
@ 2009-04-11  7:08         ` Ingo Molnar
  2009-04-11 15:05           ` Stephen Hemminger
  2009-04-11 17:48           ` Paul E. McKenney
  2009-04-11 15:50         ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Stephen Hemminger
  2009-04-11 18:57         ` Linus Torvalds
  3 siblings, 2 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-11  7:08 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, David Miller, Lai Jiangshan, shemminger,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> 	I will nevertheless suggest the following egregious hack to
> 	get a consistent sample of one counter for some other CPU:
> 
> 	a.	Disable interrupts
> 	b.	Atomically exchange the bottom 32 bits of the
> 		counter with the value zero.
> 	c.	Atomically exchange the top 32 bits of the counter
> 		with the value zero.
> 	d.	Concatenate the values obtained in (b) and (c), which
> 		is the snapshot value.

Note, i have recently implemented full atomic64_t support on 32-bit 
x86, for the perfcounters code, based on the CMPXCHG8B instruction.

Which, while not the lightest of instructions, is still much better 
than the sequence above.

So i think a better approach would be to also add a dumb generic 
implementation for atomic64_t (using a global lock or so), and then 
generic code could just assume that atomic64_t always exists.

It is far nicer - and faster as well - than the hack above, even on 
32-bit x86.
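
A minimal sketch of that "dumb generic implementation", with a single 
global lock (names and signatures here are simplified and hypothetical):

#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical lock-based 64-bit atomic for 32-bit architectures. */
typedef struct {
	long long counter;
} generic_atomic64_t;

static DEFINE_SPINLOCK(generic_atomic64_lock);

static inline long long generic_atomic64_read(generic_atomic64_t *v)
{
	unsigned long flags;
	long long val;

	spin_lock_irqsave(&generic_atomic64_lock, flags);
	val = v->counter;
	spin_unlock_irqrestore(&generic_atomic64_lock, flags);

	return val;
}

static inline void generic_atomic64_add(long long delta, generic_atomic64_t *v)
{
	unsigned long flags;

	spin_lock_irqsave(&generic_atomic64_lock, flags);
	v->counter += delta;
	spin_unlock_irqrestore(&generic_atomic64_lock, flags);
}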

	Ingo


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  7:08         ` Ingo Molnar
@ 2009-04-11 15:05           ` Stephen Hemminger
  2009-04-11 17:48           ` Paul E. McKenney
  1 sibling, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-11 15:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul E. McKenney, Linus Torvalds, David Miller, Lai Jiangshan,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, 11 Apr 2009 09:08:54 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > 	I will nevertheless suggest the following egregious hack to
> > 	get a consistent sample of one counter for some other CPU:
> > 
> > 	a.	Disable interrupts
> > 	b.	Atomically exchange the bottom 32 bits of the
> > 		counter with the value zero.
> > 	c.	Atomically exchange the top 32 bits of the counter
> > 		with the value zero.
> > 	d.	Concatenate the values obtained in (b) and (c), which
> > 		is the snapshot value.
> 
> Note, i have recently implemented full atomic64_t support on 32-bit 
> x86, for the perfcounters code, based on the CMPXCHG8B instruction.
> 
> Which, while not the lightest of instructions, is still much better 
> than the sequence above.
> 
> So i think a better approach would be to also add a dumb generic 
> implementation for atomic64_t (using a global lock or so), and then 
> generic code could just assume that atomic64_t always exists.
> 
> It is far nicer - and faster as well - as the hack above, even on 
> 32-bit x86.
> 
> 	Ingo

The iptables counters are write-mostly and read-rarely, so they don't
fit the seq counter or atomic use case. Also, it is important
to get a consistent snapshot of the whole set, not just each
individual counter.


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  5:14         ` Jan Engelhardt
  2009-04-11  5:42           ` Paul E. McKenney
  2009-04-11  6:00           ` David Miller
@ 2009-04-11 15:07           ` Stephen Hemminger
  2009-04-11 16:05               ` Jeff Chua
  2009-04-11 17:51           ` Linus Torvalds
  3 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-11 15:07 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Paul E. McKenney, Linus Torvalds, David Miller, Ingo Molnar,
	Lai Jiangshan, jeff.chua.linux, dada1, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, 11 Apr 2009 07:14:50 +0200 (CEST)
Jan Engelhardt <jengelh@medozas.de> wrote:

> 
> On Saturday 2009-04-11 06:15, Paul E. McKenney wrote:
> >On Fri, Apr 10, 2009 at 06:39:18PM -0700, Linus Torvalds wrote:
> >>An unhappy user reported:
> >>>>> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> >>>>> 0.2sec in 2.6.29. I've bisected down this commit.
> >>>>> 784544739a25c30637397ace5489eeb6e15d7d49
> >> 
> >> I wonder if we should bring in the RCU people too, for them to tell you 
> >> that the networking people are beign silly, and should not synchronize 
> >> with the very heavy-handed
> >> 
> >> 	synchronize_net()
> >> 
> >> but instead of doing synchronization (which is probably why adding a few 
> >> hundred rules then takes several seconds - each synchronizes and that 
> >> takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
> >> list for later freeing.
> 
> iptables works in whole tables. Userspace submits a table, checkentry is 
> called for all rules in the new table, things are swapped, then destroy 
> is called for all rules in the old table. By that logic (which existed
> since dawn I think), only the swap operation needs to be locked.
> 

Part of the overhead is the API choice to take counter values from user
space during the replace.  If the rule replacement just always started with
zero counters it could be done with less overhead.


* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  4:15       ` Paul E. McKenney
  2009-04-11  5:14         ` Jan Engelhardt
  2009-04-11  7:08         ` Ingo Molnar
@ 2009-04-11 15:50         ` Stephen Hemminger
  2009-04-11 17:43           ` Paul E. McKenney
  2009-04-11 18:57         ` Linus Torvalds
  3 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-11 15:50 UTC (permalink / raw)
  To: paulmck
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Fri, 10 Apr 2009 21:15:33 -0700
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> On Fri, Apr 10, 2009 at 06:39:18PM -0700, Linus Torvalds wrote:
> > 
> > 
> > On Fri, 10 Apr 2009, David Miller wrote:
> > > 
> > > [ CC:'ing netfilter-devel and netdev... ]
> > 
> > I wonder if we should bring in the RCU people too, for them to tell you 
> > that the networking people are beign silly, and should not synchronize 
> > with the very heavy-handed
> > 
> > 	synchronize_net()
> > 
> > but instead of doing synchronization (which is probably why adding a few 
> > hundred rules then takes several seconds - each synchronizes and that 
> > takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
> > list for later freeing.
> > 
> > Or whatever. Paul? synchronize_net() just calls synchronize_rcu(), and 
> > with that knowledge and a simple
> > 
> > 	git show 784544739a25c30637397ace5489eeb6e15d7d49
> > 
> > I bet you can already tell people how to fix their performance issue.
> 
> Well, I am certainly happy to demonstrate my ignorance of the networking
> code by throwing out a few suggestions.
> 
> So, Dave and Steve, you might want to get out your barf bag before
> reading further.  You have been warned!  ;-)
> 
> 1.	Assuming that the synchronize_net() is intended to guarantee
> 	that the new rules will be in effect before returning to
> 	user space:

In this case it is to make sure that the old counter table is no
longer being used by other CPUs that are still receiving packets.

> 	a.	Split this functionality, so that there is a new
> 		user-space primitive that installs a new rule, but
> 		without waiting.  They provide an additional user-space
> 		primitive that waits for the rules to take effect.
> 		Then, when loading a long list of rules, load them
> 		using the non-waiting primitive, and wait at the end.
> 
> 	b.	As above, but provide a flag that says whether or not
> 		to wait.  Same general effect.
> 
> 	But I am not seeing the direct connection between this patch
> 	and netfilter, so...
 
> 2.	For the xt_replace_table() case, it would be necessary to add an
> 	rcu_head to the xt_table_info, and replace each caller's direct
> 	calls to xt_free_table_info() with call_rcu().
> 
> 	Now this has an issue in that the caller wants to return the
> 	final counter values.  My assumption is that these values do
> 	not in fact need to be exact.  If I am wrong about that, then
> 	my suggestion would lose the counts from late readers.
> 	I must defer to the networking guys as to whether this is
> 	acceptable or not.  If not, more head-scratching would be
> 	required.  (But it looks to me that the rule is being trashed,
> 	so who cares about the extra counts?)

The problem is that users want to account for every byte. 

> 	In addition, a malicious user might be able to force this to
> 	happen extremely frequently, running the system out of memory.
> 	One way to fix this is to invoke synchronize_net() one out of
> 	20 times or some such.

Malicious user == root, therefore don't care.

> 3.	For the alloc_counters() case, the comments indicate that we
> 	really truly do want an atomic sampling of the counters.
> 	The counters are 64-bit entities, which is a bit inconvenient.
> 	Though people using this functionality are no doubt quite happy
> 	to never have to worry about overflow, I hasten to add!

And we need a snapshot of all counters (which are not even an array, but
a skip list).

> 	I will nevertheless suggest the following egregious hack to
> 	get a consistent sample of one counter for some other CPU:
> 
> 	a.	Disable interrupts
> 	b.	Atomically exchange the bottom 32 bits of the
> 		counter with the value zero.
> 	c.	Atomically exchange the top 32 bits of the counter
> 		with the value zero.
> 	d.	Concatenate the values obtained in (b) and (c), which
> 		is the snapshot value.
> 	e.	Re-enable interrupts.  Yes, for each counter.  Do it
> 		for the honor of the -rt patchset.  ;-)
> 
> 		Disabling interrupts should make it impossible for
> 		the low-order 32 bits of the counter to overflow before
> 		we get around to zeroing the upper 32 bits.  Yes, this
> 		is horribly paranoid, but please keep in mind that even
> 		my level of paranoia is not always sufficient to keep
> 		RCU working correctly.  :-/
> 
> 		Architectures with 64-bit atomics can simply do a 64-bit
> 		exchange (or cmpxchg(), for that matter).
> 
> 	Now we still have the possibility that the other CPU is still
> 	hammering away on the counter that we just zeroed from a
> 	long-running RCU read-side critical section.
> 
> 	So, we also need to add an rcu_head somewhere, perhaps reuse
> 	the one in xt_table_info, create a second one, or squirrel one
> 	away somewhere else.  As long as there is a way to get to the
> 	old counter values.  And a flag to indicate that the rcu_head
> 	is in use.  It is socially irresponsible to pass a given
> 	rcu_head to call_rcu() before it has been invoked after the
> 	previous time it was passed to call_rcu().  But you guys all
> 	knew that already.
> 
> 	We replace the synchronize_net() with call_rcu(), more or less.
> 	The call_rcu() probably needs to be under the lock -- or at the
> 	very least, setting the flag saying that it is in use needs to
> 	be under the lock.
> 
> 	The RCU callback function traverses the old counters one last
> 	time, adding their values to the new set of counters.  No
> 	atomic exchange tricks are required this time, since all the
> 	RCU readers that could possibly have held a reference to the
> 	old set of counters must now be done.  We now clear the flag,
> 	allowing the next counter snapshot to proceed.
> 
> OK, OK, Dave and Steve, I should have suggested that you get two
> barf bags.  Maybe three.  ;-)
> 
> Additional caveat: coward that I am, I looked only at the IPv4 code.
> There might well be additional complications in the arp and IPv6 code.
> 
> However, I do believe that something like this might actually work.
> 
> Thoughts?
> 
> 						Thanx, Paul
> 
> > 		Linus
> > 
> > ---
> > > > On Fri, 10 Apr 2009 17:15:52 +0800 (SGT)
> > > > Jeff Chua <jeff.chua.linux@gmail.com> wrote:
> > > >> 
> > > >> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> > > >> 0.2sec in 2.6.29. I've bisected down this commit.
> > > >> 
> > > >> There are a few patches on top of the original patch. When I reverted the 
> > > >> original commit + changing rcu_read() to rcu_read_bh(), it speeds up the 
> > > >> inserts back to .2sec again.
> > > >> 
> > > >> I'm loading all the firewall rules during boot-up and this 6 secs slowness 
> > > >> is really not very nice to wait for.
> > > > 
> > > > The performance benefit during operation is more important. The load
> > > > time is fixable. The problem is probably generic to any set of rules,
> > > > but could you post some info about your configuration (like the rule
> > > > set), and the system configuration (# of cpu's, config etc).

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 15:07           ` Stephen Hemminger
@ 2009-04-11 16:05               ` Jeff Chua
  0 siblings, 0 replies; 254+ messages in thread
From: Jeff Chua @ 2009-04-11 16:05 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jan Engelhardt, Paul E. McKenney, Linus Torvalds, David Miller,
	Ingo Molnar, Lai Jiangshan, dada1, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, Apr 11, 2009 at 11:07 PM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
>> iptables works in whole tables. Userspace submits a table, checkentry is
>> called for all rules in the new table, things are swapped, then destroy
>> is called for all rules in the old table. By that logic (which existed
>> since dawn I think), only the swap operation needs to be locked.
> Part of the overhead is the API choice to take counter values from user
> space during the replace.  If the rule replacement just always started with
> zero counters it could be done with less overhead.

It's always good practice to start from zero with these ...

# iptables -F
# iptables -t nat -F
# iptables -X

And most of the time, rules should be put into a file so that they can
be rerun easily after a reboot. So if it can be sped up for just this
case, it'll help many out there.

Thanks,
Jeff.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 15:50         ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Stephen Hemminger
@ 2009-04-11 17:43           ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-11 17:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, Apr 11, 2009 at 08:50:09AM -0700, Stephen Hemminger wrote:
> On Fri, 10 Apr 2009 21:15:33 -0700
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Fri, Apr 10, 2009 at 06:39:18PM -0700, Linus Torvalds wrote:
> > > 
> > > 
> > > On Fri, 10 Apr 2009, David Miller wrote:
> > > > 
> > > > [ CC:'ing netfilter-devel and netdev... ]
> > > 
> > > I wonder if we should bring in the RCU people too, for them to tell you 
> > > that the networking people are being silly, and should not synchronize 
> > > with the very heavy-handed
> > > 
> > > 	synchronize_net()
> > > 
> > > but instead of doing synchronization (which is probably why adding a few 
> > > hundred rules then takes several seconds - each synchronizes and that 
> > > takes a timer tick or so), add the rules to be free'd on some rcu-freeing 
> > > list for later freeing.
> > > 
> > > Or whatever. Paul? synchronize_net() just calls synchronize_rcu(), and 
> > > with that knowledge and a simple
> > > 
> > > 	git show 784544739a25c30637397ace5489eeb6e15d7d49
> > > 
> > > I bet you can already tell people how to fix their performance issue.
> > 
> > Well, I am certainly happy to demonstrate my ignorance of the networking
> > code by throwing out a few suggestions.
> > 
> > So, Dave and Steve, you might want to get out your barf bag before
> > reading further.  You have been warned!  ;-)
> > 
> > 1.	Assuming that the synchronize_net() is intended to guarantee
> > 	that the new rules will be in effect before returning to
> > 	user space:
> 
> In this case it is to make sure that the old counter table is no
> longer being used by other CPUs that are receiving.
> 
> > 	a.	Split this functionality, so that there is a new
> > 		user-space primitive that installs a new rule, but
> > 		without waiting.  They provide an additional user-space
> > 		primitive that waits for the rules to take effect.
> > 		Then, when loading a long list of rules, load them
> > 		using the non-waiting primitive, and wait at the end.
> > 
> > 	b.	As above, but provide a flag that says whether or not
> > 		to wait.  Same general effect.
> > 
> > 	But I am not seeing the direct connection between this patch
> > 	and netfilter, so...
> 
> > 2.	For the xt_replace_table() case, it would be necessary to add an
> > 	rcu_head to the xt_table_info, and replace each caller's direct
> > 	calls to xt_free_table_info() with call_rcu().
> > 
> > 	Now this has an issue in that the caller wants to return the
> > 	final counter values.  My assumption is that these values do
> > 	not in fact need to be exact.  If I am wrong about that, then
> > 	my suggestion would lose the counts from late readers.
> > 	I must defer to the networking guys as to whether this is
> > 	acceptable or not.  If not, more head-scratching would be
> > 	required.  (But it looks to me that the rule is being trashed,
> > 	so who cares about the extra counts?)
> 
> The problem is that users want to account for every byte. 

Ah!!!

Is this particular code path one of the ones responsible for the
slowdown?

> > 	In addition, a malicious user might be able to force this to
> > 	happen extremely frequently, running the system out of memory.
> > 	One way to fix this is to invoke synchronize_net() one out of
> > 	20 times or some such.
> 
> Malicious user == root, therefore don't care.

Sometimes things work out OK.  ;-)

> > 3.	For the alloc_counters() case, the comments indicate that we
> > 	really truly do want an atomic sampling of the counters.
> > 	The counters are 64-bit entities, which is a bit inconvenient.
> > 	Though people using this functionality are no doubt quite happy
> > 	to never have to worry about overflow, I hasten to add!
> 
> And we need a snapshot of all counters (which are not even an array but
> a skip list).

OK.  However, the code seems to swap in a new set of counters intended
to account for subsequent packets.  So I was assuming that "snapshot"
meant that a given packet had to be accounted for precisely, but that
it was OK to do so either in the old set or the new set, as long as it
appeared in exactly one of the two sets.

If this assumption is accurate, then something like the following should
work.

If my assumption is wrong, what exactly does this snapshot need to do?

							Thanx, Paul

> > 	I will nevertheless suggest the following egregious hack to
> > 	get a consistent sample of one counter for some other CPU:
> > 
> > 	a.	Disable interrupts
> > 	b.	Atomically exchange the bottom 32 bits of the
> > 		counter with the value zero.
> > 	c.	Atomically exchange the top 32 bits of the counter
> > 		with the value zero.
> > 	d.	Concatenate the values obtained in (b) and (c), which
> > 		is the snapshot value.
> > 	e.	Re-enable interrupts.  Yes, for each counter.  Do it
> > 		for the honor of the -rt patchset.  ;-)
> > 
> > 		Disabling interrupts should make it impossible for
> > 		the low-order 32 bits of the counter to overflow before
> > 		we get around to zeroing the upper 32 bits.  Yes, this
> > 		is horribly paranoid, but please keep in mind that even
> > 		my level of paranoia is not always sufficient to keep
> > 		RCU working correctly.  :-/
> > 
> > 		Architectures with 64-bit atomics can simply do a 64-bit
> > 		exchange (or cmpxchg(), for that matter).
> > 
> > 	Now we still have the possibility that the other CPU is still
> > 	hammering away on the counter that we just zeroed from a
> > 	long-running RCU read-side critical section.
> > 
> > 	So, we also need to add an rcu_head somewhere, perhaps reuse
> > 	the one in xt_table_info, create a second one, or squirrel one
> > 	away somewhere else.  As long as there is a way to get to the
> > 	old counter values.  And a flag to indicate that the rcu_head
> > 	is in use.  It is socially irresponsible to pass a given
> > 	rcu_head to call_rcu() before it has been invoked after the
> > 	previous time it was passed to call_rcu().  But you guys all
> > 	knew that already.
> > 
> > 	We replace the synchronize_net() with call_rcu(), more or less.
> > 	The call_rcu() probably needs to be under the lock -- or at the
> > 	very least, setting the flag saying that it is in use needs to
> > 	be under the lock.
> > 
> > 	The RCU callback function traverses the old counters one last
> > 	time, adding their values to the new set of counters.  No
> > 	atomic exchange tricks are required this time, since all the
> > 	RCU readers that could possibly have held a reference to the
> > 	old set of counters must now be done.  We now clear the flag,
> > 	allowing the next counter snapshot to proceed.
> > 
> > OK, OK, Dave and Steve, I should have suggested that you get two
> > barf bags.  Maybe three.  ;-)
> > 
> > Additional caveat: coward that I am, I looked only at the IPv4 code.
> > There might well be additional complications in the arp and IPv6 code.
> > 
> > However, I do believe that something like this might actually work.
> > 
> > Thoughts?
> > 
> > 						Thanx, Paul
> > 
> > > 		Linus
> > > 
> > > ---
> > > > > On Fri, 10 Apr 2009 17:15:52 +0800 (SGT)
> > > > > Jeff Chua <jeff.chua.linux@gmail.com> wrote:
> > > > >> 
> > > > >> Adding 200 records in iptables took 6.0sec in 2.6.30-rc1 compared to 
> > > > >> 0.2sec in 2.6.29. I've bisected down this commit.
> > > > >> 
> > > > >> There are a few patches on top of the original patch. When I reverted the 
> > > > >> original commit + changing rcu_read() to rcu_read_bh(), it speeds up the 
> > > > >> inserts back to .2sec again.
> > > > >> 
> > > > >> I'm loading all the firewall rules during boot-up and this 6 secs slowness 
> > > > >> is really not very nice to wait for.
> > > > > 
> > > > > The performance benefit during operation is more important. The load
> > > > > time is fixable. The problem is probably generic to any set of rules,
> > > > > but could you post some info about your configuration (like the rule
> > > > > set), and the system configuration (# of cpu's, config etc).

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  7:08         ` Ingo Molnar
  2009-04-11 15:05           ` Stephen Hemminger
@ 2009-04-11 17:48           ` Paul E. McKenney
  2009-04-12 10:54             ` Ingo Molnar
  2009-04-12 11:34             ` Paul Mackerras
  1 sibling, 2 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-11 17:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, David Miller, Lai Jiangshan, shemminger,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev, paulus, benh

On Sat, Apr 11, 2009 at 09:08:54AM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > 	I will nevertheless suggest the following egregious hack to
> > 	get a consistent sample of one counter for some other CPU:
> > 
> > 	a.	Disable interrupts
> > 	b.	Atomically exchange the bottom 32 bits of the
> > 		counter with the value zero.
> > 	c.	Atomically exchange the top 32 bits of the counter
> > 		with the value zero.
> > 	d.	Concatenate the values obtained in (b) and (c), which
> > 		is the snapshot value.
> 
> Note, i have recently implemented full atomic64_t support on 32-bit 
> x86, for the perfcounters code, based on the CMPXCHG8B instruction.
> 
> Which, while not the lightest of instructions, is still much better 
> than the sequence above.
> 
> So i think a better approach would be to also add a dumb generic 
> implementation for atomic64_t (using a global lock or so), and then 
> generic code could just assume that atomic64_t always exists.
> 
> It is far nicer - and faster as well - than the hack above, even on 
> 32-bit x86.

If the generic implementation is needed only on !SMP systems, that
could work.  The architectures I would be worried about include
powerpc and ia64, which I believe support 32-bit SMP builds.
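
For illustration, the "dumb generic" lock-based fallback being discussed
could look roughly like this; the gen_atomic64_* names are made up, this is
not the kernel's actual implementation, and on 32-bit SMP the single global
lock is exactly the cost being debated:

#include <linux/spinlock.h>
#include <linux/types.h>

/* One global lock protecting every gen_atomic64_t (illustrative only). */
typedef struct {
	u64 value;
} gen_atomic64_t;

static DEFINE_SPINLOCK(gen_atomic64_lock);

static inline u64 gen_atomic64_read(gen_atomic64_t *v)
{
	unsigned long flags;
	u64 val;

	spin_lock_irqsave(&gen_atomic64_lock, flags);
	val = v->value;
	spin_unlock_irqrestore(&gen_atomic64_lock, flags);
	return val;
}

static inline void gen_atomic64_add(u64 delta, gen_atomic64_t *v)
{
	unsigned long flags;

	spin_lock_irqsave(&gen_atomic64_lock, flags);
	v->value += delta;
	spin_unlock_irqrestore(&gen_atomic64_lock, flags);
}

static inline u64 gen_atomic64_xchg(gen_atomic64_t *v, u64 new)
{
	unsigned long flags;
	u64 old;

	spin_lock_irqsave(&gen_atomic64_lock, flags);
	old = v->value;
	v->value = new;
	spin_unlock_irqrestore(&gen_atomic64_lock, flags);
	return old;
}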

						Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  5:14         ` Jan Engelhardt
                             ` (2 preceding siblings ...)
  2009-04-11 15:07           ` Stephen Hemminger
@ 2009-04-11 17:51           ` Linus Torvalds
  3 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-11 17:51 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Paul E. McKenney, David Miller, Ingo Molnar, Lai Jiangshan,
	shemminger, jeff.chua.linux, dada1, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev



On Sat, 11 Apr 2009, Jan Engelhardt wrote:
> 
> iptables works in whole tables.

Not really.

Yes, iptables as a single command works in whole tables.

USERS, on the other hand, often work in multiple iptables commands, ie 
they just add things to the tables. And in fact, I think this is the exact 
workload that Jeff complains about - doing two hundred "update table" 
commands.

> Userspace submits a table, checkentry is called for all rules in the new 
> table, things are swapped, then destroy is called for all rules in the 
> old table. By that logic (which existed since dawn I think), only the 
> swap operation needs to be locked.

The problem is, the new code makes the "wait after swap" thing happen 
after every switch. And if you do two hundred "update table" commands, you 
now take a _long_ time to update.

Sure, you could tell people to just do everything as one single table 
update, but that isn't what they do.

		Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  6:00           ` David Miller
@ 2009-04-11 18:12               ` Kyle Moffett
  2009-04-12 16:38             ` Jan Engelhardt
  1 sibling, 0 replies; 254+ messages in thread
From: Kyle Moffett @ 2009-04-11 18:12 UTC (permalink / raw)
  To: David Miller
  Cc: jengelh, paulmck, torvalds, mingo, laijs, shemminger,
	jeff.chua.linux, dada1, kaber, r000n, linux-kernel,
	netfilter-devel, netdev

On Sat, Apr 11, 2009 at 2:00 AM, David Miller <davem@davemloft.net> wrote:
> From: Jan Engelhardt <jengelh@medozas.de>
> Date: Sat, 11 Apr 2009 07:14:50 +0200 (CEST)
>
>> The fact that `iptables -A` is called a hundred times means you are
>> doing 100 table replacements -- instead of one. And calling
>> synchronize_net at least a 100 times.
>>
>> "Wanna use iptables-restore?"
>
> I want to derail this line of thinking as fast as possible.
>
> This is not an acceptable response to this problem.  We made something
> fundamentally slower by several orders of magnitude.
>
> Therefore, saying "Don't insert your firewall rules like that." is not
> a valid response for this regression.
>
> We really have to fix it or revert.

Let me start by saying that I agree that for most systems this patch
provided a bad performance tradeoff that needs to get fixed.

On the other hand I have certain systems where I would much rather
reduce the per-packet load by a few percent...  even if it increases
the effort to load a new ruleset by many orders of magnitude!!!  Quite
simply the boxes only reboot a few times a year but in-between times
they forward many terabytes of low-latency network traffic.

So... to play devil's advocate:

Almost all of the standard firewall tools (such as shorewall, etc) are
already using iptables-restore command to load firewall rules,
primarily because using separate iptables commands was *already* way
too slow.  There's also the serious race-condition of doing a firewall
restart that way where you only have half your rules loaded for a bit.
 The "iptables" command is fine for fiddling around with the command
line and making minor tweaks, but it simply doesn't cut it for
large-scale rules.

I remember when switching from a shell-based shorewall to a perl-based
shorewall.  The time to build my rule lists with the perl-based
version was about 20% of what it had been, but the time to load the
rules into the kernel with iptables-restore was easily 2% or perhaps
less.

Finally, if you really are loading a couple hundred IPs into a linear
ruleset, you're adding a fair amount of packet latency to each and
every packet that goes through totally independent of iptables load
time.  It would be much better to use ipsets (or similar) because they
load all of the IP ranges into an appropriate tree data structure with
O(small-constant * log(N) + large-constant * 1) lookup instead of
O(large-constant * N).

Cheers,
Kyle Moffett

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 18:12               ` Kyle Moffett
@ 2009-04-11 18:32                 ` Arkadiusz Miskiewicz
  -1 siblings, 0 replies; 254+ messages in thread
From: Arkadiusz Miskiewicz @ 2009-04-11 18:32 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: David Miller, jengelh, paulmck, torvalds, mingo, laijs,
	shemminger, jeff.chua.linux, dada1, kaber, r000n, linux-kernel,
	netfilter-devel, netdev

On Saturday 11 of April 2009, Kyle Moffett wrote:

> Almost all of the standard firewall tools (such as shorewall, etc) are
> already using iptables-restore command to load firewall rules,
> primarily because using separate iptables commands was *already* way
> too slow.  

Some time ago there was a batch patch that allowed the standard shell format
of calling iptables to be used while applying everything at once:

http://lists.netfilter.org/pipermail/netfilter-devel/2004-September/016704.html

It didn't get merged - no idea why.

-- 
Arkadiusz Miśkiewicz        PLD/Linux Team
arekm / maven.pl            http://ftp.pld-linux.org/


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  4:15       ` Paul E. McKenney
                           ` (2 preceding siblings ...)
  2009-04-11 15:50         ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Stephen Hemminger
@ 2009-04-11 18:57         ` Linus Torvalds
  2009-04-12  0:34           ` Paul E. McKenney
  3 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-11 18:57 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: David Miller, Ingo Molnar, Lai Jiangshan, shemminger,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev



On Fri, 10 Apr 2009, Paul E. McKenney wrote:
> 
> 1.	Assuming that the synchronize_net() is intended to guarantee
> 	that the new rules will be in effect before returning to
> 	user space:

Btw, I think that's a bad assumption.

The thing is, nobody can really care if the new rules are in effect or 
not, because the thing you race with is not the "return to user space" 
part, but the incoming packets.

And those incoming packets might have been incoming before the rules were 
set up too.

So I seriously doubt you need to synchronize with any returning to user 
space. What you want to synchronize with is then later actions that do 
things like turning on the interface that the rules are attached to etc!

So I would suggest:

 - remove the synchronize_net() entirely. Replace it with just freeing the 
   old rules using RCU.

 - new packets will always end up seeing the new rules. That includes the 
   case of somebody doing "ifconfig eth0 up" that enables a new source of 
   packets, so there are no real security issues.

 - if you enabled your network interfaces before you updated your packet 
   filtering rules, you already had a window where packets would come in 
   with the old rules, so doing a "synchronize_net()" in no way protects 
   against any race conditions anyway.

Am I missing something?
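
A minimal sketch of that replacement, assuming an rcu_head is added to the
table-info structure (the structure below is a stand-in for, not the real,
struct xt_table_info):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Stand-in for struct xt_table_info with an rcu_head added (assumption). */
struct xt_table_info_sketch {
	unsigned int size;
	struct rcu_head rcu;
	/* ... per-cpu entry pointers as in the real structure ... */
};

static void table_info_rcu_free(struct rcu_head *head)
{
	struct xt_table_info_sketch *info =
		container_of(head, struct xt_table_info_sketch, rcu);

	/* No RCU reader can still hold a reference to the old table. */
	kfree(info);
}

/* Replaces the old "synchronize_net(); then free the old table" pattern. */
static void retire_table_info(struct xt_table_info_sketch *old)
{
	call_rcu(&old->rcu, table_info_rcu_free);
}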

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 18:57         ` Linus Torvalds
@ 2009-04-12  0:34           ` Paul E. McKenney
  2009-04-12  7:23             ` Evgeniy Polyakov
  2009-04-12 16:06             ` Stephen Hemminger
  0 siblings, 2 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-12  0:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Ingo Molnar, Lai Jiangshan, shemminger,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, Apr 11, 2009 at 11:57:16AM -0700, Linus Torvalds wrote:
> 
> 
> On Fri, 10 Apr 2009, Paul E. McKenney wrote:
> > 
> > 1.	Assuming that the synchronize_net() is intended to guarantee
> > 	that the new rules will be in effect before returning to
> > 	user space:
> 
> Btw, I think that's a bad assumption.

It does indeed appear to be!

> The thing is, nobody can really care if the new rules are in effect or 
> not, because the thing you race with is not the "return to user space" 
> part, but the incoming packets.
> 
> And those incoming packets might have been incoming before the rules were 
> set up too.
> 
> So I seriously doubt you need to synchronize with any returning to user 
> space. What you want to synchronize with is then later actions that do 
> things like turning on the interface that the rules are attached to etc!
> 
> So I would suggest:
> 
>  - remove the synchronize_net() entirely. Replace it with just freeing the 
>    old rules using RCU.
> 
>  - new packets will always end up seeing the new rules. That includes the 
>    case of somebody doing "ifconfig eth0 up" that enables a new source of 
>    packets, so there are no real security issues.
> 
>  - if you enabled your network interfaces before you updated your packet 
>    filtering rules, you already had a window where packets would come in 
>    with the old rules, so doing a "synchronize_net()" in no way protects 
>    against any race conditions anyway.
> 
> Am I missing something?

The issue at this point seems to be the need to get accurate snapshots
of various counters -- there are a number of Linux networking users who
need to account for every byte flowing through their systems.  However,
it is also necessary to update these counters very efficiently, given
that they are updated on a per-packet basis.  The current approach is
as follows:

1.	Install a new set of counters.

2.	Wait for a grace period to elapse.

3.	At this point, we know that all subsequent counting will happen
	on the new set of counters.

4.	Add the value of the old set of counters to the new set of
	counters.

5.	Copy the old set of counters up to user space.

So we get a good snapshot in #5, while #4 ensures that we don't lose
any counts when taking future snapshots.  Unfortunately, #2 hits us
with grace-period latencies on the critical path.
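
To make that cost visible, the sequence above looks roughly like the
following sketch; it uses a flat u64 array and made-up names where the real
ip_tables code uses per-cpu xt_counters:

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

static u64 *live_counters;	/* updated by the packet path under RCU */

static int replace_and_snapshot(u64 *snap, unsigned int n)
{
	u64 *newc, *oldc;
	unsigned int i;

	newc = kcalloc(n, sizeof(*newc), GFP_KERNEL);
	if (!newc)
		return -ENOMEM;

	oldc = live_counters;
	rcu_assign_pointer(live_counters, newc);	/* step 1 */

	synchronize_rcu();				/* step 2: the expensive wait */

	/* step 3: every CPU is now updating newc */
	for (i = 0; i < n; i++)
		newc[i] += oldc[i];			/* step 4 */

	memcpy(snap, oldc, n * sizeof(*snap));		/* step 5 (copy_to_user() in reality) */

	kfree(oldc);
	return 0;
}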

We are going through the following possibilities:

o	Stick with the current approach, and ask people to move to
	new batch-oriented interfaces.  However, a 30x decrease in
	performance is pretty grim, even for an old-style interface.

o	Use various atomic tricks to get an immediate snapshot of the
	old counters after step 1.  Make step 3 use call_rcu() instead
	of synchronize_rcu(), and then step 4 happens off the
	critical path.

	This approach moves the RCU grace period off of the critical
	path, but the atomic tricks are extremely ugly on 32-bit SMP
	machines.  32-bit UP machines and 64-bit machines are not
	too bad, though the 32-bit UP case does add preemption-disable
	overhead on the counter-update fastpath.

o	Provide some sort of expedited synchronize_rcu().  This might
	be able to decrease the hit from 30x down to maybe 5x.
	But I might need to do this for the fast-boot folks anyway,
	though I am first trying to get away with just speeding
	up synchronize_rcu().  Though I was not thinking in terms
	of 6x, let alone 30x.

	Please note that this would not be a drop-in replacement for
	synchronize_rcu().  One would use synchronize_rcu_expedited()
	(or whatever) only when the system really could not get any
	useful work done while the grace period was in progress.
	The general approach would be to keep the whole machine busy
	trying to get the grace period done as soon as possible.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 18:12               ` Kyle Moffett
  (?)
  (?)
@ 2009-04-12  0:54               ` david
  2009-04-12  5:05                   ` Kyle Moffett
  2009-04-12 12:30                 ` Harald Welte
  -1 siblings, 2 replies; 254+ messages in thread
From: david @ 2009-04-12  0:54 UTC (permalink / raw)
  To: Kyle Moffett
  Cc: David Miller, jengelh, paulmck, torvalds, mingo, laijs,
	shemminger, jeff.chua.linux, dada1, kaber, r000n, linux-kernel,
	netfilter-devel, netdev

On Sat, 11 Apr 2009, Kyle Moffett wrote:

> On Sat, Apr 11, 2009 at 2:00 AM, David Miller <davem@davemloft.net> wrote:
>> From: Jan Engelhardt <jengelh@medozas.de>
>> Date: Sat, 11 Apr 2009 07:14:50 +0200 (CEST)
>>
>>> The fact that `iptables -A` is called a hundred times means you are
>>> doing 100 table replacements -- instead of one. And calling
>>> synchronize_net at least a 100 times.
>>>
>>> "Wanna use iptables-restore?"
>>
>> I want to derail this line of thinking as fast as possible.
>>
>> This is not an acceptable response to this problem.  We made something
>> fundamentally slower by several orders of magnitude.
>>
>> Therefore, saying "Don't insert your firewall rules like that." is not
>> a valid response for this regression.
>>
>> We really have to fix it or revert.
>
> Let me start by saying that I agree that for most systems this patch
> provided a bad performance tradeoff that needs to get fixed.
>
> On the other hand I have certain systems where I would much rather
> reduce the per-packet load by a few percent...  even if it increases
> the effort to load a new ruleset by many orders of magnitude!!!  Quite
> simply the boxes only reboot a few times a year but in-between times
> they forward many terabytes of low-latency network traffic.
>
> So... to play devil's advocate:
>
> Almost all of the standard firewall tools (such as shorewall, etc) are
> already using iptables-restore command to load firewall rules,
> primarily because using separate iptables commands was *already* way
> too slow.  There's also the serious race-condition of doing a firewall
> restart that way where you only have half your rules loaded for a bit.
> The "iptables" command is fine for fiddling around with the command
> line and making minor tweaks, but it simply doesn't cut it for
> large-scale rules.

what are the userspace level tools that I am supposed to use in place of 
my current process? (which is to have a script that 1. stops traffic, 2. 
executes the iptables commands to create the rules that I want, then 3. 
enables traffic)

iptables-restore only works if you are actually restoring the old set of 
rules. If you need to change the rules, that doesn't work.

David Lang

> I remember when switching from a shell-based shorewall to a perl-based
> shorewall.  The time to build my rule lists with the perl-based
> version was about 20% of what it had been, but the time to load the
> rules into the kernel with iptables-restore was easily 2% or perhaps
> less.
>
> Finally, if you really are loading a couple hundred IPs into a linear
> ruleset, you're adding a fair amount of packet latency to each and
> every packet that goes through totally independent of iptables load
> time.  It would be much better to use ipsets (or similar) because they
> load all of the IP ranges into an appropriate tree data structure with
> O(small-constant * log(N) + large-constant * 1) lookup instead of
> O(large-constant * N).
>
> Cheers,
> Kyle Moffett

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12  0:54               ` david
@ 2009-04-12  5:05                   ` Kyle Moffett
  2009-04-12 12:30                 ` Harald Welte
  1 sibling, 0 replies; 254+ messages in thread
From: Kyle Moffett @ 2009-04-12  5:05 UTC (permalink / raw)
  To: david
  Cc: David Miller, jengelh, paulmck, torvalds, mingo, laijs,
	shemminger, jeff.chua.linux, dada1, kaber, r000n, linux-kernel,
	netfilter-devel, netdev

On Sat, Apr 11, 2009 at 8:54 PM,  <david@lang.hm> wrote:
> On Sat, 11 Apr 2009, Kyle Moffett wrote:
>> Almost all of the standard firewall tools (such as shorewall, etc) are
>> already using iptables-restore command to load firewall rules,
>> primarily because using separate iptables commands was *already* way
>> too slow.  There's also the serious race-condition of doing a firewall
>> restart that way where you only have half your rules loaded for a bit.
>> The "iptables" command is fine for fiddling around with the command
>> line and making minor tweaks, but it simply doesn't cut it for
>> large-scale rules.
>
> what are the userspace level tools that I am supposed to use in place of my
> current process? (which is to have a script that 1. stops traffic, 2.
> executes the iptables commands to create the rules that I want, then 3.
> enables traffic)
>
> iptables-restore only works if you are actually restoring the old set of
> rules. If you need to change the rules, that doesn't work.

Not true...  The iptables-restore format is pretty well documented and
not far off the standard command-line argument format.  For instance
the "shorewall" tool does a sort of "compile" of its high-level
firewall language into an input file for the "iptables-restore"
command.

The basic format to atomically load a table is:
*tablename
:CHAINNAME DEFAULTACTION [packets:bytes]
:ANOTHERCHAIN ANOTHERACTION [packets:bytes]
:customchain - [packets:bytes]
-A SOMECHAIN --rule-arguments-here
-A customchain --rule-arguments-here
COMMIT

At the end of this email you can find some sample data cut-n-pasted
from the iptables-restore file from one of my boxes running shorewall.
 The full file is 645 lines but takes at most a second or so to load
once compiled.

You could also do an iptables-save -c on one of your configured
systems to see what various constructions you use look like in the
iptables format.  It's all pretty straightforward.

Cheers,
Kyle Moffett


*raw
:PREROUTING ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
COMMIT
*nat
:PREROUTING ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
:POSTROUTING ACCEPT [0:0]
:excl0 - [0:0]
:excl1 - [0:0]
:excl2 - [0:0]
:world_masq - [0:0]
-A POSTROUTING -o world -j world_masq
-A excl0 -d 10.0.0.0/8 -j RETURN
-A excl0 -d 192.168.0.0/16 -j RETURN
-A excl0 -d 172.16.0.0/12 -j RETURN
-A excl0 -j MASQUERADE
-A excl1 -d 10.0.0.0/8 -j RETURN
-A excl1 -d 192.168.0.0/16 -j RETURN
-A excl1 -d 172.16.0.0/12 -j RETURN
-A excl1 -j MASQUERADE
-A excl2 -d 10.0.0.0/8 -j RETURN
-A excl2 -d 192.168.0.0/16 -j RETURN
-A excl2 -d 172.16.0.0/12 -j RETURN
-A excl2 -j MASQUERADE
-A world_masq -s 10.0.0.0/8 -j excl0
-A world_masq -s 172.16.0.0/12 -j excl1
-A world_masq -s 192.168.0.0/16 -j excl2
COMMIT
*mangle
:PREROUTING ACCEPT [0:0]
:INPUT ACCEPT [0:0]
:FORWARD ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
:POSTROUTING ACCEPT [0:0]
:fortos - [0:0]
:tcfor - [0:0]
:tcout - [0:0]
:tcpost - [0:0]
:tcpre - [0:0]
-A PREROUTING  -j tcpre
-A FORWARD -j tcfor
-A FORWARD -j fortos
-A OUTPUT  -j tcout
-A POSTROUTING -j tcpost
-A fortos -p 17 -i dmz -s 72.214.41.2 -j TOS --set-tos 16
-A fortos -p 17 -o dmz -d 72.214.41.2 -j TOS --set-tos 16
COMMIT
*filter
:INPUT DROP [0:0]
:FORWARD DROP [0:0]
:OUTPUT DROP [0:0]
:Drop - [0:0]
:Reject - [0:0]
[...many more lines snipped...]

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12  0:34           ` Paul E. McKenney
@ 2009-04-12  7:23             ` Evgeniy Polyakov
  2009-04-12 16:06             ` Stephen Hemminger
  1 sibling, 0 replies; 254+ messages in thread
From: Evgeniy Polyakov @ 2009-04-12  7:23 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	shemminger, jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

Hi.

On Sat, Apr 11, 2009 at 05:34:45PM -0700, Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> The issue at this point seems to be the need to get accurate snapshots
> of various counters -- there are a number of Linux networking users who
> need to account for every byte flowing through their systems.  However,

If we add or change a rule, we cannot know whether iptables' return to
userspace means that the rule has started to act. There may be other queues
already filled with packets that could match the new rule (such as the
receiving socket buffer). So effectively we only need the rule to take effect
very soon. What if there were a timer that synchronized the RCU-added state:
if we update a single rule it takes effect within a second, and if we update
a bunch of them, we can pretty much load them all during that one-second
delay.
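
A rough sketch of that batching idea using a delayed work item, so that any
number of rule updates within the window share one grace period (the names
and the one-second window are illustrative, not existing code):

#include <linux/jiffies.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

static void table_sync_work(struct work_struct *work)
{
	synchronize_rcu();	/* one grace period covers the whole batch */
	/* ...fold retired counter sets, free retired tables, etc... */
}

static DECLARE_DELAYED_WORK(table_sync, table_sync_work);

/* Called after each rule update instead of waiting synchronously. */
static void kick_deferred_sync(void)
{
	/* Does nothing if already queued, so N updates share one run. */
	schedule_delayed_work(&table_sync, HZ);
}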

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 17:48           ` Paul E. McKenney
@ 2009-04-12 10:54             ` Ingo Molnar
  2009-04-12 11:34             ` Paul Mackerras
  1 sibling, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-12 10:54 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Linus Torvalds, David Miller, Lai Jiangshan, shemminger,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev, paulus, benh


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> On Sat, Apr 11, 2009 at 09:08:54AM +0200, Ingo Molnar wrote:
> > 
> > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > 
> > > 	I will nevertheless suggest the following egregious hack to
> > > 	get a consistent sample of one counter for some other CPU:
> > > 
> > > 	a.	Disable interrupts
> > > 	b.	Atomically exchange the bottom 32 bits of the
> > > 		counter with the value zero.
> > > 	c.	Atomically exchange the top 32 bits of the counter
> > > 		with the value zero.
> > > 	d.	Concatenate the values obtained in (b) and (c), which
> > > 		is the snapshot value.
> > 
> > Note, i have recently implemented full atomic64_t support on 32-bit 
> > x86, for the perfcounters code, based on the CMPXCHG8B instruction.
> > 
> > Which, while not the lightest of instructions, is still much better 
> > than the sequence above.
> > 
> > So i think a better approach would be to also add a dumb generic 
> > implementation for atomic64_t (using a global lock or so), and then 
> > generic code could just assume that atomic64_t always exists.
> > 
> > It is far nicer - and faster as well - than the hack above, even on 
> > 32-bit x86.
> 
> If the generic implementation is needed only on !SMP systems, that 
> could work.  The architectures I would be worried about include 
> powerpc and ia64, which I believe support 32-bit SMP builds.

ia64 would naturally support the CMPXCHG8B instructions.

Not sure about powerpc32. Having a lock for the library 
implementation is not _that_ much of a problem. We obviously don't 
want the design of Linux to be dictated by the weakest link of all 
platforms, right?

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11 17:48           ` Paul E. McKenney
  2009-04-12 10:54             ` Ingo Molnar
@ 2009-04-12 11:34             ` Paul Mackerras
  2009-04-12 17:31               ` Paul E. McKenney
  1 sibling, 1 reply; 254+ messages in thread
From: Paul Mackerras @ 2009-04-12 11:34 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, Linus Torvalds, David Miller, Lai Jiangshan,
	shemminger, jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev, benh

Paul E. McKenney writes:

> If the generic implementation is needed only on !SMP systems, that
> could work.  The architectures I would be worried about include
> powerpc and ia64, which I believe support 32-bit SMP builds.

32-bit powerpc doesn't have 64-bit atomic operations and does support
SMP.

What about ARM?  I thought they had 32-bit SMP these days as well.

Paul.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12  0:54               ` david
  2009-04-12  5:05                   ` Kyle Moffett
@ 2009-04-12 12:30                 ` Harald Welte
  1 sibling, 0 replies; 254+ messages in thread
From: Harald Welte @ 2009-04-12 12:30 UTC (permalink / raw)
  To: david
  Cc: Kyle Moffett, David Miller, jengelh, paulmck, torvalds, mingo,
	laijs, shemminger, jeff.chua.linux, dada1, kaber, r000n,
	linux-kernel, netfilter-devel, netdev

Hi all,

On Sat, Apr 11, 2009 at 05:54:41PM -0700, david@lang.hm wrote:
>> Almost all of the standard firewall tools (such as shorewall, etc) are
>> already using iptables-restore command to load firewall rules,
>> primarily because using separate iptables commands was *already* way
>> too slow.  There's also the serious race-condition of doing a firewall
>> restart that way where you only have half your rules loaded for a bit.
>> The "iptables" command is fine for fiddling around with the command
>> line and making minor tweaks, but it simply doesn't cut it for
>> large-scale rules.
>
> what are the userspace level tools that I am supposed to use in place of  
> my current process? (which is to have a script that 1. stops traffic, 2.  
> executes the iptables commands to create the rules that I want, then 3.  
> enables traffic)
>
> iptables-restore only works if you are actually restoring the old set of  
> rules. If you need to change the rules, that doesn't work.

That's what I implemented as "iptables-restore --noflush" a number of years
ago. Rather than flushing the current ruleset and swapping in a new one, it
reads the current rules from the kernel, applies any number of changes and
swaps the resulting ruleset in.

The syntax of iptables-restore is almost identical to that of iptables
commands; you just specify the table in a different way.  So you would create
your desired changes in that format and echo them into iptables-restore.  If
it's an entirely new ruleset, you omit '--noflush' and all old rules are
flushed automatically.  If your stdin file contains only incremental changes,
you use --noflush.

In the netfilter project, we have known for many years that the 'swap the
entire table in atomically' approach is a bad design choice.  This is what
various developers have been trying to address at different times, and it
finally resulted in the nftables implementation by Patrick McHardy.  So for
the mid- to long term there is a clear design that moves away from that.

But so far, we have to live with the API and its semantics.  iptables userspace
has been improved a number of times, and things like iptables-restore with or
without --noflush can be used as an intermediate solution - and have been used
by many systems out there.

-- 
- Harald Welte <laforge@netfilter.org>                 http://netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12  0:34           ` Paul E. McKenney
  2009-04-12  7:23             ` Evgeniy Polyakov
@ 2009-04-12 16:06             ` Stephen Hemminger
  2009-04-12 17:30               ` Paul E. McKenney
  1 sibling, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-12 16:06 UTC (permalink / raw)
  To: paulmck
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sat, 11 Apr 2009 17:34:45 -0700
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> On Sat, Apr 11, 2009 at 11:57:16AM -0700, Linus Torvalds wrote:
> > 
> > 
> > On Fri, 10 Apr 2009, Paul E. McKenney wrote:
> > > 
> > > 1.	Assuming that the synchronize_net() is intended to guarantee
> > > 	that the new rules will be in effect before returning to
> > > 	user space:
> > 
> > Btw, I think that's a bad assumption.
> 
> It does indeed appear to be!
> 
> > The thing is, nobody can really care if the new rules are in effect or 
> > not, because the thing you race with is not the "return to user space" 
> > part, but the incoming packets.
> > 
> > And those incoming packets might have been incoming before the rules were 
> > set up too.
> > 
> > So I seriously doubt you need to synchronize with any returning to user 
> > space. What you want to synchronize with is then later actions that do 
> > things like turning on the interface that the rules are attached to etc!
> > 
> > So I would suggest:
> > 
> >  - remove the synchronize_net() entirely. Replace it with just freeing the 
> >    old rules using RCU.
> > 
> >  - new packets will always end up seeing the new rules. That includes the 
> >    case of somebody doing "ifconfig eth0 up" that enables a new source of 
> >    packets, so there are no real security issues.
> > 
> >  - if you enabled your network interfaces before you updated your packet 
> >    filtering rules, you already had a window where packets would come in 
> >    with the old rules, so doing a "synchronize_net()" in no way protects 
> >    against any race conditions anyway.
> > 
> > Am I missing something?
> 
> The issue at this point seems to be the need to get accurate snapshots
> of various counters -- there are a number of Linux networking users who
> need to account for every byte flowing through their systems.  However,
> it is also necessary to update these counters very efficiently, given
> that they are updated on a per-packet basis.  The current approach is
> as follows:
> 
> 1.	Install a new set of counters.
> 
> 2.	Wait for a grace period to elapse.
> 
> 3.	At this point, we know that all subsequent counting will happen
> 	on the new set of counters.
> 
> 4.	Add the value of the old set of counters to the new set of
> 	counters.
> 
> 5.	Copy the old set of counters up to user space.
> 
> So we get a good snapshot in #5, while #4 ensures that we don't lose
> any counts when taking future snapshots.  Unfortunately, #2 hits us
> with grace-period latencies on the critical path.
> 
> We are going through the following possibilities:
> 
> o	Stick with the current approach, and ask people to move to
> 	new batch-oriented interfaces.  However, a 30x decrease in
> 	performance is pretty grim, even for an old-style interface.
> 
> o	Use various atomic tricks to get an immediate snapshot of the
> 	old counters after step 1.  Make step 3 use call_rcu() instead
> 	of synchronize_rcu(), and then step 4 happens off the
> 	critical path.
> 
> 	This approach moves the RCU grace period off of the critical
> 	path, but the atomic tricks are extremely ugly on 32-bit SMP
> 	machines.  32-bit UP machines and 64-bit machines are not
> 	too bad, though the 32-bit UP case does add preemption-disable
> 	overhead on the counter-update fastpath.
> 
> o	Provide some sort of expedited synchronize_rcu().  This might
> 	be able to decrease the hit from 30x down to maybe 5x.
> 	But I might need to do this for the fast-boot folks anyway,
> 	though I am first trying to get away with just speeding
> 	up synchronize_rcu().  Though I was not thinking in terms
> 	of 6x, let alone 30x.
> 
> 	Please note that this would not be a drop-in replacement for
> 	synchronize_rcu().  One would use synchronize_rcu_expedited()
> 	(or whatever) only when the system really could not get any
> 	useful work done while the grace period was in progress.
> 	The general approach would be to keep the whole machine busy
> 	trying to get the grace period done as soon as possible.
> 
> 							Thanx, Paul

We could also try:
  * A per-cpu spinlock on the counters (instead of synchronize_net).
    When doing an update, just acquire the lock on that CPU and futz
    with the counters there. Overhead should still be less than the
    global rwlock in 2.6.29 and earlier. (A rough sketch follows
    this list.)

  * synchronize_rcu/synchronize_net is a stronger guarantee than needed?

  * Use on_each_cpu() somehow to do the grace period?

  * Add a cond_resched() into net_rx_action, which might cause rx processing
    to get out of RCU sooner? Likewise in the transmit packet scheduler.
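
A rough sketch of the per-cpu lock idea from the first bullet above; all
names are made up for illustration and this is not actual netfilter code:

#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_PER_CPU(spinlock_t, counter_lock);
static DEFINE_PER_CPU(u64, byte_counter);

static int __init counter_locks_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		spin_lock_init(&per_cpu(counter_lock, cpu));
	return 0;
}

/* Packet fast path: touch only this CPU's lock and counter. */
static void count_packet_bytes(unsigned int bytes)
{
	spinlock_t *lock;

	local_bh_disable();		/* stay on this CPU, as in softirq context */
	lock = this_cpu_ptr(&counter_lock);
	spin_lock(lock);
	*this_cpu_ptr(&byte_counter) += bytes;
	spin_unlock(lock);
	local_bh_enable();
}

/* Slow path (rule replace / counter read): visit each CPU under its lock. */
static u64 read_all_counters(void)
{
	u64 sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		spinlock_t *lock = &per_cpu(counter_lock, cpu);

		spin_lock_bh(lock);
		sum += per_cpu(byte_counter, cpu);
		spin_unlock_bh(lock);
	}
	return sum;
}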

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-11  6:00           ` David Miller
  2009-04-11 18:12               ` Kyle Moffett
@ 2009-04-12 16:38             ` Jan Engelhardt
  1 sibling, 0 replies; 254+ messages in thread
From: Jan Engelhardt @ 2009-04-12 16:38 UTC (permalink / raw)
  To: David Miller
  Cc: paulmck, torvalds, mingo, laijs, shemminger, jeff.chua.linux,
	dada1, kaber, r000n, linux-kernel, netfilter-devel, netdev


On Saturday 2009-04-11 08:00, David Miller wrote:
>From: Jan Engelhardt
>Date: Sat, 11 Apr 2009 07:14:50 +0200 (CEST)
>
>> The fact that `iptables -A` is called a hundred times means you are 
>> doing 100 table replacements -- instead of one. And calling
>> synchronize_net at least a 100 times.
>> 
>> "Wanna use iptables-restore?"
>
>I want to derail this line of thinking as fast as possible.
>
>This is not an acceptable response to this problem.  We made something
>fundamentally slower by several orders of magnitude.
>
>Therefore, saying "Don't insert your firewall rules like that." is not
>a valid response for this regression.
>
>We really have to fix it or revert.
>
Well, there is an extra tool in SUSE's iptables, which collects
rules added this way, and then commits them in one go when you
are done. Perhaps that is an "adequate" way?

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12 16:06             ` Stephen Hemminger
@ 2009-04-12 17:30               ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-12 17:30 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Linus Torvalds, David Miller, Ingo Molnar, Lai Jiangshan,
	jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev

On Sun, Apr 12, 2009 at 09:06:03AM -0700, Stephen Hemminger wrote:
> On Sat, 11 Apr 2009 17:34:45 -0700
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Sat, Apr 11, 2009 at 11:57:16AM -0700, Linus Torvalds wrote:
> > > 
> > > 
> > > On Fri, 10 Apr 2009, Paul E. McKenney wrote:
> > > > 
> > > > 1.	Assuming that the synchronize_net() is intended to guarantee
> > > > 	that the new rules will be in effect before returning to
> > > > 	user space:
> > > 
> > > Btw, I think that's a bad assumption.
> > 
> > It does indeed appear to be!
> > 
> > > The thing is, nobody can really care if the new rules are in effect or 
> > > not, because the thing you race with is not the "return to user space" 
> > > part, but the incoming packets.
> > > 
> > > And those incoming packets might have been incoming before the rules were 
> > > set up too.
> > > 
> > > So I seriously doubt you need to synchronize with any returning to user 
> > > space. What you want to synchronize with is then later actions that do 
> > > things like turning on the interface that the rules are attached to etc!
> > > 
> > > So I would suggest:
> > > 
> > >  - remove the synchronize_net() entirely. Replace it with just freeing the 
> > >    old rules using RCU.
> > > 
> > >  - new packets will always end up seeing the new rules. That includes the 
> > >    case of somebody doing "ifconfig eth0 up" that enables a new source of 
> > >    packets, so there are no real security issues.
> > > 
> > >  - if you enabled your network interfaces before you updated your packet 
> > >    filtering rules, you already had a window where packets would come in 
> > >    with the old rules, so doing a "synchronize_net()" in no way protects 
> > >    against any race conditions anyway.
> > > 
> > > Am I missing something?
> > 
> > The issue at this point seems to be the need to get accurate snapshots
> > of various counters -- there are a number of Linux networking users who
> > need to account for every byte flowing through their systems.  However,
> > it is also necessary to update these counters very efficiently, given
> > that they are updated on a per-packet basis.  The current approach is
> > as follows:
> > 
> > 1.	Install a new set of counters.
> > 
> > 2.	Wait for a grace period to elapse.
> > 
> > 3.	At this point, we know that all subsequent counting will happen
> > 	on the new set of counters.
> > 
> > 4.	Add the value of the old set of counters to the new set of
> > 	counters.
> > 
> > 5.	Copy the old set of counters up to user space.
> > 
> > So we get a good snapshot in #5, while #4 ensures that we don't lose
> > any counts when taking future snapshots.  Unfortunately, #2 hits us
> > with grace-period latencies on the critical path.
> > 
> > We are going through the following possibilities:
> > 
> > o	Stick with the current approach, and ask people to move to
> > 	new batch-oriented interfaces.  However, a 30x decrease in
> > 	performance is pretty grim, even for an old-style interface.
> > 
> > o	Use various atomic tricks to get an immediate snapshot of the
> > 	old counters after step 1.  Make step 3 use call_rcu() instead
> > 	of synchronize_rcu(), and then step 4 happens off the
> > 	critical path.
> > 
> > 	This approach moves the RCU grace period off of the critical
> > 	path, but the atomic tricks are extremely ugly on 32-bit SMP
> > 	machines.  32-bit UP machines and 64-bit machines are not
> > 	too bad, though the 32-bit UP case does add preemption-disable
> > 	overhead on the counter-update fastpath.
> > 
> > o	Provide some sort of expedited synchronize_rcu().  This might
> > 	be able to decrease the hit from 30x down to maybe 5x.
> > 	But I might need to do this for the fast-boot folks anyway,
> > 	though I am first trying to get away with just speeding
> > 	up synchronize_rcu().  Though I was not thinking in terms
> > 	of 6x, let alone 30x.
> > 
> > 	Please note that this would not be a drop-in replacement for
> > 	synchronize_rcu().  One would use synchronize_rcu_expedited()
> > 	(or whatever) only when the system really could not get any
> > 	useful work done while the grace period was in progress.
> > 	The general approach would be to keep the whole machine busy
> > 	trying to get the grace period done as soon as possible.
> > 
> > 							Thanx, Paul
> 
> We could also try:
>   * per-cpu spinlock on counters (instead of synchronize_net). 
>     When doing update, just acquire
>     lock on that cpu and futz with counters then. Overhead should
>     still be less than 2.6.29 and earlier global rwlock

This one makes a lot of sense to me.  The overhead of an uncontended
lock is pretty small on most systems.  This would also mean that you
don't have to actually swap the counters, correct?
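
Roughly, this is the sort of thing being suggested -- a minimal sketch
only, with made-up names for the lock and helpers, and with the per-cpu
counter copies reached through an illustrative array rather than the
real xt_table_info layout:

static DEFINE_PER_CPU(spinlock_t, ipt_counters_lock);	/* needs spin_lock_init()
							   per cpu at init time */

/* packet fast path: only this cpu's lock, only this cpu's copy */
static void bump_counters(struct xt_counters *my_copy, unsigned int bytes)
{
	local_bh_disable();		/* pin to this cpu before the lookup */
	spin_lock(&__get_cpu_var(ipt_counters_lock));
	my_copy->pcnt++;
	my_copy->bcnt += bytes;
	spin_unlock(&__get_cpu_var(ipt_counters_lock));
	local_bh_enable();
}

/* user-space read (rare): take each cpu's lock in turn and fold that
 * cpu's private copy into the snapshot */
static void sum_counters(struct xt_counters *snap,
			 struct xt_counters **percpu_copies, unsigned int nr)
{
	unsigned int i;
	int cpu;

	for_each_possible_cpu(cpu) {
		spin_lock_bh(&per_cpu(ipt_counters_lock, cpu));
		for (i = 0; i < nr; i++) {
			snap[i].pcnt += percpu_copies[cpu][i].pcnt;
			snap[i].bcnt += percpu_copies[cpu][i].bcnt;
		}
		spin_unlock_bh(&per_cpu(ipt_counters_lock, cpu));
	}
}

In a scheme like this the reader walks every cpu's copy under that
cpu's lock, so nothing has to be swapped out to get a stable snapshot.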

>   * synchronize_rcu/synchronize_net is more of a guarantee than needed?

If you really do need to swap the counters themselves, you -might- also
need call_rcu() to dispose of them.  But it should be possible to do that
under the per-CPU lock instead.

>   * use on_each_cpu() somehow to do the grace period?

You could certainly use something like smp_call_function() to collect
the other CPUs' counter values -- just disable interrupts across the
increments for architectures that cannot atomically increment a 64-bit
value.  (And it only needs to be atomic with respect to an interrupt,
not necessarily to some other CPU.)
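
For instance (a rough sketch only -- the names are invented and this
ignores how the values would get back into the rule structures): keep a
per-cpu 64-bit total whose update is guarded only against local
interrupts, and have every cpu copy its value out when asked via
on_each_cpu():

static DEFINE_PER_CPU(u64, rule_bytes);

/* packet path: on 32-bit, the two halves of the add must not be torn
 * by an interrupt on this cpu, so mask local interrupts around it */
static void bump_bytes(unsigned int len)
{
	unsigned long flags;

	local_irq_save(flags);
	__get_cpu_var(rule_bytes) += len;
	local_irq_restore(flags);
}

/* runs on every cpu: copy the local value into the caller's array,
 * again with interrupts off so a consistent 64-bit value is read */
static void grab_local(void *info)
{
	u64 *slots = info;
	unsigned long flags;

	local_irq_save(flags);
	slots[smp_processor_id()] = __get_cpu_var(rule_bytes);
	local_irq_restore(flags);
}

/* reader: slots[] is caller-allocated, nr_cpu_ids entries */
static u64 snapshot_bytes(u64 *slots)
{
	u64 sum = 0;
	int cpu;

	on_each_cpu(grab_local, slots, 1);	/* wait for all cpus */
	for_each_online_cpu(cpu)
		sum += slots[cpu];
	return sum;
}

No cross-cpu atomics anywhere -- the only cost on the packet path is
the local irq disable/enable.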

>   * Add a cond_resched() into net_rx_action which might cause rx processing
>     to get out of rcu sooner?  Also in the transmit packet scheduler.

This might help some, but would probably only give a few tens of percent
improvement.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12 11:34             ` Paul Mackerras
@ 2009-04-12 17:31               ` Paul E. McKenney
  2009-04-13  1:13                 ` David Miller
  0 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-12 17:31 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Ingo Molnar, Linus Torvalds, David Miller, Lai Jiangshan,
	shemminger, jeff.chua.linux, dada1, jengelh, kaber, r000n,
	Linux Kernel Mailing List, netfilter-devel, netdev, benh

On Sun, Apr 12, 2009 at 09:34:27PM +1000, Paul Mackerras wrote:
> Paul E. McKenney writes:
> 
> > If the generic implementation is needed only on !SMP systems, that
> > could work.  The architectures I would be worried about include
> > powerpc and ia64, which I believe support 32-bit SMP builds.
> 
> 32-bit powerpc doesn't have 64-bit atomic operations and does support
> SMP.
> 
> What about ARM?  I thought they had 32-bit SMP these days as well.

Some of Steve Hemminger's recent suggestions in this thread seem to me
to avoid this whole issue nicely.  But we will see!  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-12 17:31               ` Paul E. McKenney
@ 2009-04-13  1:13                 ` David Miller
  2009-04-13  4:04                   ` Paul E. McKenney
  0 siblings, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-13  1:13 UTC (permalink / raw)
  To: paulmck
  Cc: paulus, mingo, torvalds, laijs, shemminger, jeff.chua.linux,
	dada1, jengelh, kaber, r000n, linux-kernel, netfilter-devel,
	netdev, benh

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 12 Apr 2009 10:31:08 -0700

> On Sun, Apr 12, 2009 at 09:34:27PM +1000, Paul Mackerras wrote:
>> Paul E. McKenney writes:
>> 
>> > If the generic implementation is needed only on !SMP systems, that
>> > could work.  The architectures I would be worried about include
>> > powerpc and ia64, which I believe support 32-bit SMP builds.
>> 
>> 32-bit powerpc doesn't have 64-bit atomic operations and does support
>> SMP.
>> 
>> What about ARM?  I thought they had 32-bit SMP these days as well.
> 
> Some of Steve Hemminger's recent suggestions in this thread seem to me
> to avoid this whole issue nicely.  But we will see!  ;-)

I hope so.

Eventually it seems that all of the older 32-bit SMP platforms
will be run under a bus, having to execute so many "efficient"
primitives using the "hash table of spinlocks" scheme for
synchronization.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
  2009-04-13  1:13                 ` David Miller
@ 2009-04-13  4:04                   ` Paul E. McKenney
  2009-04-13 16:53                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-13  4:04 UTC (permalink / raw)
  To: David Miller
  Cc: paulus, mingo, torvalds, laijs, shemminger, jeff.chua.linux,
	dada1, jengelh, kaber, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Sun, Apr 12, 2009 at 06:13:30PM -0700, David Miller wrote:
> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Date: Sun, 12 Apr 2009 10:31:08 -0700
> 
> > On Sun, Apr 12, 2009 at 09:34:27PM +1000, Paul Mackerras wrote:
> >> Paul E. McKenney writes:
> >> 
> >> > If the generic implementation is needed only on !SMP systems, that
> >> > could work.  The architectures I would be worried about include
> >> > powerpc and ia64, which I believe support 32-bit SMP builds.
> >> 
> >> 32-bit powerpc doesn't have 64-bit atomic operations and does support
> >> SMP.
> >> 
> >> What about ARM?  I thought they had 32-bit SMP these days as well.
> > 
> > Some of Steve Hemminger's recent suggestions in this thread seem to me
> > to avoid this whole issue nicely.  But we will see!  ;-)
> 
> I hope so.
> 
> Eventually it seems that all of the older 32-bit SMP platforms
> will be run under a bus, having to execute so many "efficient"
> primitives using the "hash table of spinlocks" scheme for
> synchronization.

Or, in some cases, per-CPU locks.  It might also be that 32-bit SMP
systems use stop-the-world approaches.

Or that we get a variant of RCU with shorter grace periods.  I don't
recommend holding your breath, but I have not given up on this.  For one
only slightly crazy example, some of the user-level RCU variants could be
adapted to in-kernel use.  Some of these have sub-microsecond grace-period
latencies, but also have some limitations that would make it difficult
for them to replace RCU wholesale in the Linux kernel.  And it is not
like we are suffering from any lack of distinct RCU implementations in
the Linux kernel just now.  :-/

However, they might be useful in isolated situations.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13  4:04                   ` Paul E. McKenney
@ 2009-04-13 16:53                     ` Stephen Hemminger
  2009-04-13 17:40                         ` Eric Dumazet
                                         ` (3 more replies)
  0 siblings, 4 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-13 16:53 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, paulus, mingo, torvalds, laijs, jeff.chua.linux,
	dada1, jengelh, kaber, r000n, linux-kernel, netfilter-devel,
	netdev, benh

This is an alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive rwlock in earlier versions.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  The slow case involves acquiring the locks on
all cpu's.

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Tested basic functionality (add/remove/list), but don't have test cases
for stress, ip6tables or arptables.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
Patch against 2.6.30-rc1

 include/linux/netfilter/x_tables.h |    5 -
 net/ipv4/netfilter/arp_tables.c    |  122 ++++++++++------------------------
 net/ipv4/netfilter/ip_tables.c     |  129 +++++++++++--------------------------
 net/ipv6/netfilter/ip6_tables.c    |  127 +++++++++++-------------------------
 net/netfilter/x_tables.c           |   21 ------
 5 files changed, 116 insertions(+), 288 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-13 08:27:45.698412446 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-13 08:34:05.499348295 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-13 08:27:45.684411824 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-13 09:12:54.295536361 -0700
@@ -297,6 +297,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -339,9 +341,10 @@ ipt_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +439,8 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -891,86 +894,34 @@ get_counters(const struct xt_table_info 
 	     struct xt_counters counters[])
 {
 	unsigned int cpu;
-	unsigned int i;
-	unsigned int curcpu;
+	unsigned int i = 0;
+	unsigned int curcpu = raw_smp_processor_id();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
 	 * We dont care about preemption here.
 	 */
-	curcpu = raw_smp_processor_id();
-
-	i = 0;
+	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
+
 		i = 0;
+		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
 }
 
@@ -979,7 +930,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -990,26 +940,10 @@ static struct xt_counters * alloc_counte
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1377,6 +1311,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1386,7 +1332,7 @@ do_add_counters(struct net *net, void __
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int curcpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1437,25 +1383,27 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	spin_lock(&__get_cpu_var(ip_tables_lock));
+	loc_cpu_entry = private->entries[curcpu];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2220,10 @@ static struct pernet_operations ip_table
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
--- a/net/netfilter/x_tables.c	2009-04-13 08:27:45.671412509 -0700
+++ b/net/netfilter/x_tables.c	2009-04-13 09:19:51.687537585 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -685,22 +671,18 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
+	table->private = newinfo;
 	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
 
-	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -734,7 +716,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-13 08:37:58.189723514 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-13 09:14:26.576349680 -0700
@@ -329,6 +329,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -365,9 +367,10 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +469,8 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -920,84 +924,33 @@ get_counters(const struct xt_table_info 
 	     struct xt_counters counters[])
 {
 	unsigned int cpu;
-	unsigned int i;
-	unsigned int curcpu;
+	unsigned int i = 0;
+	unsigned int curcpu = raw_smp_processor_id();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
 	 * We dont care about preemption here.
 	 */
-	curcpu = raw_smp_processor_id();
-
-	i = 0;
+	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
 }
 
@@ -1006,7 +959,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1017,26 +969,9 @@ static struct xt_counters *alloc_counter
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
-
+	get_counters(private, counters);
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1405,6 +1340,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1414,7 +1362,7 @@ do_add_counters(struct net *net, void __
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	char *name;
-	int size;
+	int curcpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1465,25 +1413,27 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2248,10 @@ static struct pernet_operations ip6_tabl
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-13 08:38:05.803598692 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-13 09:19:57.079411426 -0700
@@ -231,6 +231,8 @@ static inline struct arpt_entry *get_ent
 	return (struct arpt_entry *)(base + offset);
 }
 
+static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -253,9 +255,10 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(arp_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -328,8 +331,8 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	local_bh_enable();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -705,85 +708,33 @@ static void get_counters(const struct xt
 			 struct xt_counters counters[])
 {
 	unsigned int cpu;
-	unsigned int i;
-	unsigned int curcpu;
+	unsigned int i = 0;
+	unsigned int curcpu = raw_smp_processor_id();
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
 	 * We dont care about preemption here.
 	 */
-	curcpu = raw_smp_processor_id();
-
-	i = 0;
+	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
 }
 
@@ -792,7 +743,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -804,26 +754,10 @@ static struct xt_counters *alloc_counter
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1165,6 +1099,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1173,7 +1120,7 @@ static int do_add_counters(struct net *n
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int curcpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1224,25 +1171,26 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	local_bh_disable();
+	curcpu = smp_processor_id();
+	spin_lock(&__get_cpu_var(arp_tables_lock));
+	loc_cpu_entry = private->entries[curcpu];
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	local_bh_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 16:53                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
@ 2009-04-13 17:40                         ` Eric Dumazet
  2009-04-13 19:06                       ` Martin Josefsson
                                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-13 17:40 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, paulus, mingo, torvalds, laijs,
	jeff.chua.linux, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Stephen Hemminger wrote:
> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Tested basic functionality (add/remove/list), but don't have test cases
> for stress, ip6tables or arptables.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Patch seems good to me, but apparently xt_replace_table()
misses the "acquiring the locks on all cpus" you mentioned in the ChangeLog?

I am still off-computers until tomorrow so cannot provide a patch for this, sorry.

Some form of

local_bh_disable();
for_each_possible_cpu(cpu)
	spin_lock(&per_cpu(ip_tables_lock, cpu));

oldinfo = private;
/* do the substitution */
table->private = newinfo;
newinfo->initial_entries = oldinfo->initial_entries;

for_each_possible_cpu(cpu)
	spin_unlock(&per_cpu(ip_tables_lock, cpu));
local_bh_enable();


But I wonder if this could hit a limit of max spinlocks held by this cpu, say on a 4096 cpu machine ?




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
@ 2009-04-13 17:40                         ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-13 17:40 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, paulus, mingo, torvalds, laijs,
	jeff.chua.linux, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Stephen Hemminger wrote:
> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Tested basic functionality (add/remove/list), but don't have test cases
> for stress, ip6tables or arptables.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Patch seems good to me, but apparently xt_replace_table()
misses the "acquiring the locks on all cpus" you mentioned in the ChangeLog?

I am still off-computers until tomorrow so cannot provide a patch for this, sorry.

Some form of

local_bh_disable();
for_each_possible_cpu(cpu)
	spin_lock(&per_cpu(ip_tables_lock, cpu));

oldinfo = private;
/* do the substitution */
table->private = newinfo;
newinfo->initial_entries = oldinfo->initial_entries;

for_each_possible_cpu(cpu)
	spin_unlock(&per_cpu(ip_tables_lock, cpu));
local_bh_enable();


But I wonder if this could hit a limit of max spinlocks held by this cpu, say on a 4096 cpu machine ?




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 17:40                         ` Eric Dumazet
  (?)
@ 2009-04-13 18:11                         ` Stephen Hemminger
  -1 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-13 18:11 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, David Miller, paulus, mingo, torvalds, laijs,
	jeff.chua.linux, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Mon, 13 Apr 2009 19:40:24 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger wrote:
> > This is an alternative version of ip/ip6/arp tables locking using
> > per-cpu locks.  This avoids the overhead of synchronize_net() during
> > update but still removes the expensive rwlock in earlier versions.
> > 
> > The idea for this came from an earlier version done by Eric Dumazet.
> > Locking is done per-cpu, the fast path locks on the current cpu
> > and updates counters.  The slow case involves acquiring the locks on
> > all cpu's.
> > 
> > The mutex that was added for 2.6.30 in xt_table is unnecessary since
> > there already is a mutex for xt[af].mutex that is held.
> > 
> > Tested basic functionality (add/remove/list), but don't have test cases
> > for stress, ip6tables or arptables.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> Patch seems good to me, but apparently xt_replace_table()
> misses the "acquiring the locks on all cpus" you mentioned in the ChangeLog?

It happens in get_counters already.

> I am still off-computers until tomorrow so cannot provide a patch for this, sorry.
> 
> Some form of
> 
> local_bh_disable();
> for_each_possible_cpu(cpu)
> 	spin_lock(&per_cpu(ip_tables_lock, cpu));
> 
> oldinfo = private;
> /* do the substitution */
> table->private = newinfo;
> newinfo->initial_entries = oldinfo->initial_entries;
> 
> for_each_possible_cpu(cpu)
> 	spin_unlock(&per_cpu(ip_tables_lock, cpu));
> local_bh_enable();
> 
> 
> But I wonder if this could hit a limit of max spinlocks held by this cpu, say on a 4096 cpu machine ?
> 
> 
> 

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 16:53                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
  2009-04-13 17:40                         ` Eric Dumazet
@ 2009-04-13 19:06                       ` Martin Josefsson
  2009-04-13 19:17                         ` Linus Torvalds
  2009-04-13 22:24                       ` Andrew Morton
  2009-04-14 12:27                       ` Patrick McHardy
  3 siblings, 1 reply; 254+ messages in thread
From: Martin Josefsson @ 2009-04-13 19:06 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, paulus, mingo, torvalds, laijs,
	jeff.chua.linux, dada1, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Mon, 13 Apr 2009, Stephen Hemminger wrote:

> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
>
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.

Doesn't spin_lock() result in a pipeline flush on x86?

iirc there was a benchmark in an RCU paper that tested using per cpu 
spin_locks and the result was that it didn't scale well at all.

/Martin

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 19:06                       ` Martin Josefsson
@ 2009-04-13 19:17                         ` Linus Torvalds
  0 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-13 19:17 UTC (permalink / raw)
  To: Martin Josefsson
  Cc: Stephen Hemminger, paulmck, David Miller, paulus, mingo, laijs,
	jeff.chua.linux, dada1, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh



On Mon, 13 Apr 2009, Martin Josefsson wrote:
> 
> Doesn't spin_lock() result in a pipeline flush on x86?

It's about 20-40 cycles when cached and uncontended, with some outliers 
(50+ cycles on P4, 12 cycles on some AMD opterons).

So it's not a big deal if you actually hit that case.

> iirc there was a benchmark in an RCU paper that tested using per cpu
> spin_locks and the result was that it didn't scale well at all.

Spinlocks scale wonderfully well if you only touch them on one CPU.

Of course, if you truly only touch them on one CPU they are pointless, but 
a "all normal code only touches the local CPU spinlock, the really odd 
cases take all locks" approach works fine. It makes the uncommon case 
really quite slow, but if it truly is uncommon, that's fine.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 16:53                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
  2009-04-13 17:40                         ` Eric Dumazet
  2009-04-13 19:06                       ` Martin Josefsson
@ 2009-04-13 22:24                       ` Andrew Morton
  2009-04-13 23:20                         ` Stephen Hemminger
  2009-04-14 12:27                       ` Patrick McHardy
  3 siblings, 1 reply; 254+ messages in thread
From: Andrew Morton @ 2009-04-13 22:24 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, davem, paulus, mingo, torvalds, laijs, jeff.chua.linux,
	dada1, jengelh, kaber, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Mon, 13 Apr 2009 09:53:09 -0700
Stephen Hemminger <shemminger@vyatta.com> wrote:

> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Tested basic functionality (add/remove/list), but don't have test cases
> for stress, ip6tables or arptables.
> 
>  unsigned int
>  ipt_do_table(struct sk_buff *skb,
> @@ -339,9 +341,10 @@ ipt_do_table(struct sk_buff *skb,
>  
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>  
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	local_bh_disable();
> +	spin_lock(&__get_cpu_var(ip_tables_lock));

spin_lock_bh()?

> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -436,8 +439,8 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> -	rcu_read_unlock_bh();
> +	spin_unlock(&__get_cpu_var(ip_tables_lock));
> +	local_bh_enable();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -891,86 +894,34 @@ get_counters(const struct xt_table_info 
>  	     struct xt_counters counters[])
>  {
>  	unsigned int cpu;
> -	unsigned int i;
> -	unsigned int curcpu;
> +	unsigned int i = 0;
> +	unsigned int curcpu = raw_smp_processor_id();
>  
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
>  	 * We dont care about preemption here.
>  	 */
> -	curcpu = raw_smp_processor_id();
> -
> -	i = 0;
> +	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
>  			  t->size,
>  			  set_entry_to_counter,
>  			  counters,
>  			  &i);
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
> +
>  		i = 0;
> +		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -	}

This would be racy against cpu hotplug if this code was hotplug-aware.

And it should be hotplug aware, really.  num_possible_cpus() can exceed
num_online_cpus().  The extent by which possible>online is
controversial, but one can conceive of situations where it is "lots".

Is lib/percpu_counter.c no good for this application?  Unfixably no
good?  That code automagically handles cpu hotplug.



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 22:24                       ` Andrew Morton
@ 2009-04-13 23:20                         ` Stephen Hemminger
  2009-04-13 23:26                           ` Andrew Morton
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-13 23:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: paulmck, davem, paulus, mingo, torvalds, laijs, jeff.chua.linux,
	dada1, jengelh, kaber, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Mon, 13 Apr 2009 15:24:37 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 13 Apr 2009 09:53:09 -0700
> Stephen Hemminger <shemminger@vyatta.com> wrote:
> 
> > This is an alternative version of ip/ip6/arp tables locking using
> > per-cpu locks.  This avoids the overhead of synchronize_net() during
> > update but still removes the expensive rwlock in earlier versions.
> > 
> > The idea for this came from an earlier version done by Eric Dumazet.
> > Locking is done per-cpu, the fast path locks on the current cpu
> > and updates counters.  The slow case involves acquiring the locks on
> > all cpu's.
> > 
> > The mutex that was added for 2.6.30 in xt_table is unnecessary since
> > there already is a mutex for xt[af].mutex that is held.
> > 
> > Tested basic functionality (add/remove/list), but don't have test cases
> > for stress, ip6tables or arptables.
> > 
> >  unsigned int
> >  ipt_do_table(struct sk_buff *skb,
> > @@ -339,9 +341,10 @@ ipt_do_table(struct sk_buff *skb,
> >  
> >  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> >  
> > -	rcu_read_lock_bh();
> > -	private = rcu_dereference(table->private);
> > -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > +	local_bh_disable();
> > +	spin_lock(&__get_cpu_var(ip_tables_lock));
> 
> spin_lock_bh()?

No. get_cpu_var implies smp_processor_id which is not safe
without preempt_disable (ie bh disable).

> 
> > +	private = table->private;
> > +	table_base = private->entries[smp_processor_id()];
> >  
> >  	e = get_entry(table_base, private->hook_entry[hook]);
> >  
> > @@ -436,8 +439,8 @@ ipt_do_table(struct sk_buff *skb,
> >  			e = (void *)e + e->next_offset;
> >  		}
> >  	} while (!hotdrop);
> > -
> > -	rcu_read_unlock_bh();
> > +	spin_unlock(&__get_cpu_var(ip_tables_lock));
> > +	local_bh_enable();
> >  
> >  #ifdef DEBUG_ALLOW_ALL
> >  	return NF_ACCEPT;
> > @@ -891,86 +894,34 @@ get_counters(const struct xt_table_info 
> >  	     struct xt_counters counters[])
> >  {
> >  	unsigned int cpu;
> > -	unsigned int i;
> > -	unsigned int curcpu;
> > +	unsigned int i = 0;
> > +	unsigned int curcpu = raw_smp_processor_id();
> >  
> >  	/* Instead of clearing (by a previous call to memset())
> >  	 * the counters and using adds, we set the counters
> >  	 * with data used by 'current' CPU
> >  	 * We dont care about preemption here.
> >  	 */
> > -	curcpu = raw_smp_processor_id();
> > -
> > -	i = 0;
> > +	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
> >  	IPT_ENTRY_ITERATE(t->entries[curcpu],
> >  			  t->size,
> >  			  set_entry_to_counter,
> >  			  counters,
> >  			  &i);
> > +	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
> >  
> >  	for_each_possible_cpu(cpu) {
> >  		if (cpu == curcpu)
> >  			continue;
> > +
> >  		i = 0;
> > +		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
> >  		IPT_ENTRY_ITERATE(t->entries[cpu],
> >  				  t->size,
> >  				  add_entry_to_counter,
> >  				  counters,
> >  				  &i);
> > -	}
> 
> This would be racy against cpu hotplug if this code was hotplug-aware.
> 
> And it should be hotplug aware, really.  num_possible_cpus() can exceed
> num_online_cpus().  The extent by which possible>online is
> controversial, but one can conceive of situations where it is "lots".

It is doing the right thing already with hotplug.
This code still needs to count packets processed by a previously online
cpu that is no longer there.

> Is lib/percpu_counter.c no good for this application?  Unfixably no
> good?  That code automagically handles cpu hotplug.

percpu_counter can't deal with the layout/load here.
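
For reference, this is roughly what lib/percpu_counter.c per rule would
mean (sketch only, the struct and helpers here are invented): every
ipt_entry would carry two counter objects, each with its own internal
spinlock and dynamically allocated per-cpu storage, instead of the
single flat per-cpu copy of the ruleset that exists today.

#include <linux/percpu_counter.h>

struct rule_counters {
	struct percpu_counter packets;
	struct percpu_counter bytes;
};

static int rule_counters_init(struct rule_counters *rc)
{
	int err = percpu_counter_init(&rc->packets, 0);

	if (err)
		return err;
	err = percpu_counter_init(&rc->bytes, 0);
	if (err)
		percpu_counter_destroy(&rc->packets);
	return err;
}

/* packet fast path */
static void rule_counters_bump(struct rule_counters *rc, unsigned int len)
{
	percpu_counter_add(&rc->packets, 1);
	percpu_counter_add(&rc->bytes, len);
}

/* user-space snapshot */
static void rule_counters_read(struct rule_counters *rc, u64 *pkts, u64 *byte_cnt)
{
	*pkts = percpu_counter_sum(&rc->packets);
	*byte_cnt = percpu_counter_sum(&rc->bytes);
}

That is two locks and two per-cpu allocations per rule, which is
presumably the layout/load concern for tables with thousands of entries.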

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 23:20                         ` Stephen Hemminger
@ 2009-04-13 23:26                           ` Andrew Morton
  2009-04-13 23:37                             ` Linus Torvalds
  0 siblings, 1 reply; 254+ messages in thread
From: Andrew Morton @ 2009-04-13 23:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, davem, paulus, mingo, torvalds, laijs, jeff.chua.linux,
	dada1, jengelh, kaber, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Mon, 13 Apr 2009 16:20:00 -0700
Stephen Hemminger <shemminger@vyatta.com> wrote:

> On Mon, 13 Apr 2009 15:24:37 -0700
> Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > On Mon, 13 Apr 2009 09:53:09 -0700
> > Stephen Hemminger <shemminger@vyatta.com> wrote:
> > 
> > > This is an alternative version of ip/ip6/arp tables locking using
> > > per-cpu locks.  This avoids the overhead of synchronize_net() during
> > > update but still removes the expensive rwlock in earlier versions.
> > > 
> > > The idea for this came from an earlier version done by Eric Dumazet.
> > > Locking is done per-cpu, the fast path locks on the current cpu
> > > and updates counters.  The slow case involves acquiring the locks on
> > > all cpu's.
> > > 
> > > The mutex that was added for 2.6.30 in xt_table is unnecessary since
> > > there already is a mutex for xt[af].mutex that is held.
> > > 
> > > Tested basic functionality (add/remove/list), but don't have test cases
> > > for stress, ip6tables or arptables.
> > > 
> > >  unsigned int
> > >  ipt_do_table(struct sk_buff *skb,
> > > @@ -339,9 +341,10 @@ ipt_do_table(struct sk_buff *skb,
> > >  
> > >  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > >  
> > > -	rcu_read_lock_bh();
> > > -	private = rcu_dereference(table->private);
> > > -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > > +	local_bh_disable();
> > > +	spin_lock(&__get_cpu_var(ip_tables_lock));
> > 
> > spin_lock_bh()?
> 
> No. get_cpu_var implies smp_processor_id which is not safe
> without preempt_disable (ie bh disable).

spin_lock_bh() will dtrt, but spelling it out seems a good idea.

It should have an explanatory comment, IMO.

> > 
> > And it should be hotplug aware, really.  num_possible_cpus() can exceed
> > num_online_cpus().  The extent by which possible>online is
> > controversial, but one can conceive of situations where it is "lots".
> 
> It is doing right thing already with hotplug.

It's slow.

> This code still needs to count packets processed by previously online
> cpu, that is no longer there.

Those counts could be migrated off that CPU when it is offlined.  As
percpu_counter does.

> > Is lib/percpu_counter.c no good for this application?  Unfixably no
> > good?  That code automagically handles cpu hotplug.
> 
> percpu_counter can't deal with the layout/load here.

Insufficient detail here for anyone to understand why percpu_counter
cannot be adapted to this requirement.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 23:26                           ` Andrew Morton
@ 2009-04-13 23:37                             ` Linus Torvalds
  2009-04-13 23:52                               ` Ingo Molnar
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-13 23:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Stephen Hemminger, paulmck, davem, paulus, mingo, laijs,
	jeff.chua.linux, dada1, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh



On Mon, 13 Apr 2009, Andrew Morton wrote:
> > > >  
> > > > -	rcu_read_lock_bh();
> > > > -	private = rcu_dereference(table->private);
> > > > -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > > > +	local_bh_disable();
> > > > +	spin_lock(&__get_cpu_var(ip_tables_lock));
> > > 
> > > spin_lock_bh()?
> > 
> > No. get_cpu_var implies smp_processor_id which is not safe
> > without preempt_disable (ie bh disable).
> 
> spin_lock_bh() will dtrt, but spelling it out seems a good idea.

No, spin_lock_bh() will _not_ do the right thing. 

On UP it will actually work for two reasons: it will work because (a) it's 
UP, so there are no issues with smp_processor_id() to begin with, but 
also because even if there _were_ issues, it would still work because it 
would all expand as a macro, and the preempt_disable() will actually 
happen before the argument is evaluated.

But on SMP, spin_lock_bh() expands to just _spin_lock_bh(), and is a real 
function - and the argument will be evaluated before the call (obviously), 
and thus before the preempt_disable().

So

	local_bh_disable();
	spin_lock(&__get_cpu_var(ip_tables_lock));

is correct, and 

	spin_lock_bh(&__get_cpu_var(ip_tables_lock));

is _not_ correct. The latter will do "&__get_cpu_var(ip_tables_lock)"
with no protection from the process being switched to another CPU.
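
In other words, simplified (this shows the effect, not the literal
kernel source):

	/* spin_lock_bh(&__get_cpu_var(ip_tables_lock)) on SMP is in effect: */
	spinlock_t *lock = &__get_cpu_var(ip_tables_lock);	/* smp_processor_id() runs
								   here, still preemptible */
	_spin_lock_bh(lock);					/* bh/preempt off only in here */

	/* the open-coded form pins the cpu before the per-cpu lookup: */
	local_bh_disable();
	spin_lock(&__get_cpu_var(ip_tables_lock));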

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 23:37                             ` Linus Torvalds
@ 2009-04-13 23:52                               ` Ingo Molnar
  0 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-13 23:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Stephen Hemminger, paulmck, davem, paulus, laijs,
	jeff.chua.linux, dada1, jengelh, kaber, r000n, linux-kernel,
	netfilter-devel, netdev, benh


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Mon, 13 Apr 2009, Andrew Morton wrote:
> > > > >  
> > > > > -	rcu_read_lock_bh();
> > > > > -	private = rcu_dereference(table->private);
> > > > > -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > > > > +	local_bh_disable();
> > > > > +	spin_lock(&__get_cpu_var(ip_tables_lock));
> > > > 
> > > > spin_lock_bh()?
> > > 
> > > No. get_cpu_var implies smp_processor_id which is not safe
> > > without preempt_disable (ie bh disable).
> > 
> > spin_lock_bh() will dtrt, but spelling it out seems a good idea.
> 
> No, spin_lock_bh() will _not_ do the right thing. 
> 
> On UP it will actually work for two reasons: it will work because (a) it's 
> UP, so there are no issues with smp_processor_id() to begin with, but 
> also because even if there _were_ issues, it would still work because it 
> would all expand as a macro, and the preempt_disable() will actually 
> happen before the argument is evaluated.
> 
> But on SMP, spin_lock_bh() expands to just _spin_lock_bh(), and is 
> a real function - and the argument will be evaluated before the 
> call (obviously), and thus before the preempt_disable().
> 
> So
> 
> 	local_bh_disable();
> 	spin_lock(&__get_cpu_var(ip_tables_lock));
> 
> is correct, and 
> 
> 	spin_lock_bh(&__get_cpu_var(ip_tables_lock));
> 
> is _not_ correct. The latter will do 
> "&__get_cpu_var(ip_tables_lock)" with no protection from the 
> process being switched to another CPU.

One option would be to make it more robust for such use. The 
downside would be all the other cases where the expression is really 
constant (but still takes a few instructions to calculate) and could 
be (and should be) evaluated outside of that critical section.

So i'd tend to leave it in its current (optimistic) form: we've got 
1283 uses of spin_lock_bh(), and just a quick look at git grep 
suggests that the current optimistic optimization matters in 
practice.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-13 16:53                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
                                         ` (2 preceding siblings ...)
  2009-04-13 22:24                       ` Andrew Morton
@ 2009-04-14 12:27                       ` Patrick McHardy
  2009-04-14 14:23                         ` Eric Dumazet
  3 siblings, 1 reply; 254+ messages in thread
From: Patrick McHardy @ 2009-04-14 12:27 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, paulus, mingo, torvalds, laijs,
	jeff.chua.linux, dada1, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Stephen Hemminger wrote:
> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Tested basic functionality (add/remove/list), but don't have test cases
> for stress, ip6tables or arptables.

Thanks Stephen, I'll do some testing with ip6tables.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-14 12:27                       ` Patrick McHardy
@ 2009-04-14 14:23                         ` Eric Dumazet
  2009-04-14 14:45                           ` Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: Eric Dumazet @ 2009-04-14 14:23 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jeff.chua.linux, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Patrick McHardy a écrit :
> Stephen Hemminger wrote:
>> This is an alternative version of ip/ip6/arp tables locking using
>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>> update but still removes the expensive rwlock in earlier versions.
>>
>> The idea for this came from an earlier version done by Eric Dumazet.
>> Locking is done per-cpu, the fast path locks on the current cpu
>> and updates counters.  The slow case involves acquiring the locks on
>> all cpu's.
>>
>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
>> there already is a mutex for xt[af].mutex that is held.
>>
>> Tested basic functionality (add/remove/list), but don't have test cases
>> for stress, ip6tables or arptables.
> 
> Thanks Stephen, I'll do some testing with ip6tables.

Here is the patch I cooked on top of Stephen's to get proper locking.

In the "iptables -L" case, we freeze updates on all cpus to get previous
RCU behavior (not sure it is mandatory, but anyway...)

And xt_replace_table() uses the same logic to make sure a cpu won't try to parse rules
and update counters while a writer is replacing tables (and thus calling
vfree() and unmapping in-use pages).

Feel free to merge this patch into Stephen's before upstream submission.

Thank you

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/netfilter/x_tables.h |    5 ++
 net/ipv4/netfilter/arp_tables.c    |   20 +++------
 net/ipv4/netfilter/ip_tables.c     |   24 ++++-------
 net/ipv6/netfilter/ip6_tables.c    |   24 ++++-------
 net/netfilter/x_tables.c           |   55 ++++++++++++++++++++++++++-
 5 files changed, 84 insertions(+), 44 deletions(-)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1ff1a76..a5840a4 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -426,6 +426,11 @@ extern struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 					   const char *name);
 extern void xt_table_unlock(struct xt_table *t);
 
+extern void xt_tlock_lockall(void);
+extern void xt_tlock_unlockall(void);
+extern void xt_tlock_lock(void);
+extern void xt_tlock_unlock(void);
+
 extern int xt_proto_init(struct net *net, u_int8_t af);
 extern void xt_proto_fini(struct net *net, u_int8_t af);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c60cc11..b561e1e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -231,8 +231,6 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
 	return (struct arpt_entry *)(base + offset);
 }
 
-static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
-
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -256,7 +254,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	outdev = out ? out->name : nulldevname;
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -331,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 	if (hotdrop)
@@ -709,33 +707,31 @@ static void get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
-	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
-		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1181,14 +1177,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 	/* Choose the copy that is on our node */
 	local_bh_disable();
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
  unlock_up_free:
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cb3b779..81d173e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -297,7 +297,6 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
-static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
 
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
@@ -342,7 +341,7 @@ ipt_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -439,7 +438,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -895,34 +894,32 @@ get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
-	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 
 		i = 0;
-		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -1393,14 +1390,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	local_bh_disable();
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
  unlock_up_free:
@@ -2220,10 +2217,7 @@ static struct pernet_operations ip_tables_net_ops = {
 
 static int __init ip_tables_init(void)
 {
-	int cpu, ret;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
+	int ret;
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ac46ca4..d6ba69e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,7 +329,6 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
-static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
 
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
@@ -368,7 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -469,7 +468,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -925,33 +924,31 @@ get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
-	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
-		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1423,14 +1420,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 	local_bh_disable();
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
  unlock_up_free:
@@ -2248,10 +2245,7 @@ static struct pernet_operations ip6_tables_net_ops = {
 
 static int __init ip6_tables_init(void)
 {
-	int cpu, ret;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
+	int ret;
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0d94020..f2ad79f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -670,19 +670,21 @@ xt_replace_table(struct xt_table *table,
 {
 	struct xt_table_info *oldinfo, *private;
 
+	xt_tlock_lockall();
 	/* Do the substitution. */
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
+		xt_tlock_unlockall();
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
 	table->private = newinfo;
 	newinfo->initial_entries = oldinfo->initial_entries;
-
+	xt_tlock_unlockall();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1126,9 +1128,58 @@ static struct pernet_operations xt_net_ops = {
 	.init = xt_net_init,
 };
 
+static DEFINE_PER_CPU(spinlock_t, xt_tables_lock);
+
+void xt_tlock_lockall(void)
+{
+	int cpu;
+
+	local_bh_disable();
+	preempt_disable();
+	for_each_possible_cpu(cpu) {
+		spin_lock(&per_cpu(xt_tables_lock, cpu));
+		/*
+		 * avoid preempt counter overflow
+		 */
+		preempt_enable_no_resched();
+	}
+}
+EXPORT_SYMBOL(xt_tlock_lockall);
+
+void xt_tlock_unlockall(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		preempt_disable();
+		spin_unlock(&per_cpu(xt_tables_lock, cpu));
+	}
+	preempt_enable();
+	local_bh_enable();
+}
+EXPORT_SYMBOL(xt_tlock_unlockall);
+
+/*
+ * preemption should be disabled by caller
+ */
+void xt_tlock_lock(void)
+{
+	spin_lock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_lock);
+
+void xt_tlock_unlock(void)
+{
+	spin_unlock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_unlock);
+
 static int __init xt_init(void)
 {
-	int i, rv;
+	int i, rv, cpu;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(xt_tables_lock, cpu));
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)



^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-14 14:23                         ` Eric Dumazet
@ 2009-04-14 14:45                           ` Stephen Hemminger
  2009-04-14 15:49                               ` Eric Dumazet
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-14 14:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, paulmck, David Miller, paulus, mingo, torvalds,
	laijs, jeff.chua.linux, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Tue, 14 Apr 2009 16:23:33 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Patrick McHardy a écrit :
> > Stephen Hemminger wrote:
> >> This is an alternative version of ip/ip6/arp tables locking using
> >> per-cpu locks.  This avoids the overhead of synchronize_net() during
> >> update but still removes the expensive rwlock in earlier versions.
> >>
> >> The idea for this came from an earlier version done by Eric Dumazet.
> >> Locking is done per-cpu, the fast path locks on the current cpu
> >> and updates counters.  The slow case involves acquiring the locks on
> >> all cpu's.
> >>
> >> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> >> there already is a mutex for xt[af].mutex that is held.
> >>
> >> Tested basic functionality (add/remove/list), but don't have test cases
> >> for stress, ip6tables or arptables.
> > 
> > Thanks Stephen, I'll do some testing with ip6tables.
> 
> Here is the patch I cooked on top of Stephen's to get proper locking.

I see no demonstrated problem with locking in my version.
The reader/writer race is already handled. On replace the race of

CPU 0                          CPU 1
                           lock (iptables(1))
                           refer to oldinfo
swap in new info
foreach CPU
   lock iptables(i)
   (spin)                  unlock(iptables(1))
   read oldinfo
   unlock
...

The point is that my locking works; you just seem to feel more comfortable with
a global "stop all CPUs" solution.

> In the "iptables -L" case, we freeze updates on all cpus to get previous
> RCU behavior (not sure it is mandatory, but anyway...)

No, it isn't. Because the code in get_counters will fetch all CPU's.

> And xt_replace_table() uses the same logic to make sure a cpu won't try to parse rules
> and update counters while a writer is replacing tables (and thus calling
> vfree() and unmapping in-use pages).

With RCU-type semantics this isn't an issue. A CPU always sees consistent
state, and its counters never get lost.

> Feel free to merge this patch into Stephen's before upstream submission.
> 
> Thank you
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
>  include/linux/netfilter/x_tables.h |    5 ++
>  net/ipv4/netfilter/arp_tables.c    |   20 +++------
>  net/ipv4/netfilter/ip_tables.c     |   24 ++++-------
>  net/ipv6/netfilter/ip6_tables.c    |   24 ++++-------
>  net/netfilter/x_tables.c           |   55 ++++++++++++++++++++++++++-
>  5 files changed, 84 insertions(+), 44 deletions(-)
> diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
> index 1ff1a76..a5840a4 100644
> --- a/include/linux/netfilter/x_tables.h
> +++ b/include/linux/netfilter/x_tables.h
> @@ -426,6 +426,11 @@ extern struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
>  					   const char *name);
>  extern void xt_table_unlock(struct xt_table *t);
>  
> +extern void xt_tlock_lockall(void);
> +extern void xt_tlock_unlockall(void);
> +extern void xt_tlock_lock(void);
> +extern void xt_tlock_unlock(void);
> +
>  extern int xt_proto_init(struct net *net, u_int8_t af);
>  extern void xt_proto_fini(struct net *net, u_int8_t af);
>  
> diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
> index c60cc11..b561e1e 100644
> --- a/net/ipv4/netfilter/arp_tables.c
> +++ b/net/ipv4/netfilter/arp_tables.c
> @@ -231,8 +231,6 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
>  	return (struct arpt_entry *)(base + offset);
>  }
>  
> -static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
> -
>  unsigned int arpt_do_table(struct sk_buff *skb,
>  			   unsigned int hook,
>  			   const struct net_device *in,
> @@ -256,7 +254,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  	outdev = out ? out->name : nulldevname;
>  
>  	local_bh_disable();
> -	spin_lock(&__get_cpu_var(arp_tables_lock));
> +	xt_tlock_lock();
>  	private = table->private;
>  	table_base = private->entries[smp_processor_id()];
>  
> @@ -331,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -	spin_unlock(&__get_cpu_var(arp_tables_lock));
> +	xt_tlock_unlock();
>  	local_bh_enable();
>  
>  	if (hotdrop)
> @@ -709,33 +707,31 @@ static void get_counters(const struct xt_table_info *t,
>  {
>  	unsigned int cpu;
>  	unsigned int i = 0;
> -	unsigned int curcpu = raw_smp_processor_id();
> +	unsigned int curcpu;
>  
> +	xt_tlock_lockall();
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
>  	 */
> -	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
> +	curcpu = smp_processor_id();
>  	ARPT_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> -	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> -		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
>  		ARPT_ENTRY_ITERATE(t->entries[cpu],
>  				   t->size,
>  				   add_entry_to_counter,
>  				   counters,
>  				   &i);
> -		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
>  	}
> +	xt_tlock_unlockall();
>  }
>  
>  static struct xt_counters *alloc_counters(struct xt_table *table)
> @@ -1181,14 +1177,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  	/* Choose the copy that is on our node */
>  	local_bh_disable();
>  	curcpu = smp_processor_id();
> -	spin_lock(&__get_cpu_var(arp_tables_lock));
> +	xt_tlock_lock();
>  	loc_cpu_entry = private->entries[curcpu];
>  	ARPT_ENTRY_ITERATE(loc_cpu_entry,
>  			   private->size,
>  			   add_counter_to_entry,
>  			   paddc,
>  			   &i);
> -	spin_unlock(&__get_cpu_var(arp_tables_lock));
> +	xt_tlock_unlock();
>  	local_bh_enable();
>   unlock_up_free:
>  
> diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
> index cb3b779..81d173e 100644
> --- a/net/ipv4/netfilter/ip_tables.c
> +++ b/net/ipv4/netfilter/ip_tables.c
> @@ -297,7 +297,6 @@ static void trace_packet(struct sk_buff *skb,
>  }
>  #endif
>  
> -static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
>  
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
> @@ -342,7 +341,7 @@ ipt_do_table(struct sk_buff *skb,
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>  
>  	local_bh_disable();
> -	spin_lock(&__get_cpu_var(ip_tables_lock));
> +	xt_tlock_lock();
>  	private = table->private;
>  	table_base = private->entries[smp_processor_id()];
>  
> @@ -439,7 +438,7 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -	spin_unlock(&__get_cpu_var(ip_tables_lock));
> +	xt_tlock_unlock();
>  	local_bh_enable();
>  
>  #ifdef DEBUG_ALLOW_ALL
> @@ -895,34 +894,32 @@ get_counters(const struct xt_table_info *t,
>  {
>  	unsigned int cpu;
>  	unsigned int i = 0;
> -	unsigned int curcpu = raw_smp_processor_id();
> +	unsigned int curcpu;
>  
> +	xt_tlock_lockall();
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
>  	 */
> -	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
> +	curcpu = smp_processor_id();
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
>  			  t->size,
>  			  set_entry_to_counter,
>  			  counters,
>  			  &i);
> -	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  
>  		i = 0;
> -		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
>  	}
> +	xt_tlock_unlockall();
>  }
>  
>  static struct xt_counters * alloc_counters(struct xt_table *table)
> @@ -1393,14 +1390,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
>  	local_bh_disable();
>  	/* Choose the copy that is on our node */
>  	curcpu = smp_processor_id();
> -	spin_lock(&__get_cpu_var(ip_tables_lock));
> +	xt_tlock_lock();
>  	loc_cpu_entry = private->entries[curcpu];
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	spin_unlock(&__get_cpu_var(ip_tables_lock));
> +	xt_tlock_unlock();
>  	local_bh_enable();
>  
>   unlock_up_free:
> @@ -2220,10 +2217,7 @@ static struct pernet_operations ip_tables_net_ops = {
>  
>  static int __init ip_tables_init(void)
>  {
> -	int cpu, ret;
> -
> -	for_each_possible_cpu(cpu)
> -		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
> +	int ret;
>  
>  	ret = register_pernet_subsys(&ip_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
> index ac46ca4..d6ba69e 100644
> --- a/net/ipv6/netfilter/ip6_tables.c
> +++ b/net/ipv6/netfilter/ip6_tables.c
> @@ -329,7 +329,6 @@ static void trace_packet(struct sk_buff *skb,
>  }
>  #endif
>  
> -static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
>  
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
> @@ -368,7 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>  
>  	local_bh_disable();
> -	spin_lock(&__get_cpu_var(ip6_tables_lock));
> +	xt_tlock_lock();
>  	private = table->private;
>  	table_base = private->entries[smp_processor_id()];
>  
> @@ -469,7 +468,7 @@ ip6t_do_table(struct sk_buff *skb,
>  #ifdef CONFIG_NETFILTER_DEBUG
>  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
>  #endif
> -	spin_unlock(&__get_cpu_var(ip6_tables_lock));
> +	xt_tlock_unlock();
>  	local_bh_enable();
>  
>  #ifdef DEBUG_ALLOW_ALL
> @@ -925,33 +924,31 @@ get_counters(const struct xt_table_info *t,
>  {
>  	unsigned int cpu;
>  	unsigned int i = 0;
> -	unsigned int curcpu = raw_smp_processor_id();
> +	unsigned int curcpu;
>  
> +	xt_tlock_lockall();
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
>  	 */
> -	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
> +	curcpu = smp_processor_id();
>  	IP6T_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> -	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> -		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
>  		IP6T_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
>  	}
> +	xt_tlock_unlockall();
>  }
>  
>  static struct xt_counters *alloc_counters(struct xt_table *table)
> @@ -1423,14 +1420,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
>  	local_bh_disable();
>  	/* Choose the copy that is on our node */
>  	curcpu = smp_processor_id();
> -	spin_lock(&__get_cpu_var(ip6_tables_lock));
> +	xt_tlock_lock();
>  	loc_cpu_entry = private->entries[curcpu];
>  	IP6T_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	spin_unlock(&__get_cpu_var(ip6_tables_lock));
> +	xt_tlock_unlock();
>  	local_bh_enable();
>  
>   unlock_up_free:
> @@ -2248,10 +2245,7 @@ static struct pernet_operations ip6_tables_net_ops = {
>  
>  static int __init ip6_tables_init(void)
>  {
> -	int cpu, ret;
> -
> -	for_each_possible_cpu(cpu)
> -		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
> +	int ret;
>  
>  	ret = register_pernet_subsys(&ip6_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
> index 0d94020..f2ad79f 100644
> --- a/net/netfilter/x_tables.c
> +++ b/net/netfilter/x_tables.c
> @@ -670,19 +670,21 @@ xt_replace_table(struct xt_table *table,
>  {
>  	struct xt_table_info *oldinfo, *private;
>  
> +	xt_tlock_lockall();
>  	/* Do the substitution. */
>  	private = table->private;
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> +		xt_tlock_unlockall();
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
>  	oldinfo = private;
>  	table->private = newinfo;
>  	newinfo->initial_entries = oldinfo->initial_entries;
> -
> +	xt_tlock_unlockall();
>  	return oldinfo;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
> @@ -1126,9 +1128,58 @@ static struct pernet_operations xt_net_ops = {
>  	.init = xt_net_init,
>  };
>  
> +static DEFINE_PER_CPU(spinlock_t, xt_tables_lock);
> +
> +void xt_tlock_lockall(void)
> +{
> +	int cpu;
> +
> +	local_bh_disable();
> +	preempt_disable();
> +	for_each_possible_cpu(cpu) {
> +		spin_lock(&per_cpu(xt_tables_lock, cpu));
> +		/*
> +		 * avoid preempt counter overflow
> +		 */
> +		preempt_enable_no_resched();
> +	}
> +}
> +EXPORT_SYMBOL(xt_tlock_lockall);
> +
> +void xt_tlock_unlockall(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		preempt_disable();
> +		spin_unlock(&per_cpu(xt_tables_lock, cpu));
> +	}
> +	preempt_enable();
> +	local_bh_enable();
> +}
> +EXPORT_SYMBOL(xt_tlock_unlockall);
> +
> +/*
> + * preemption should be disabled by caller
> + */
> +void xt_tlock_lock(void)
> +{
> +	spin_lock(&__get_cpu_var(xt_tables_lock));
> +}
> +EXPORT_SYMBOL(xt_tlock_lock);
> +
> +void xt_tlock_unlock(void)
> +{
> +	spin_unlock(&__get_cpu_var(xt_tables_lock));
> +}
> +EXPORT_SYMBOL(xt_tlock_unlock);
> +
>  static int __init xt_init(void)
>  {
> -	int i, rv;
> +	int i, rv, cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(xt_tables_lock, cpu));
>  
>  	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
>  	if (!xt)
> 
> 

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-14 14:45                           ` Stephen Hemminger
@ 2009-04-14 15:49                               ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-14 15:49 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Patrick McHardy, paulmck, David Miller, paulus, mingo, torvalds,
	laijs, jeff.chua.linux, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Stephen Hemminger a écrit :
> On Tue, 14 Apr 2009 16:23:33 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
>> Patrick McHardy a écrit :
>>> Stephen Hemminger wrote:
>>>> This is an alternative version of ip/ip6/arp tables locking using
>>>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>>>> update but still removes the expensive rwlock in earlier versions.
>>>>
>>>> The idea for this came from an earlier version done by Eric Dumazet.
>>>> Locking is done per-cpu, the fast path locks on the current cpu
>>>> and updates counters.  The slow case involves acquiring the locks on
>>>> all cpu's.
>>>>
>>>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
>>>> there already is a mutex for xt[af].mutex that is held.
>>>>
>>>> Tested basic functionality (add/remove/list), but don't have test cases
>>>> for stress, ip6tables or arptables.
>>> Thanks Stephen, I'll do some testing with ip6tables.
>> Here is the patch I cooked on top of Stephen's to get proper locking.
> 
> I see no demonstrated problem with locking in my version.

Yes, I did not crash any machine around here; should we wait for a bug report? :)

> The reader/writer race is already handled. On replace the race of
> 
> CPU 0                          CPU 1
>                            lock (iptables(1))
>                            refer to oldinfo
> swap in new info
> foreach CPU
>    lock iptables(i)
>    (spin)                  unlock(iptables(1))
>    read oldinfo
>    unlock
> ...
> 
> The point is that my locking works; you just seem to feel more comfortable with
> a global "stop all CPUs" solution.

Oh right, I missed that xt_replace_table() was *followed* by a get_counters()
call, but I am pretty sure something is needed in xt_replace_table().

A memory barrier at least (smp_wmb())

As soon as we do "table->private = newinfo;", other cpus might fetch incorrect
values for newinfo->fields.

In the past, we had a write_lock_bh()/write_unlock_bh() pair that was
doing this for us.
Then we had rcu_assign_pointer(), which also implied this memory barrier.

Even if the vmalloc() calls we do before calling xt_replace_table() probably
already force barriers, add one for reference, just in case we change the callers'
logic to call kmalloc() instead of vmalloc() or whatever...
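
For illustration, a minimal sketch of the publication ordering being asked for; publish_newinfo() is a hypothetical helper, not the final patch, and the barrier pairs with the dependent load of table->private on the packet path:

#include <linux/netfilter/x_tables.h>
#include <asm/system.h>		/* smp_wmb() */

static void publish_newinfo(struct xt_table *table,
			    struct xt_table_info *newinfo)
{
	/* all newinfo fields must be fully initialised by this point */
	smp_wmb();			/* order those stores ...            */
	table->private = newinfo;	/* ... before the pointer switch, so */
					/* a cpu that loads the new pointer  */
					/* also sees initialised contents    */
}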

> 
>> In the "iptables -L" case, we freeze updates on all cpus to get previous
>> RCU behavior (not sure it is mandatory, but anyway...)
> 
> No, it isn't. Because the code in get_counters will fetch all CPU's.

Prior to the RCU conversion, we had an rwlock.

Doing a write_lock_bh() on it while reading counters (iptables -L)
*did* stop all cpus from doing their read_lock_bh() and counter updates.

After RCU and your last patch, an "iptables -L" locks each table one by one.

This is correct, since a cpu won't update its table while we are fetching it,
but we lost the previous "rwlock freeze all" behavior, and some apps/users could
complain about it; this is why I said "not sure it is mandatory"...

Here is an updated patch on top of yours, with the smp_wmb() in xt_replace_table():

Thank you

 include/linux/netfilter/x_tables.h |    5 ++
 net/ipv4/netfilter/arp_tables.c    |   20 +++------
 net/ipv4/netfilter/ip_tables.c     |   24 ++++-------
 net/ipv6/netfilter/ip6_tables.c    |   24 ++++-------
 net/netfilter/x_tables.c           |   57 ++++++++++++++++++++++++++-
 5 files changed, 86 insertions(+), 44 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1ff1a76..a5840a4 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -426,6 +426,11 @@ extern struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 					   const char *name);
 extern void xt_table_unlock(struct xt_table *t);
 
+extern void xt_tlock_lockall(void);
+extern void xt_tlock_unlockall(void);
+extern void xt_tlock_lock(void);
+extern void xt_tlock_unlock(void);
+
 extern int xt_proto_init(struct net *net, u_int8_t af);
 extern void xt_proto_fini(struct net *net, u_int8_t af);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c60cc11..b561e1e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -231,8 +231,6 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
 	return (struct arpt_entry *)(base + offset);
 }
 
-static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
-
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -256,7 +254,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	outdev = out ? out->name : nulldevname;
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -331,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 	if (hotdrop)
@@ -709,33 +707,31 @@ static void get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
-	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
-		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1181,14 +1177,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 	/* Choose the copy that is on our node */
 	local_bh_disable();
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
  unlock_up_free:
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cb3b779..81d173e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -297,7 +297,6 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
-static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
 
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
@@ -342,7 +341,7 @@ ipt_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -439,7 +438,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -895,34 +894,32 @@ get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
-	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 
 		i = 0;
-		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -1393,14 +1390,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	local_bh_disable();
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
  unlock_up_free:
@@ -2220,10 +2217,7 @@ static struct pernet_operations ip_tables_net_ops = {
 
 static int __init ip_tables_init(void)
 {
-	int cpu, ret;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
+	int ret;
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ac46ca4..d6ba69e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,7 +329,6 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
-static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
 
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
@@ -368,7 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -469,7 +468,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -925,33 +924,31 @@ get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
-	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
-		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1423,14 +1420,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 	local_bh_disable();
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
  unlock_up_free:
@@ -2248,10 +2245,7 @@ static struct pernet_operations ip6_tables_net_ops = {
 
 static int __init ip6_tables_init(void)
 {
-	int cpu, ret;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
+	int ret;
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0d94020..3cf19bf 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -680,9 +680,13 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 	oldinfo = private;
+	/*
+	 * make sure all newinfo fields are committed to memory before changing
+	 * table->private, since other cpus have no synchronization with us.
+	 */
+	smp_wmb();
 	table->private = newinfo;
 	newinfo->initial_entries = oldinfo->initial_entries;
-
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1126,9 +1130,58 @@ static struct pernet_operations xt_net_ops = {
 	.init = xt_net_init,
 };
 
+static DEFINE_PER_CPU(spinlock_t, xt_tables_lock);
+
+void xt_tlock_lockall(void)
+{
+	int cpu;
+
+	local_bh_disable();
+	preempt_disable();
+	for_each_possible_cpu(cpu) {
+		spin_lock(&per_cpu(xt_tables_lock, cpu));
+		/*
+		 * avoid preempt counter overflow
+		 */
+		preempt_enable_no_resched();
+	}
+}
+EXPORT_SYMBOL(xt_tlock_lockall);
+
+void xt_tlock_unlockall(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		preempt_disable();
+		spin_unlock(&per_cpu(xt_tables_lock, cpu));
+	}
+	preempt_enable();
+	local_bh_enable();
+}
+EXPORT_SYMBOL(xt_tlock_unlockall);
+
+/*
+ * preemption should be disabled by caller
+ */
+void xt_tlock_lock(void)
+{
+	spin_lock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_lock);
+
+void xt_tlock_unlock(void)
+{
+	spin_unlock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_unlock);
+
 static int __init xt_init(void)
 {
-	int i, rv;
+	int i, rv, cpu;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(xt_tables_lock, cpu));
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)



^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
@ 2009-04-14 15:49                               ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-14 15:49 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Patrick McHardy, paulmck, David Miller, paulus, mingo, torvalds,
	laijs, jeff.chua.linux, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Stephen Hemminger a écrit :
> On Tue, 14 Apr 2009 16:23:33 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
>> Patrick McHardy a écrit :
>>> Stephen Hemminger wrote:
>>>> This is an alternative version of ip/ip6/arp tables locking using
>>>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>>>> update but still removes the expensive rwlock in earlier versions.
>>>>
>>>> The idea for this came from an earlier version done by Eric Duzamet.
>>>> Locking is done per-cpu, the fast path locks on the current cpu
>>>> and updates counters.  The slow case involves acquiring the locks on
>>>> all cpu's.
>>>>
>>>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
>>>> there already is a mutex for xt[af].mutex that is held.
>>>>
>>>> Tested basic functionality (add/remove/list), but don't have test cases
>>>> for stress, ip6tables or arptables.
>>> Thanks Stephen, I'll do some testing with ip6tables.
>> Here is the patch I cooked on top of Stephen one to get proper locking.
> 
> I see no demonstrated problem with locking in my version.

Yes, I did not crash any machine around there, should we wait for a bug report ? :)

> The reader/writer race is already handled. On replace the race of
> 
> CPU 0                          CPU 1
>                            lock (iptables(1))
>                            refer to oldinfo
> swap in new info
> foreach CPU
>    lock iptables(i)
>    (spin)                  unlock(iptables(1))
>    read oldinfo
>    unlock
> ...
> 
> The point is my locking works, you just seem to feel more comfortable with
> a global "stop all CPU's" solution.

Oh right, I missed that xt_replace_table() was *followed* by a get_counters()
call, but I am pretty sure something is needed in xt_replace_table().

A memory barrier at least (smp_wmb())

As soon as we do "table->private = newinfo;", other cpus might fetch incorrect
values for newinfo->fields.

In the past, we had a write_lock_bh()/write_unlock_bh() pair that was
doing this for us.
Then we had rcu_assign_pointer() that also had this memory barrier implied.

Even if vmalloc() calls we do before calling xt_replace_table() probably
already force barriers, add one for reference, just in case we change callers
logic to call kmalloc() instead of vmalloc() or whatever...

> 
>> In the "iptables -L" case, we freeze updates on all cpus to get previous
>> RCU behavior (not sure it is mandatory, but anyway...)
> 
> No, it isn't. Because the code in get_counters will fetch all CPU's.

Previous to RCU conversion, we had a rwlock.

Doing a write_lock_bh() on it while reading counters (iptables -L)
*did* stop all cpus from doing their read_lock_bh() and counters updates.

After RCU and your last patch, an "iptables -L" locks each table one by one.

This is correct, since a cpu wont update its table while we are fetching it,
but we lost previous "rwlock freeze all" behavior, and some apps/users could
complain about it, this is why I said "not sure it is mandatory"...

Here is an updated patch ontop of yours, with the smp_wmb() in xt_replace_table() :

Thank you

 include/linux/netfilter/x_tables.h |    5 ++
 net/ipv4/netfilter/arp_tables.c    |   20 +++------
 net/ipv4/netfilter/ip_tables.c     |   24 ++++-------
 net/ipv6/netfilter/ip6_tables.c    |   24 ++++-------
 net/netfilter/x_tables.c           |   57 ++++++++++++++++++++++++++-
 5 files changed, 86 insertions(+), 44 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1ff1a76..a5840a4 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -426,6 +426,11 @@ extern struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 					   const char *name);
 extern void xt_table_unlock(struct xt_table *t);
 
+extern void xt_tlock_lockall(void);
+extern void xt_tlock_unlockall(void);
+extern void xt_tlock_lock(void);
+extern void xt_tlock_unlock(void);
+
 extern int xt_proto_init(struct net *net, u_int8_t af);
 extern void xt_proto_fini(struct net *net, u_int8_t af);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c60cc11..b561e1e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -231,8 +231,6 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
 	return (struct arpt_entry *)(base + offset);
 }
 
-static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
-
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -256,7 +254,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	outdev = out ? out->name : nulldevname;
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -331,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 	if (hotdrop)
@@ -709,33 +707,31 @@ static void get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
-	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
-		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1181,14 +1177,14 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 	/* Choose the copy that is on our node */
 	local_bh_disable();
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
  unlock_up_free:
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cb3b779..81d173e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -297,7 +297,6 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
-static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
 
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
@@ -342,7 +341,7 @@ ipt_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -439,7 +438,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -895,34 +894,32 @@ get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
-	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 
 		i = 0;
-		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -1393,14 +1390,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	local_bh_disable();
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
  unlock_up_free:
@@ -2220,10 +2217,7 @@ static struct pernet_operations ip_tables_net_ops = {
 
 static int __init ip_tables_init(void)
 {
-	int cpu, ret;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
+	int ret;
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ac46ca4..d6ba69e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,7 +329,6 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
-static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
 
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
@@ -368,7 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
 	local_bh_disable();
-	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_lock();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -469,7 +468,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -925,33 +924,31 @@ get_counters(const struct xt_table_info *t,
 {
 	unsigned int cpu;
 	unsigned int i = 0;
-	unsigned int curcpu = raw_smp_processor_id();
+	unsigned int curcpu;
 
+	xt_tlock_lockall();
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
+	curcpu = smp_processor_id();
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
-	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
-		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
+	xt_tlock_unlockall();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
@@ -1423,14 +1420,14 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 	local_bh_disable();
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_lock();
 	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	xt_tlock_unlock();
 	local_bh_enable();
 
  unlock_up_free:
@@ -2248,10 +2245,7 @@ static struct pernet_operations ip6_tables_net_ops = {
 
 static int __init ip6_tables_init(void)
 {
-	int cpu, ret;
-
-	for_each_possible_cpu(cpu)
-		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
+	int ret;
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0d94020..3cf19bf 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -680,9 +680,13 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 	oldinfo = private;
+	/*
+	 * make sure all newinfo fields are committed to memory before changing
+	 * table->private, since other cpus have no synchronization with us.
+	 */
+	smp_wmb();
 	table->private = newinfo;
 	newinfo->initial_entries = oldinfo->initial_entries;
-
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1126,9 +1130,58 @@ static struct pernet_operations xt_net_ops = {
 	.init = xt_net_init,
 };
 
+static DEFINE_PER_CPU(spinlock_t, xt_tables_lock);
+
+void xt_tlock_lockall(void)
+{
+	int cpu;
+
+	local_bh_disable();
+	preempt_disable();
+	for_each_possible_cpu(cpu) {
+		spin_lock(&per_cpu(xt_tables_lock, cpu));
+		/*
+		 * avoid preempt counter overflow
+		 */
+		preempt_enable_no_resched();
+	}
+}
+EXPORT_SYMBOL(xt_tlock_lockall);
+
+void xt_tlock_unlockall(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		preempt_disable();
+		spin_unlock(&per_cpu(xt_tables_lock, cpu));
+	}
+	preempt_enable();
+	local_bh_enable();
+}
+EXPORT_SYMBOL(xt_tlock_unlockall);
+
+/*
+ * preemption should be disabled by caller
+ */
+void xt_tlock_lock(void)
+{
+	spin_lock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_lock);
+
+void xt_tlock_unlock(void)
+{
+	spin_unlock(&__get_cpu_var(xt_tables_lock));
+}
+EXPORT_SYMBOL(xt_tlock_unlock);
+
 static int __init xt_init(void)
 {
-	int i, rv;
+	int i, rv, cpu;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(xt_tables_lock, cpu));
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
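
For readers skimming the thread, the scheme above boils down to: every
possible CPU owns one spinlock, the packet fast path takes only the local
CPU's lock, and the control path (counter reads, table replacement) takes
all of them via xt_tlock_lockall(), with preempt_enable_no_resched() inside
the loop only so the preempt count does not grow with NR_CPUS.  Below is a
rough user-space sketch of the same pattern with threads standing in for
CPUs; NCPUS, cpu_lock, cpu_count and the function names are invented for
illustration and are not kernel APIs.

/* Per-"cpu" spinlocks: fast path locks its own, slow path locks all.
 * Build with: gcc -O2 -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

static pthread_spinlock_t cpu_lock[NCPUS];
static unsigned long long cpu_count[NCPUS];

/* Fast path: only the local lock is taken, so the common case causes
 * no cross-CPU cache-line bouncing. */
static void count_packet(int cpu)
{
	pthread_spin_lock(&cpu_lock[cpu]);
	cpu_count[cpu]++;
	pthread_spin_unlock(&cpu_lock[cpu]);
}

/* Slow path: take every lock so no counter can change underneath us,
 * the rough equivalent of xt_tlock_lockall() around get_counters(). */
static unsigned long long snapshot_counters(void)
{
	unsigned long long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		pthread_spin_lock(&cpu_lock[cpu]);
	for (cpu = 0; cpu < NCPUS; cpu++)
		sum += cpu_count[cpu];
	for (cpu = 0; cpu < NCPUS; cpu++)
		pthread_spin_unlock(&cpu_lock[cpu]);
	return sum;
}

static void *worker(void *arg)
{
	int cpu = (int)(long)arg;
	int i;

	for (i = 0; i < 100000; i++)
		count_packet(cpu);
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		pthread_spin_init(&cpu_lock[cpu], PTHREAD_PROCESS_PRIVATE);
	for (cpu = 0; cpu < NCPUS; cpu++)
		pthread_create(&tid[cpu], NULL, worker, (void *)(long)cpu);
	for (cpu = 0; cpu < NCPUS; cpu++)
		pthread_join(tid[cpu], NULL);
	printf("total: %llu\n", snapshot_counters());
	return 0;
}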



^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-14 15:49                               ` Eric Dumazet
  (?)
@ 2009-04-14 16:51                               ` Jeff Chua
  2009-04-14 18:17                                 ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v2) Stephen Hemminger
  -1 siblings, 1 reply; 254+ messages in thread
From: Jeff Chua @ 2009-04-14 16:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Patrick McHardy, paulmck, David Miller,
	paulus, mingo, torvalds, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Tue, Apr 14, 2009 at 11:49 PM, Eric Dumazet <dada1@cosmosbay.com> wrote:
> Stephen Hemminger a écrit :
>> On Tue, 14 Apr 2009 16:23:33 +0200
>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>
>>> Patrick McHardy a écrit :
>>>> Stephen Hemminger wrote:
>>>>> This is an alternative version of ip/ip6/arp tables locking using
>>>>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>>>>> update but still removes the expensive rwlock in earlier versions.

Tested. Loaded as fast as 2.6.29.

> Here is an updated patch ontop of yours, with the smp_wmb() in xt_replace_table() :

Tested as well. Loaded as fast as 2.6.29.

Thanks,
Jeff.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU
  2009-04-14 15:49                               ` Eric Dumazet
  (?)
  (?)
@ 2009-04-14 17:19                               ` Stephen Hemminger
  -1 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-14 17:19 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, paulmck, David Miller, paulus, mingo, torvalds,
	laijs, jeff.chua.linux, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Tue, 14 Apr 2009 17:49:57 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger a écrit :
> > On Tue, 14 Apr 2009 16:23:33 +0200
> > Eric Dumazet <dada1@cosmosbay.com> wrote:
> > 
> >> Patrick McHardy a écrit :
> >>> Stephen Hemminger wrote:
> >>>> This is an alternative version of ip/ip6/arp tables locking using
> >>>> per-cpu locks.  This avoids the overhead of synchronize_net() during
> >>>> update but still removes the expensive rwlock in earlier versions.
> >>>>
> >>>> The idea for this came from an earlier version done by Eric Duzamet.
> >>>> Locking is done per-cpu, the fast path locks on the current cpu
> >>>> and updates counters.  The slow case involves acquiring the locks on
> >>>> all cpu's.
> >>>>
> >>>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> >>>> there already is a mutex for xt[af].mutex that is held.
> >>>>
> >>>> Tested basic functionality (add/remove/list), but don't have test cases
> >>>> for stress, ip6tables or arptables.
> >>> Thanks Stephen, I'll do some testing with ip6tables.
> >> Here is the patch I cooked on top of Stephen one to get proper locking.
> > 
> > I see no demonstrated problem with locking in my version.
> 
> Yes, I did not crash any machine around there, should we wait for a bug report ? :)
> 
> > The reader/writer race is already handled. On replace the race of
> > 
> > CPU 0                          CPU 1
> >                            lock (iptables(1))
> >                            refer to oldinfo
> > swap in new info
> > foreach CPU
> >    lock iptables(i)
> >    (spin)                  unlock(iptables(1))
> >    read oldinfo
> >    unlock
> > ...
> > 
> > The point is my locking works, you just seem to feel more comfortable with
> > a global "stop all CPU's" solution.
> 
> Oh right, I missed that xt_replace_table() was *followed* by a get_counters()
> call, but I am pretty sure something is needed in xt_replace_table().
> 
> A memory barrier at least (smp_wmb())
> 
> As soon as we do "table->private = newinfo;", other cpus might fetch incorrect
> values for newinfo->fields.
> 
> In the past, we had a write_lock_bh()/write_unlock_bh() pair that was
> doing this for us.
> Then we had rcu_assign_pointer() that also had this memory barrier implied.
> 
> Even if vmalloc() calls we do before calling xt_replace_table() probably
> already force barriers, add one for reference, just in case we change callers
> logic to call kmalloc() instead of vmalloc() or whatever...
>

You are right, doing something with a barrier would be safer there.
How about using xchg?

@@ -682,26 +668,19 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
-
-	synchronize_net();
-	return oldinfo;
+	newinfo->initial_entries = private->initial_entries;
+	return xchg(&table->private, newinfo);
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
P.S.: did we all miss the ordering bug in the RCU version?
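
To spell out the ordering requirement being discussed: every store that
initializes newinfo must be ordered before the store that makes
table->private point at it, otherwise a CPU that already sees the new
pointer can still read stale fields through it.  In the kernel that
ordering comes from an smp_wmb() before the plain pointer store, and
xchg() works as well because value-returning atomic operations imply a
full memory barrier.  Here is a minimal user-space sketch of the same
publication pattern using C11 release/acquire atomics; it illustrates
the rule only and is not the iptables code (the struct fields and values
are invented for the example).

/* Publish-after-init ordering, user-space sketch (C11 atomics). */
#include <stdatomic.h>
#include <stdio.h>

struct table_info {
	unsigned int number;		/* stand-ins for the newinfo fields */
	unsigned int initial_entries;
};

static _Atomic(struct table_info *) table_private;

/* Writer: newinfo is fully initialized by the caller; the release store
 * orders those earlier writes before the pointer becomes visible. */
static void publish(struct table_info *newinfo)
{
	atomic_store_explicit(&table_private, newinfo, memory_order_release);
}

/* Reader: the acquire load pairs with the release store, so the fields
 * read through the pointer are the initialized values. */
static const struct table_info *current_table(void)
{
	return atomic_load_explicit(&table_private, memory_order_acquire);
}

int main(void)
{
	static struct table_info info = { .number = 3, .initial_entries = 3 };

	publish(&info);
	printf("number=%u\n", current_table()->number);
	return 0;
}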


^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu spinlock rather than RCU (v2)
  2009-04-14 16:51                               ` Jeff Chua
@ 2009-04-14 18:17                                 ` Stephen Hemminger
  2009-04-14 19:28                                   ` Eric Dumazet
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-14 18:17 UTC (permalink / raw)
  To: Jeff Chua
  Cc: Eric Dumazet, Patrick McHardy, paulmck, David Miller, paulus,
	mingo, torvalds, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Subject: iptables: 

This is an alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive rwlock in earlier versions.

The idea for this came from an earlier version done by Eric Duzamet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  The slow case involves acquiring the locks on
all cpu's.

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |    5 -
 net/ipv4/netfilter/arp_tables.c    |  112 +++++++++------------------------
 net/ipv4/netfilter/ip_tables.c     |  123 +++++++++++--------------------------
 net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++------------------------
 net/netfilter/x_tables.c           |   28 --------
 5 files changed, 110 insertions(+), 277 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-14 10:13:59.932292529 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-14 10:14:04.121043331 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-14 10:13:59.923167357 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-14 11:13:51.066830533 -0700
@@ -297,6 +297,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -339,9 +341,13 @@ ipt_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	/* Not spin_lock_bh() because get_cpu_var
+	 *  needs to happen after preempt is disabled
+	 */
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +442,8 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -902,75 +908,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
 }
 
@@ -979,7 +935,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -990,26 +945,10 @@ static struct xt_counters * alloc_counte
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1377,6 +1316,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1388,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip_tables_lock));
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2224,10 @@ static struct pernet_operations ip_table
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
--- a/net/netfilter/x_tables.c	2009-04-14 10:13:59.901168540 -0700
+++ b/net/netfilter/x_tables.c	2009-04-14 10:14:04.123043143 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -682,26 +668,19 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
-
-	synchronize_net();
-	return oldinfo;
+	newinfo->initial_entries = private->initial_entries;
+	return xchg(&table->private, newinfo);
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +713,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-14 10:13:59.909167832 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-14 11:15:15.405141464 -0700
@@ -329,6 +329,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -365,9 +367,10 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +469,8 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -931,73 +935,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
 }
 
@@ -1006,7 +962,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1017,26 +972,9 @@ static struct xt_counters *alloc_counter
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
-
+	get_counters(private, counters);
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1405,6 +1343,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1416,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2250,10 @@ static struct pernet_operations ip6_tabl
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-14 10:13:59.915167635 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-14 11:14:36.203830871 -0700
@@ -231,6 +231,8 @@ static inline struct arpt_entry *get_ent
 	return (struct arpt_entry *)(base + offset);
 }
 
+static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -253,9 +255,10 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(arp_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -328,8 +331,8 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	local_bh_enable();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -716,74 +719,25 @@ static void get_counters(const struct xt
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
 }
 
@@ -792,7 +746,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -804,26 +757,10 @@ static struct xt_counters *alloc_counter
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1165,6 +1102,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,25 +1174,25 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(arp_tables_lock));
 	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	local_bh_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v2)
  2009-04-14 18:17                                 ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v2) Stephen Hemminger
@ 2009-04-14 19:28                                   ` Eric Dumazet
  2009-04-14 21:11                                     ` Stephen Hemminger
  2009-04-14 21:13                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Stephen Hemminger
  0 siblings, 2 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-14 19:28 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jeff Chua, Patrick McHardy, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Stephen Hemminger a écrit :
> Subject: iptables: 
> 
> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Duzamet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
>  include/linux/netfilter/x_tables.h |    5 -
>  net/ipv4/netfilter/arp_tables.c    |  112 +++++++++------------------------
>  net/ipv4/netfilter/ip_tables.c     |  123 +++++++++++--------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++------------------------
>  net/netfilter/x_tables.c           |   28 --------
>  5 files changed, 110 insertions(+), 277 deletions(-)

Oh well, it seems factorization of this stuff is not what you want, so
I'll stop arguing.

Please check the spelling of my name in the ChangeLog, and more importantly:
initialize arp_tables_lock, which is missing in v1/v2

	for_each_possible_cpu(cpu)
		spin_lock_init(&per_cpu(arp_tables_lock, cpu));

Then please add my:

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Thanks



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v2)
  2009-04-14 19:28                                   ` Eric Dumazet
@ 2009-04-14 21:11                                     ` Stephen Hemminger
  2009-04-14 21:13                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Stephen Hemminger
  1 sibling, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-14 21:11 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jeff Chua, Patrick McHardy, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

> Oh well, it seems factorization of this stuff is not what you want, so
> I'll stop arguing.

That's no fun.  I didn't mind the refactoring, but was concerned
that it moved the locking out of line in the fast path. 

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-14 19:28                                   ` Eric Dumazet
  2009-04-14 21:11                                     ` Stephen Hemminger
@ 2009-04-14 21:13                                     ` Stephen Hemminger
  2009-04-14 21:40                                         ` Eric Dumazet
  2009-04-15  3:23                                       ` David Miller
  1 sibling, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-14 21:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jeff Chua, Patrick McHardy, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

This is an alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive rwlock in earlier versions.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  The slow case involves acquiring the locks on
all cpu's.

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |    5 -
 net/ipv4/netfilter/arp_tables.c    |  112 +++++++++------------------------
 net/ipv4/netfilter/ip_tables.c     |  123 +++++++++++--------------------------
 net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++------------------------
 net/netfilter/x_tables.c           |   28 --------
 5 files changed, 110 insertions(+), 277 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-14 10:13:59.932292529 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-14 10:14:04.121043331 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-14 10:13:59.923167357 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-14 11:13:51.066830533 -0700
@@ -297,6 +297,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -339,9 +341,13 @@ ipt_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	/* Not spin_lock_bh() because get_cpu_var
+	 *  needs to happen after preempt is disabled
+	 */
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +442,8 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -902,75 +908,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
 }
 
@@ -979,7 +935,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -990,26 +945,10 @@ static struct xt_counters * alloc_counte
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1377,6 +1316,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1388,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip_tables_lock));
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2224,10 @@ static struct pernet_operations ip_table
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
--- a/net/netfilter/x_tables.c	2009-04-14 10:13:59.901168540 -0700
+++ b/net/netfilter/x_tables.c	2009-04-14 10:14:04.123043143 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -682,26 +668,19 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
-
-	synchronize_net();
-	return oldinfo;
+	newinfo->initial_entries = private->initial_entries;
+	return xchg(&table->private, newinfo);
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +713,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-14 10:13:59.909167832 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-14 11:15:15.405141464 -0700
@@ -329,6 +329,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -365,9 +367,10 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +469,8 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -931,73 +935,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
 }
 
@@ -1006,7 +962,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1017,26 +972,9 @@ static struct xt_counters *alloc_counter
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
-
+	get_counters(private, counters);
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1405,6 +1343,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1416,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2250,10 @@ static struct pernet_operations ip6_tabl
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-14 10:13:59.915167635 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-14 14:12:30.169955949 -0700
@@ -231,6 +231,8 @@ static inline struct arpt_entry *get_ent
 	return (struct arpt_entry *)(base + offset);
 }
 
+static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -253,9 +255,10 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(arp_tables_lock));
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -328,8 +331,8 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	local_bh_enable();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -716,74 +719,25 @@ static void get_counters(const struct xt
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
 }
 
@@ -792,7 +746,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -804,26 +757,10 @@ static struct xt_counters *alloc_counter
 	if (counters == NULL)
 		goto nomem;
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
 
- free_counters:
-	vfree(counters);
  nomem:
 	return ERR_PTR(-ENOMEM);
 }
@@ -1165,6 +1102,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,25 +1174,25 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
+	local_bh_disable();
+	spin_lock(&__get_cpu_var(arp_tables_lock));
 	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(arp_tables_lock));
+	local_bh_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);
@@ -1923,7 +1873,10 @@ static struct pernet_operations arp_tabl
 
 static int __init arp_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(arp_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&arp_tables_net_ops);
 	if (ret < 0)
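
One difference from the lock-all variant earlier in the thread shows up in
the get_counters() hunks above: this version takes one CPU's lock at a
time, so counters keep advancing on the other CPUs while each one is read;
no counts are lost, but the total is not sampled at a single instant.  In
terms of the user-space sketch posted earlier in the thread, that variant
would look like the fragment below (it reuses the hypothetical NCPUS,
cpu_lock[] and cpu_count[] declarations from that sketch).

/* Sum the per-"cpu" counters one lock at a time, as in the v3
 * get_counters(): consistent per-CPU values, not a global snapshot. */
static unsigned long long snapshot_counters_relaxed(void)
{
	unsigned long long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		pthread_spin_lock(&cpu_lock[cpu]);
		sum += cpu_count[cpu];
		pthread_spin_unlock(&cpu_lock[cpu]);
	}
	return sum;
}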

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-14 21:13                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Stephen Hemminger
@ 2009-04-14 21:40                                         ` Eric Dumazet
  2009-04-15  3:23                                       ` David Miller
  1 sibling, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-14 21:40 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jeff Chua, Patrick McHardy, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Stephen Hemminger a écrit :
> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
>  include/linux/netfilter/x_tables.h |    5 -
>  net/ipv4/netfilter/arp_tables.c    |  112 +++++++++------------------------
>  net/ipv4/netfilter/ip_tables.c     |  123 +++++++++++--------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++------------------------
>  net/netfilter/x_tables.c           |   28 --------
>  5 files changed, 110 insertions(+), 277 deletions(-)
> 

Tested successfully on my dev machine, thanks Stephen.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


"strace -tt -T iptables -L" on this 8 cpu machine show quite fast operations now
(sub 1/HZ) 

23:37:26.629489 getsockopt(3, SOL_IP, 0x40 /* IP_??? */, "filter\0\300\\\236@\365\271\231\"\300`\225T\367\1\0\0\0\0vm\300`\225T\367\
16"..., [84]) = 0 <0.000008>
23:37:26.629579 brk(0)                  = 0x8054000 <0.000006>
23:37:26.629613 brk(0x8075000)          = 0x8075000 <0.000007>
23:37:26.629660 getsockopt(3, SOL_IP, 0x41 /* IP_??? */, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0L"..., [10608])
= 0 <0.000031>
23:37:26.629772 fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0 <0.000007>

The same fast operation is seen for "iptables -A INPUT":


23:37:02.180313 close(4)                = 0 <0.000006>
23:37:02.180382 setsockopt(3, SOL_IP, 0x40 /* IP_??? */, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\16"..., 10664)
= 0 <0.000114>
23:37:02.180552 setsockopt(3, SOL_IP, 0x41 /* IP_??? */, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\20"..., 292) =
0 <0.000015>
23:37:02.180635 close(3)                = 0 <0.000010>


* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-14 21:13                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Stephen Hemminger
  2009-04-14 21:40                                         ` Eric Dumazet
@ 2009-04-15  3:23                                       ` David Miller
  1 sibling, 0 replies; 254+ messages in thread
From: David Miller @ 2009-04-15  3:23 UTC (permalink / raw)
  To: shemminger
  Cc: dada1, jeff.chua.linux, kaber, paulmck, paulus, mingo, torvalds,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 14 Apr 2009 14:13:51 -0700

> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Things seem to be winding down, good. :-)

I'll let Patrick McHardy merge this to me with his other pending
netfilter fixes.

Thanks!


* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-14 21:40                                         ` Eric Dumazet
@ 2009-04-15 10:59                                           ` Patrick McHardy
  -1 siblings, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-15 10:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Jeff Chua, paulmck, David Miller, paulus,
	mingo, torvalds, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Eric Dumazet wrote:
> Stephen Hemminger wrote:
>> This is an alternative version of ip/ip6/arp tables locking using
>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>> update but still removes the expensive rwlock in earlier versions.
>>
>> The idea for this came from an earlier version done by Eric Dumazet.
>> Locking is done per-cpu, the fast path locks on the current cpu
>> and updates counters.  The slow case involves acquiring the locks on
>> all cpu's.
>>
>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
>> there already is a mutex for xt[af].mutex that is held.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>> ---
>>  include/linux/netfilter/x_tables.h |    5 -
>>  net/ipv4/netfilter/arp_tables.c    |  112 +++++++++------------------------
>>  net/ipv4/netfilter/ip_tables.c     |  123 +++++++++++--------------------------
>>  net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++------------------------
>>  net/netfilter/x_tables.c           |   28 --------
>>  5 files changed, 110 insertions(+), 277 deletions(-)
>>
> 
> Tested successfully on my dev machine, thanks Stephen.
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Applied, thanks everyone. I'll give it some testing myself and
will send it upstream tonight.


* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 10:59                                           ` Patrick McHardy
@ 2009-04-15 16:31                                             ` Stephen Hemminger
  -1 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-15 16:31 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, Jeff Chua, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Wed, 15 Apr 2009 12:59:03 +0200
Patrick McHardy <kaber@trash.net> wrote:

> Eric Dumazet wrote:
> > Stephen Hemminger wrote:
> >> This is an alternative version of ip/ip6/arp tables locking using
> >> per-cpu locks.  This avoids the overhead of synchronize_net() during
> >> update but still removes the expensive rwlock in earlier versions.
> >>
> >> The idea for this came from an earlier version done by Eric Dumazet.
> >> Locking is done per-cpu, the fast path locks on the current cpu
> >> and updates counters.  The slow case involves acquiring the locks on
> >> all cpu's.
> >>
> >> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> >> there already is a mutex for xt[af].mutex that is held.
> >>
> >> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >>
> >> ---
> >>  include/linux/netfilter/x_tables.h |    5 -
> >>  net/ipv4/netfilter/arp_tables.c    |  112 +++++++++------------------------
> >>  net/ipv4/netfilter/ip_tables.c     |  123 +++++++++++--------------------------
> >>  net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++------------------------
> >>  net/netfilter/x_tables.c           |   28 --------
> >>  5 files changed, 110 insertions(+), 277 deletions(-)
> >>
> > 
> > Tested successfully on my dev machine, thanks Stephen.
> > 
> > Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> 
> Applied, thanks everyone. I'll give it some testing myself and
> will send it upstream tonight.

I am running it with LOCKDEP now to check for any issues.
It also needs to be validated with an SMP-configured kernel running on UP.


* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 10:59                                           ` Patrick McHardy
  (?)
  (?)
@ 2009-04-15 20:55                                           ` Stephen Hemminger
  2009-04-15 21:07                                             ` Eric Dumazet
  -1 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-15 20:55 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, Jeff Chua, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Looks like there is some recursive path into ip_tables that makes the
per-cpu spinlock break.  I get lockups with KVM networking.

Suggestions?
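
The traces below appear to show ipt_do_table() being re-entered on the same
CPU: the first traversal holds this cpu's spinlock when the REJECT target
builds an ICMP reply, and that reply comes straight back through the netfilter
hooks into ipt_do_table(), which then spins on the lock it already holds.
A minimal user-space sketch of that self-deadlock (the pthread spinlock and
function names are illustrative stand-ins, not the kernel's):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t cpu_lock;	/* stand-in for one per-cpu table lock */

/* Stand-in for the ICMP reject path re-entering the table on the same cpu. */
static void reject_with_icmp(void)
{
	pthread_spin_lock(&cpu_lock);	/* lock already held: spins forever */
	pthread_spin_unlock(&cpu_lock);
}

/* Stand-in for one table traversal that ends in a REJECT verdict. */
static void traverse_table(void)
{
	pthread_spin_lock(&cpu_lock);
	reject_with_icmp();		/* REJECT -> ICMP -> hooks -> table again */
	pthread_spin_unlock(&cpu_lock);	/* never reached */
}

int main(void)
{
	pthread_spin_init(&cpu_lock, PTHREAD_PROCESS_PRIVATE);
	puts("entering table");
	traverse_table();		/* never returns */
	puts("done");			/* unreachable */
	return 0;
}

Because the nested traversal runs synchronously in the same context, nothing
can ever release the lock, which matches the delay_tsc()/_raw_spin_lock()
loops in the traces.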

[ 1931.788787] virbr3: port 1(vnet2) entering forwarding state
[ 2106.068500] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6
[ 2106.068501] Call Trace:
[ 2106.068503]  <IRQ>  [<ffffffff80393a09>] ? _raw_spin_lock+0xdd/0x105
[ 2106.068513]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2106.068516]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2106.068521]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2106.068525]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2106.068528]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2106.068536]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2106.068540]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2106.068544]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2106.068547]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2106.068550]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2106.068552]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2106.068555]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2106.068557]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2106.068559]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2106.068562]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2106.068566]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2106.068569]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2106.068571]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2106.068575]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2106.068578]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2106.068581]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2106.068584]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2106.068587]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2106.068589]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2106.068591]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2106.068594]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2106.068596]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2106.068598]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2106.068601]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2106.068604]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2106.068610]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2106.068616]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2106.068619]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2106.068625]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2106.068631]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2106.068633]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2106.068639]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2106.068641]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2106.068647]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2106.068652]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2106.068655]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2106.068657]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2106.068660]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2106.068663]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2106.068665]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2106.068668]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2106.068671]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2106.068672]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2106.068676]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2106.068680]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2106.068684]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2106.068687]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2106.068691]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2106.068701]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2106.068704]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2106.068706]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2106.068709]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2106.068711]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2106.068714]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2171.106240] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2171.106283] CPU 6:
[ 2171.106284] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2171.106322] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2171.106324] RIP: 0010:[<ffffffff8039039f>]  [<ffffffff8039039f>] delay_tsc+0x26/0x57
[ 2171.106330] RSP: 0018:ffff8800281a34f8  EFLAGS: 00000202
[ 2171.106332] RAX: 00000547850e39df RBX: 0000000000000006 RCX: 00000000850e39df
[ 2171.106333] RDX: 00000000850e39df RSI: 0000000000000001 RDI: 0000000000000001
[ 2171.106335] RBP: ffffffff8020bbd3 R08: ffffffff850e39a4 R09: 0000000000013a4b
[ 2171.106337] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2171.106339] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2171.106341] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2171.106343] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2171.106344] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2171.106346] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2171.106348] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2171.106349] Call Trace:
[ 2171.106350]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2171.106356]  [<ffffffff803903cf>] ? delay_tsc+0x56/0x57
[ 2171.106359]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2171.106364]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2171.106367]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2171.106372]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2171.106376]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2171.106379]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2171.106387]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2171.106391]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2171.106395]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2171.106398]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2171.106401]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2171.106403]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2171.106405]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2171.106408]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2171.106410]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2171.106412]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2171.106416]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2171.106418]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2171.106421]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2171.106424]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2171.106428]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2171.106431]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2171.106434]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2171.106436]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2171.106439]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2171.106441]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2171.106443]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2171.106445]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2171.106448]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2171.106451]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2171.106453]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2171.106460]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2171.106466]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2171.106469]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2171.106474]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2171.106480]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2171.106483]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2171.106488]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2171.106491]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2171.106496]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2171.106502]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2171.106505]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2171.106507]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2171.106510]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2171.106512]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2171.106515]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2171.106518]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2171.106521]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2171.106522]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2171.106526]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2171.106530]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2171.106533]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2171.106536]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2171.106539]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2171.106549]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2171.106552]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2171.106554]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2171.106556]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2171.106559]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2171.106563]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2236.604439] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2236.604494] CPU 6:
[ 2236.604496] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2236.604543] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2236.604545] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 2236.604551] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 2236.604552] RAX: 000000004ae54875 RBX: 0000000000000006 RCX: 000000004ae54852
[ 2236.604554] RDX: 0000000000000570 RSI: 0000000000000001 RDI: 0000000000000001
[ 2236.604556] RBP: ffffffff8020bbd3 R08: 000000004ae54837 R09: 0000000000013a4b
[ 2236.604558] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2236.604561] R13: 0000000000000006 R14: 000001fc89170ac9 R15: ffffffff8021d470
[ 2236.604563] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2236.604566] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2236.604568] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2236.604570] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2236.604573] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2236.604575] Call Trace:
[ 2236.604577]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2236.604586]  [<ffffffff80390382>] ? delay_tsc+0x9/0x57
[ 2236.604590]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2236.604595]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2236.604598]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2236.604603]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2236.604607]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2236.604610]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2236.604618]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2236.604622]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2236.604626]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2236.604629]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2236.604632]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2236.604634]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2236.604636]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2236.604639]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2236.604641]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2236.604643]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2236.604647]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2236.604650]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2236.604653]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2236.604656]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2236.604660]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2236.604663]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2236.604666]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2236.604668]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2236.604670]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2236.604672]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2236.604675]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2236.604677]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2236.604679]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2236.604682]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2236.604685]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2236.604691]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2236.604697]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2236.604700]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2236.604706]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2236.604711]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2236.604714]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2236.604719]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2236.604722]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2236.604727]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2236.604733]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2236.604736]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2236.604738]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2236.604741]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2236.604743]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2236.604746]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2236.604748]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2236.604751]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2236.604752]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2236.604756]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2236.604760]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2236.604763]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2236.604766]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2236.604769]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2236.604779]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2236.604782]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2236.604784]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2236.604787]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2236.604789]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2236.604792]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2302.102637] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2302.102680] CPU 6:
[ 2302.102681] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2302.102719] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2302.102721] RIP: 0010:[<ffffffff80210d95>]  [<ffffffff80210d95>] native_read_tsc+0x0/0x11
[ 2302.102727] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000206
[ 2302.102728] RAX: 0000059910bc579f RBX: 0000000000000006 RCX: 0000000010bc579f
[ 2302.102730] RDX: 0000000010bc579f RSI: 0000000000000001 RDI: 0000000000000001
[ 2302.102732] RBP: ffffffff8020bbd3 R08: 0000000010bc5776 R09: 0000000000013a4b
[ 2302.102733] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2302.102735] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2302.102737] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2302.102739] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2302.102741] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2302.102742] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2302.102744] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2302.102746] Call Trace:
[ 2302.102747]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2302.102754]  [<ffffffff803903dc>] ? __delay+0x0/0xa
[ 2302.102757]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2302.102762]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2302.102765]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2302.102769]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2302.102773]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2302.102776]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2302.102784]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2302.102788]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2302.102791]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2302.102794]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2302.102797]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2302.102799]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2302.102801]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2302.102804]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2302.102806]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2302.102808]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2302.102812]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2302.102814]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2302.102817]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2302.102820]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2302.102824]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2302.102827]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2302.102830]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2302.102832]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2302.102835]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2302.102837]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2302.102839]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2302.102841]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2302.102844]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2302.102846]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2302.102849]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2302.102855]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2302.102862]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2302.102864]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2302.102870]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2302.102876]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2302.102878]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2302.102884]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2302.102886]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2302.102892]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2302.102897]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2302.102900]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2302.102903]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2302.102905]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2302.102908]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2302.102910]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2302.102913]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2302.102915]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2302.102917]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2302.102921]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2302.102924]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2302.102928]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2302.102930]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2302.102934]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2302.102944]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2302.102946]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2302.102948]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2302.102951]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2302.102953]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2302.102956]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2367.600835] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2367.600879] CPU 6:
[ 2367.600880] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2367.600918] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2367.600920] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 2367.600925] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 2367.600927] RAX: 00000000d69365df RBX: 0000000000000006 RCX: 00000000d69365bc
[ 2367.600929] RDX: 00000000000005c1 RSI: 0000000000000001 RDI: 0000000000000001
[ 2367.600930] RBP: ffffffff8020bbd3 R08: ffffffffd69365a4 R09: 0000000000013a4b
[ 2367.600932] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2367.600934] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2367.600936] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2367.600938] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2367.600939] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2367.600941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2367.600943] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2367.600944] Call Trace:
[ 2367.600946]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2367.600952]  [<ffffffff80390380>] ? delay_tsc+0x7/0x57
[ 2367.600956]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2367.600961]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2367.600964]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2367.600969]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2367.600973]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2367.600976]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2367.600984]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2367.600987]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2367.600991]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2367.600994]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2367.600997]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2367.600999]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2367.601001]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2367.601004]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2367.601006]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2367.601009]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2367.601012]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2367.601015]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2367.601017]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2367.601021]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2367.601024]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2367.601027]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2367.601030]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2367.601032]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2367.601035]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2367.601037]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2367.601039]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2367.601041]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2367.601044]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2367.601047]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2367.601049]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2367.601056]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2367.601062]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2367.601065]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2367.601070]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2367.601076]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2367.601079]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2367.601084]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2367.601087]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2367.601092]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2367.601098]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2367.601100]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2367.601103]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2367.601106]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2367.601108]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2367.601111]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2367.601114]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2367.601116]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2367.601118]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2367.601122]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2367.601126]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2367.601129]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2367.601132]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2367.601136]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2367.601145]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2367.601148]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2367.601150]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2367.601153]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2367.601155]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2367.601158]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2433.099031] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2433.099074] CPU 6:
[ 2433.099076] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2433.099114] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2433.099115] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 2433.099121] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000292
[ 2433.099123] RAX: 000000009c6a7306 RBX: ffff8800281b2730 RCX: 000000009c6a72ee
[ 2433.099124] RDX: 00000000000005ea RSI: 0000000000000001 RDI: 0000000000000001
[ 2433.099126] RBP: ffffffff8020bbd3 R08: ffffffff9c6a72ee R09: 0000000000013a4b
[ 2433.099128] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2433.099130] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2433.099132] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2433.099134] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2433.099136] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2433.099137] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2433.099139] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2433.099140] Call Trace:
[ 2433.099142]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 2433.099149]  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 2433.099152]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2433.099156]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2433.099159]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2433.099164]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2433.099168]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2433.099171]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2433.099179]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2433.099183]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2433.099187]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2433.099190]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2433.099193]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2433.099195]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2433.099197]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2433.099200]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2433.099202]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2433.099205]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2433.099208]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2433.099211]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2433.099214]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2433.099217]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2433.099221]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2433.099224]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2433.099227]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2433.099229]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2433.099231]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2433.099234]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2433.099236]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2433.099238]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2433.099241]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2433.099244]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2433.099246]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2433.099252]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2433.099259]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2433.099261]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2433.099267]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2433.099273]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2433.099275]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2433.099281]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2433.099284]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2433.099289]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2433.099295]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2433.099297]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2433.099300]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2433.099302]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2433.099305]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2433.099307]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2433.099310]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2433.099313]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2433.099314]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2433.099318]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2433.099322]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2433.099325]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2433.099328]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2433.099331]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2433.099341]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2433.099344]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2433.099346]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2433.099348]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2433.099351]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2433.099354]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
Apr 15 08:28:38 nehalam syslogd 1.5.0#5ubuntu3: restart.
[ 2498.597229] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2498.597272] CPU 6:
[ 2498.597274] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2498.597312] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2498.597313] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 2498.597319] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 2498.597321] RAX: 00000000624181c5 RBX: 0000000000000006 RCX: 00000000624181a2
[ 2498.597323] RDX: 0000000000000613 RSI: 0000000000000001 RDI: 0000000000000001
[ 2498.597324] RBP: ffffffff8020bbd3 R08: 000000006241818a R09: 0000000000013a4b
[ 2498.597326] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2498.597328] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2498.597330] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2498.597332] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2498.597333] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2498.597335] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2498.597337] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2498.597338] Call Trace:
[ 2498.597340]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2498.597347]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2498.597349]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2498.597354]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2498.597357]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2498.597362]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2498.597366]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2498.597369]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2498.597377]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2498.597381]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2498.597385]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2498.597388]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2498.597390]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2498.597393]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2498.597395]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2498.597398]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2498.597400]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2498.597402]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2498.597406]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2498.597409]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2498.597411]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2498.597415]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2498.597418]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2498.597421]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2498.597424]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2498.597426]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2498.597429]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2498.597431]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2498.597433]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2498.597436]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2498.597438]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2498.597441]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2498.597443]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2498.597450]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2498.597456]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2498.597458]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2498.597464]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2498.597470]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2498.597472]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2498.597478]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2498.597480]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2498.597486]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2498.597491]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2498.597494]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2498.597497]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2498.597499]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2498.597502]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2498.597504]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2498.597507]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2498.597509]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2498.597511]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2498.597515]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2498.597519]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2498.597522]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2498.597525]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2498.597529]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2498.597539]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2498.597542]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2498.597544]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2498.597546]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2498.597549]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2498.597552]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2564.096427] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2564.096470] CPU 6:
[ 2564.096472] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2564.096512] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2564.096513] RIP: 0010:[<ffffffff80390382>]  [<ffffffff80390382>] delay_tsc+0x9/0x57
[ 2564.096519] RSP: 0018:ffff8800281a3530  EFLAGS: 00000202
[ 2564.096521] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 00000000284159f4
[ 2564.096523] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 2564.096525] RBP: ffffffff8020bbd3 R08: 00000000284159f4 R09: 0000000000013a4b
[ 2564.096526] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34b0
[ 2564.096528] R13: 000000002471c8ef R14: 0000000028188f77 R15: ffffffff8021d470
[ 2564.096530] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2564.096532] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2564.096534] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2564.096536] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2564.096537] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2564.096539] Call Trace:
[ 2564.096540]  <IRQ>  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2564.096549]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2564.096552]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2564.096557]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2564.096561]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2564.096564]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2564.096572]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2564.096576]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2564.096580]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2564.096583]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2564.096585]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2564.096588]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2564.096590]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2564.096593]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2564.096595]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2564.096597]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2564.096601]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2564.096604]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2564.096606]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2564.096610]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2564.096613]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2564.096616]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2564.096619]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2564.096622]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2564.096624]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2564.096626]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2564.096629]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2564.096631]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2564.096633]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2564.096636]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2564.096639]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2564.096645]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2564.096651]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2564.096654]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2564.096660]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2564.096666]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2564.096668]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2564.096674]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2564.096676]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2564.096682]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2564.096687]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2564.096690]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2564.096693]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2564.096695]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2564.096698]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2564.096700]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2564.096704]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2564.096707]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2564.096708]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2564.096712]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2564.096716]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2564.096720]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2564.096723]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2564.096727]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2564.096736]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2564.096739]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2564.096741]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2564.096744]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2564.096746]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2564.096749]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2629.594624] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2629.594668] CPU 6:
[ 2629.594669] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2629.594707] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2629.594709] RIP: 0010:[<ffffffff803903c5>]  [<ffffffff803903c5>] delay_tsc+0x4c/0x57
[ 2629.594715] RSP: 0018:ffff8800281a34f8  EFLAGS: 00000206
[ 2629.594717] RAX: 0000000000000012 RBX: 0000000000000006 RCX: 00000000ee1867ed
[ 2629.594718] RDX: 00000000ee1867ed RSI: 0000000000000001 RDI: 0000000000000001
[ 2629.594720] RBP: ffffffff8020bbd3 R08: ffffffffee1867ed R09: 0000000000013a4b
[ 2629.594722] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2629.594724] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2629.594726] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2629.594728] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2629.594729] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2629.594731] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2629.594733] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2629.594735] Call Trace:
[ 2629.594736]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2629.594742]  [<ffffffff803939a9>] ? _raw_spin_lock+0x7d/0x105
[ 2629.594745]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2629.594749]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2629.594752]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2629.594757]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2629.594761]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2629.594764]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2629.594772]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2629.594776]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2629.594780]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2629.594783]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2629.594786]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2629.594788]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2629.594791]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2629.594793]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2629.594796]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2629.594798]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2629.594801]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2629.594804]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2629.594806]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2629.594810]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2629.594814]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2629.594817]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2629.594820]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2629.594822]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2629.594824]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2629.594827]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2629.594829]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2629.594831]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2629.594834]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2629.594837]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2629.594839]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2629.594845]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2629.594852]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2629.594854]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2629.594860]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2629.594866]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2629.594868]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2629.594874]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2629.594877]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2629.594882]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2629.594888]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2629.594890]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2629.594893]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2629.594896]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2629.594898]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2629.594901]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2629.594904]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2629.594906]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2629.594908]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2629.594912]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2629.594916]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2629.594919]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2629.594922]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2629.594925]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2629.594935]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2629.594937]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2629.594940]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2629.594942]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2629.594945]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2629.594948]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2695.092824] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2695.092867] CPU 6:
[ 2695.092868] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2695.092907] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2695.092909] RIP: 0010:[<ffffffff803903a7>]  [<ffffffff803903a7>] delay_tsc+0x2e/0x57
[ 2695.092915] RSP: 0018:ffff8800281a34f8  EFLAGS: 00000202
[ 2695.092917] RAX: ffffffffb3ef768f RBX: 0000000000000006 RCX: 00000000b3ef768f
[ 2695.092918] RDX: 00000000b3ef768f RSI: 0000000000000001 RDI: 0000000000000001
[ 2695.092920] RBP: ffffffff8020bbd3 R08: ffffffffb3ef768f R09: 0000000000013a4b
[ 2695.092922] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2695.092924] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2695.092926] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2695.092928] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2695.092930] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2695.092931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2695.092933] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2695.092935] Call Trace:
[ 2695.092936]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2695.092943]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2695.092945]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2695.092950]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2695.092953]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2695.092958]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2695.092962]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2695.092965]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2695.092973]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2695.092977]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2695.092981]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2695.092984]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2695.092986]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2695.092988]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2695.092991]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2695.092993]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2695.092996]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2695.092998]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2695.093001]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2695.093004]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2695.093007]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2695.093010]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2695.093014]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2695.093017]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2695.093020]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2695.093022]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2695.093025]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2695.093027]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2695.093030]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2695.093032]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2695.093034]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2695.093037]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2695.093040]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2695.093046]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2695.093052]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2695.093055]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2695.093061]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2695.093067]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2695.093069]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2695.093075]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2695.093078]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2695.093083]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2695.093089]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2695.093092]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2695.093094]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2695.093097]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2695.093100]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2695.093102]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2695.093105]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2695.093108]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2695.093109]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2695.093113]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2695.093117]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2695.093120]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2695.093124]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2695.093128]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2695.093138]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2695.093141]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2695.093143]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2695.093146]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2695.093148]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2695.093151]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2760.591021] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2760.591065] CPU 6:
[ 2760.591066] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2760.591104] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2760.591106] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 2760.591112] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000292
[ 2760.591114] RAX: 0000000079c68488 RBX: ffff8800281b2730 RCX: 0000000079c6846d
[ 2760.591115] RDX: 00000000000006b6 RSI: 0000000000000001 RDI: 0000000000000001
[ 2760.591117] RBP: ffffffff8020bbd3 R08: 0000000079c6846d R09: 0000000000013a4b
[ 2760.591119] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2760.591121] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2760.591123] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2760.591125] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2760.591126] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2760.591128] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2760.591130] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2760.591132] Call Trace:
[ 2760.591133]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 2760.591140]  [<ffffffff803903dc>] ? __delay+0x0/0xa
[ 2760.591143]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2760.591148]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2760.591151]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2760.591156]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2760.591160]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2760.591163]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2760.591171]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2760.591175]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2760.591178]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2760.591181]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2760.591183]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2760.591185]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2760.591188]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2760.591190]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2760.591193]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2760.591195]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2760.591198]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2760.591201]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2760.591204]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2760.591207]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2760.591211]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2760.591214]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2760.591217]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2760.591219]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2760.591222]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2760.591224]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2760.591226]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2760.591229]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2760.591231]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2760.591234]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2760.591236]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2760.591243]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2760.591249]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2760.591252]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2760.591257]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2760.591263]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2760.591266]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2760.591271]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2760.591274]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2760.591279]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2760.591285]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2760.591288]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2760.591290]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2760.591293]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2760.591295]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2760.591298]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2760.591301]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2760.591303]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2760.591304]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2760.591309]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2760.591312]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2760.591316]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2760.591318]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2760.591322]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2760.591332]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2760.591334]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2760.591336]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2760.591339]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2760.591341]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2760.591344]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2826.089219] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2826.089263] CPU 6:
[ 2826.089265] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2826.089303] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2826.089305] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 2826.089311] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000292
[ 2826.089313] RAX: 000000003f9d9365 RBX: ffff8800281b2730 RCX: 000000003f9d934d
[ 2826.089315] RDX: 00000000000006df RSI: 0000000000000001 RDI: 0000000000000001
[ 2826.089317] RBP: ffffffff8020bbd3 R08: 000000003f9d934d R09: 0000000000013a4b
[ 2826.089318] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2826.089320] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2826.089322] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2826.089324] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2826.089326] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2826.089328] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2826.089330] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2826.089331] Call Trace:
[ 2826.089333]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 2826.089340]  [<ffffffff803939a9>] ? _raw_spin_lock+0x7d/0x105
[ 2826.089343]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2826.089348]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2826.089351]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2826.089356]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2826.089360]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2826.089363]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2826.089371]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2826.089375]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2826.089380]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2826.089383]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2826.089385]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2826.089387]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2826.089390]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2826.089393]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2826.089395]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2826.089397]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2826.089401]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2826.089404]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2826.089406]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2826.089410]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2826.089413]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2826.089416]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2826.089419]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2826.089421]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2826.089424]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2826.089426]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2826.089429]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2826.089431]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2826.089433]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2826.089436]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2826.089439]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2826.089446]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2826.089452]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2826.089455]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2826.089461]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2826.089466]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2826.089469]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2826.089475]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2826.089477]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2826.089483]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2826.089489]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2826.089491]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2826.089494]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2826.089496]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2826.089499]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2826.089502]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2826.089505]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2826.089508]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2826.089509]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2826.089513]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2826.089517]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2826.089521]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2826.089524]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2826.089527]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2826.089537]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2826.089540]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2826.089542]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2826.089545]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2826.089547]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2826.089550]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2891.587418] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2891.587462] CPU 6:
[ 2891.587464] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2891.587503] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2891.587504] RIP: 0010:[<ffffffff803903c5>]  [<ffffffff803903c5>] delay_tsc+0x4c/0x57
[ 2891.587511] RSP: 0018:ffff8800281a34f8  EFLAGS: 00000206
[ 2891.587512] RAX: 0000000000000012 RBX: 0000000000000006 RCX: 000000000574a13d
[ 2891.587514] RDX: 000000000574a13d RSI: 0000000000000001 RDI: 0000000000000001
[ 2891.587516] RBP: ffffffff8020bbd3 R08: 000000000574a13d R09: 0000000000013a4b
[ 2891.587518] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 2891.587520] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 2891.587522] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2891.587524] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2891.587525] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2891.587527] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2891.587529] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2891.587531] Call Trace:
[ 2891.587532]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 2891.587538]  [<ffffffff803903dc>] ? __delay+0x0/0xa
[ 2891.587541]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 2891.587546]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2891.587549]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2891.587554]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2891.587558]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2891.587561]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2891.587569]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2891.587573]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2891.587577]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2891.587580]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2891.587583]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2891.587585]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2891.587587]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2891.587590]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2891.587592]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2891.587595]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2891.587599]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2891.587602]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2891.587604]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2891.587608]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2891.587611]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2891.587614]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2891.587617]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2891.587620]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2891.587622]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2891.587624]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2891.587627]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2891.587629]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2891.587631]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2891.587634]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2891.587637]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2891.587643]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2891.587650]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2891.587652]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2891.587658]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2891.587664]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2891.587667]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2891.587672]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2891.587675]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2891.587680]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2891.587686]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2891.587689]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2891.587691]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2891.587694]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2891.587696]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2891.587699]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2891.587702]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2891.587705]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2891.587706]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2891.587710]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2891.587714]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2891.587717]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2891.587720]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2891.587723]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2891.587733]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2891.587736]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2891.587738]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2891.587741]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2891.587743]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2891.587746]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 2957.085615] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2957.085659] CPU 6:
[ 2957.085661] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 2957.085700] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 2957.085701] RIP: 0010:[<ffffffff803939bc>]  [<ffffffff803939bc>] _raw_spin_lock+0x90/0x105
[ 2957.085707] RSP: 0018:ffff8800281a3558  EFLAGS: 00000246
[ 2957.085709] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 00000000cb4bae55
[ 2957.085711] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 2957.085713] RBP: ffffffff8020bbd3 R08: ffffffffcb4bae55 R09: 0000000000013a4b
[ 2957.085714] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 2957.085716] R13: 0000000045b83c35 R14: 0000000000000001 R15: ffffffff8021d470
[ 2957.085718] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 2957.085720] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 2957.085722] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 2957.085724] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2957.085726] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2957.085727] Call Trace:
[ 2957.085729]  <IRQ>  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 2957.085736]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 2957.085739]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 2957.085744]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 2957.085748]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 2957.085751]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 2957.085760]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 2957.085763]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 2957.085767]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2957.085770]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2957.085772]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2957.085775]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 2957.085777]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 2957.085780]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 2957.085782]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 2957.085785]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 2957.085788]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 2957.085791]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 2957.085793]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 2957.085797]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 2957.085801]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 2957.085803]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 2957.085806]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2957.085809]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2957.085811]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2957.085813]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 2957.085816]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 2957.085818]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 2957.085821]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 2957.085824]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 2957.085826]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2957.085833]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 2957.085839]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 2957.085842]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 2957.085848]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 2957.085854]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 2957.085856]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 2957.085862]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2957.085865]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 2957.085870]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 2957.085876]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 2957.085878]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 2957.085881]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 2957.085884]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 2957.085886]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 2957.085889]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 2957.085892]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 2957.085895]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 2957.085896]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 2957.085900]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 2957.085904]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 2957.085908]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 2957.085911]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 2957.085916]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 2957.085926]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 2957.085928]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 2957.085931]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 2957.085933]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 2957.085936]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 2957.085939]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3022.583814] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3022.583859] CPU 6:
[ 3022.583860] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3022.583899] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3022.583900] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 3022.583906] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 3022.583908] RAX: 000000009122bda2 RBX: 0000000000000006 RCX: 000000009122bd7f
[ 3022.583910] RDX: 0000000000000759 RSI: 0000000000000001 RDI: 0000000000000001
[ 3022.583911] RBP: ffffffff8020bbd3 R08: ffffffff9122bd64 R09: 0000000000013a4b
[ 3022.583913] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 3022.583915] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 3022.583917] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3022.583919] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3022.583921] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3022.583923] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3022.583924] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3022.583926] Call Trace:
[ 3022.583927]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 3022.583935]  [<ffffffff803939a9>] ? _raw_spin_lock+0x7d/0x105
[ 3022.583937]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 3022.583942]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3022.583945]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3022.583950]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3022.583954]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3022.583957]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3022.583966]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3022.583970]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3022.583974]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3022.583977]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3022.583979]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3022.583981]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3022.583984]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3022.583987]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3022.583989]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3022.583991]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3022.583995]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3022.583998]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3022.584000]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3022.584004]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3022.584008]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3022.584011]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3022.584014]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3022.584016]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3022.584019]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3022.584021]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3022.584023]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3022.584026]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3022.584028]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3022.584031]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3022.584033]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3022.584040]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3022.584046]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3022.584049]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3022.584055]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3022.584060]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3022.584063]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3022.584068]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3022.584071]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3022.584076]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3022.584082]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3022.584085]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3022.584087]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3022.584090]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3022.584093]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3022.584095]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3022.584098]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3022.584100]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3022.584102]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3022.584106]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3022.584110]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3022.584113]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3022.584116]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3022.584121]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3022.584130]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3022.584133]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3022.584136]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3022.584138]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3022.584141]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3022.584144]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3088.082011] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3088.082056] CPU 6:
[ 3088.082057] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3088.082097] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3088.082099] RIP: 0010:[<ffffffff80210da2>]  [<ffffffff80210da2>] native_read_tsc+0xd/0x11
[ 3088.082104] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000206
[ 3088.082106] RAX: 0000078200000000 RBX: 0000000000000006 RCX: 0000000056f9cad2
[ 3088.082108] RDX: 0000000056f9cad2 RSI: 0000000000000001 RDI: 0000000000000001
[ 3088.082110] RBP: ffffffff8020bbd3 R08: 0000000056f9caa5 R09: 0000000000013a4b
[ 3088.082112] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 3088.082114] R13: 0000000000000006 R14: 000002c2ce8ec94b R15: ffffffff8021d470
[ 3088.082116] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3088.082118] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3088.082119] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3088.082121] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3088.082123] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3088.082125] Call Trace:
[ 3088.082126]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 3088.082133]  [<ffffffff80390380>] ? delay_tsc+0x7/0x57
[ 3088.082137]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 3088.082141]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3088.082144]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3088.082149]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3088.082154]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3088.082157]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3088.082165]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3088.082169]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3088.082173]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3088.082176]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3088.082179]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3088.082181]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3088.082183]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3088.082186]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3088.082188]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3088.082191]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3088.082194]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3088.082197]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3088.082199]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3088.082203]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3088.082207]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3088.082210]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3088.082213]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3088.082215]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3088.082218]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3088.082220]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3088.082222]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3088.082225]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3088.082227]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3088.082230]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3088.082233]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3088.082239]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3088.082245]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3088.082248]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3088.082254]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3088.082259]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3088.082262]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3088.082268]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3088.082270]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3088.082276]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3088.082281]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3088.082284]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3088.082287]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3088.082289]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3088.082292]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3088.082294]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3088.082297]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3088.082300]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3088.082301]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3088.082305]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3088.082309]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3088.082312]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3088.082315]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3088.082318]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3088.082328]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3088.082331]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3088.082333]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3088.082336]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3088.082338]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3088.082341]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3153.580209] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3153.580254] CPU 6:
[ 3153.580255] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3153.580295] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3153.580296] RIP: 0010:[<ffffffff803939bc>]  [<ffffffff803939bc>] _raw_spin_lock+0x90/0x105
[ 3153.580302] RSP: 0018:ffff8800281a3558  EFLAGS: 00000246
[ 3153.580304] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 000000001cd0d825
[ 3153.580306] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 3153.580308] RBP: ffffffff8020bbd3 R08: 000000001cd0d825 R09: 0000000000013a4b
[ 3153.580310] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 3153.580311] R13: 00000000051c61dd R14: 0000000000000001 R15: ffffffff8021d470
[ 3153.580314] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3153.580315] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3153.580317] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3153.580319] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3153.580321] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3153.580322] Call Trace:
[ 3153.580324]  <IRQ>  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 3153.580331]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3153.580335]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3153.580340]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3153.580344]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3153.580347]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3153.580355]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3153.580359]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3153.580363]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3153.580366]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3153.580369]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3153.580371]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3153.580374]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3153.580376]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3153.580379]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3153.580381]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3153.580384]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3153.580387]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3153.580389]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3153.580393]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3153.580397]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3153.580400]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3153.580403]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3153.580405]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3153.580408]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3153.580410]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3153.580412]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3153.580415]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3153.580417]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3153.580420]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3153.580422]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3153.580429]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3153.580435]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3153.580438]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3153.580444]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3153.580449]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3153.580452]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3153.580458]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3153.580460]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3153.580466]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3153.580471]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3153.580474]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3153.580477]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3153.580479]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3153.580482]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3153.580484]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3153.580487]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3153.580490]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3153.580491]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3153.580496]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3153.580499]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3153.580503]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3153.580506]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3153.580509]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3153.580519]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3153.580522]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3153.580524]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3153.580527]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3153.580529]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3153.580532]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ Seven further dumps of the same blocked task follow at ~65 second intervals (timestamps 3219.078 through 3612.067), each with the identical module list and call trace as above: pid 12397 (kvm), CPU 6, spinning in _raw_spin_lock reached from ipt_do_table [ip_tables], with the RIP sampled at various points inside the spin loop (native_read_tsc, delay_tsc, __ticket_spin_trylock, _raw_spin_lock). ]
[ 3677.565792] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3677.565836] CPU 6:
[ 3677.565837] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3677.565876] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3677.565878] RIP: 0010:[<ffffffff80210d9c>]  [<ffffffff80210d9c>] native_read_tsc+0x7/0x11
[ 3677.565883] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 3677.565885] RAX: 00000000000008f1 RBX: 0000000000000006 RCX: 000000004b894982
[ 3677.565887] RDX: 00000000000008f1 RSI: 0000000000000001 RDI: 0000000000000001
[ 3677.565888] RBP: ffffffff8020bbd3 R08: 000000004b894958 R09: 0000000000013a4b
[ 3677.565890] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 3677.565892] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 3677.565894] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3677.565896] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3677.565898] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3677.565900] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3677.565901] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3677.565903] Call Trace:
[ 3677.565904]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 3677.565913]  [<ffffffff80221ed0>] ? __ticket_spin_trylock+0xb/0x19
[ 3677.565916]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 3677.565921]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3677.565924]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3677.565929]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3677.565933]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3677.565936]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3677.565944]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3677.565948]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3677.565951]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3677.565954]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3677.565956]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3677.565959]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3677.565961]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3677.565964]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3677.565966]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3677.565968]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3677.565972]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3677.565974]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3677.565977]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3677.565981]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3677.565984]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3677.565987]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3677.565990]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3677.565992]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3677.565995]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3677.565997]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3677.566000]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3677.566002]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3677.566004]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3677.566008]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3677.566010]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3677.566017]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3677.566023]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3677.566026]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3677.566032]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3677.566037]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3677.566040]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3677.566046]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3677.566048]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3677.566054]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3677.566059]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3677.566062]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3677.566065]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3677.566067]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3677.566070]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3677.566072]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3677.566076]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3677.566078]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3677.566080]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3677.566084]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3677.566088]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3677.566091]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3677.566094]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3677.566097]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3677.566107]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3677.566110]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3677.566112]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3677.566115]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3677.566117]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3677.566121]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3743.064484] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3743.064528] CPU 6:
[ 3743.064529] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3743.064568] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3743.064570] RIP: 0010:[<ffffffff803939bc>]  [<ffffffff803939bc>] _raw_spin_lock+0x90/0x105
[ 3743.064576] RSP: 0018:ffff8800281a3558  EFLAGS: 00000246
[ 3743.064577] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 0000000011748295
[ 3743.064579] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 3743.064581] RBP: ffffffff8020bbd3 R08: 0000000011748295 R09: 0000000000013a4b
[ 3743.064583] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 3743.064585] R13: 000000006ef5396e R14: 0000000000000001 R15: ffffffff8021d470
[ 3743.064587] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3743.064589] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3743.064590] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3743.064592] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3743.064594] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3743.064596] Call Trace:
[ 3743.064597]  <IRQ>  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 3743.064605]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3743.064608]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3743.064613]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3743.064617]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3743.064620]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3743.064628]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3743.064632]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3743.064636]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3743.064639]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3743.064642]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3743.064644]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3743.064646]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3743.064649]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3743.064651]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3743.064654]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3743.064657]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3743.064660]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3743.064662]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3743.064666]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3743.064670]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3743.064673]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3743.064676]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3743.064678]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3743.064681]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3743.064683]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3743.064685]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3743.064688]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3743.064690]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3743.064693]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3743.064695]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3743.064702]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3743.064708]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3743.064711]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3743.064717]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3743.064722]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3743.064725]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3743.064731]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3743.064733]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3743.064739]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3743.064744]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3743.064747]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3743.064750]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3743.064753]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3743.064755]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3743.064758]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3743.064760]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3743.064763]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3743.064765]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3743.064769]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3743.064773]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3743.064776]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3743.064779]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3743.064782]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3743.064792]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3743.064795]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3743.064797]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3743.064800]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3743.064802]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3743.064805]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3808.562418] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3808.562462] CPU 6:
[ 3808.562463] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3808.562503] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3808.562504] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 3808.562510] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 3808.562512] RAX: 00000000d740cea4 RBX: 0000000000000006 RCX: 00000000d740ce92
[ 3808.562513] RDX: 0000000000000942 RSI: 0000000000000001 RDI: 0000000000000001
[ 3808.562515] RBP: ffffffff8020bbd3 R08: ffffffffd740ce66 R09: 0000000000013a4b
[ 3808.562517] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 3808.562519] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 3808.562521] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3808.562523] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3808.562525] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3808.562526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3808.562528] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3808.562530] Call Trace:
[ 3808.562531]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 3808.562538]  [<ffffffff803903cb>] ? delay_tsc+0x52/0x57
[ 3808.562542]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 3808.562546]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3808.562549]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3808.562554]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3808.562558]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3808.562561]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3808.562570]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3808.562574]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3808.562578]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3808.562581]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3808.562583]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3808.562585]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3808.562588]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3808.562591]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3808.562593]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3808.562595]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3808.562599]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3808.562601]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3808.562604]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3808.562607]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3808.562611]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3808.562614]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3808.562617]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3808.562619]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3808.562622]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3808.562624]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3808.562626]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3808.562629]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3808.562631]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3808.562634]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3808.562637]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3808.562643]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3808.562649]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3808.562652]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3808.562658]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3808.562663]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3808.562666]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3808.562672]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3808.562674]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3808.562680]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3808.562685]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3808.562688]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3808.562691]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3808.562693]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3808.562696]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3808.562698]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3808.562701]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3808.562704]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3808.562705]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3808.562710]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3808.562713]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3808.562717]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3808.562720]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3808.562724]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3808.562734]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3808.562736]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3808.562739]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3808.562742]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3808.562744]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3808.562747]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3874.060789] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3874.060832] CPU 6:
[ 3874.060834] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3874.060872] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3874.060874] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 3874.060880] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 3874.060882] RAX: 000000009d1ee6b5 RBX: 0000000000000006 RCX: 000000009d1ee692
[ 3874.060883] RDX: 000000000000096b RSI: 0000000000000001 RDI: 0000000000000001
[ 3874.060885] RBP: ffffffff8020bbd3 R08: ffffffff9d1ee67a R09: 0000000000013a4b
[ 3874.060887] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 3874.060889] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 3874.060891] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3874.060893] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3874.060895] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3874.060896] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3874.060898] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3874.060900] Call Trace:
[ 3874.060901]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 3874.060909]  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 3874.060912]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 3874.060916]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3874.060919]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3874.060924]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3874.060928]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3874.060931]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3874.060939]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3874.060944]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3874.060948]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3874.060951]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3874.060953]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3874.060956]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3874.060958]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3874.060961]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3874.060963]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3874.060965]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3874.060969]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3874.060971]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3874.060974]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3874.060978]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3874.060981]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3874.060984]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3874.060987]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3874.060990]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3874.060992]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3874.060994]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3874.060997]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3874.060999]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3874.061001]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3874.061004]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3874.061007]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3874.061013]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3874.061020]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3874.061023]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3874.061028]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3874.061034]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3874.061037]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3874.061042]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3874.061045]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3874.061050]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3874.061056]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3874.061059]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3874.061061]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3874.061064]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3874.061067]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3874.061069]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3874.061072]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3874.061074]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3874.061076]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3874.061080]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3874.061084]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3874.061087]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3874.061090]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3874.061093]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3874.061103]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3874.061106]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3874.061108]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3874.061111]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3874.061113]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3874.061116]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 3939.559568] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3939.559610] CPU 6:
[ 3939.559611] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 3939.559651] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 3939.559652] RIP: 0010:[<ffffffff80210da5>]  [<ffffffff80210da5>] native_read_tsc+0x10/0x11
[ 3939.559657] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 3939.559659] RAX: 00000994630db0b0 RBX: ffff8800281b2730 RCX: 00000000630db0b0
[ 3939.559661] RDX: 00000000630db0b0 RSI: 0000000000000001 RDI: 0000000000000001
[ 3939.559662] RBP: ffffffff8020bbd3 R08: 00000000630db098 R09: 0000000000013a4b
[ 3939.559664] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 3939.559666] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 3939.559668] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 3939.559670] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 3939.559672] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 3939.559674] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 3939.559675] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 3939.559677] Call Trace:
[ 3939.559678]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 3939.559685]  [<ffffffff803939bc>] ? _raw_spin_lock+0x90/0x105
[ 3939.559688]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 3939.559692]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 3939.559695]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 3939.559700]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 3939.559704]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 3939.559707]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 3939.559715]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 3939.559719]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 3939.559723]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3939.559725]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3939.559728]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3939.559730]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 3939.559733]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 3939.559735]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 3939.559737]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 3939.559740]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 3939.559743]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 3939.559746]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 3939.559748]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 3939.559752]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 3939.559756]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 3939.559759]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 3939.559762]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3939.559764]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3939.559766]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3939.559769]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 3939.559771]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 3939.559773]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 3939.559776]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 3939.559779]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 3939.559781]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3939.559787]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 3939.559794]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 3939.559796]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 3939.559802]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 3939.559808]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 3939.559810]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 3939.559816]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3939.559819]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 3939.559824]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 3939.559830]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 3939.559832]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 3939.559835]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 3939.559838]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 3939.559840]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 3939.559843]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 3939.559845]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 3939.559848]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 3939.559849]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 3939.559853]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 3939.559857]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 3939.559860]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 3939.559863]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 3939.559866]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 3939.559876]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 3939.559878]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 3939.559881]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 3939.559883]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 3939.559886]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 3939.559889]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4005.057732] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4005.057776] CPU 6:
[ 4005.057777] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4005.057815] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4005.057817] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4005.057823] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 4005.057825] RAX: 0000000028e359a2 RBX: 0000000000000006 RCX: 0000000028e3597f
[ 4005.057826] RDX: 00000000000009bd RSI: 0000000000000001 RDI: 0000000000000001
[ 4005.057828] RBP: ffffffff8020bbd3 R08: 0000000028e35964 R09: 0000000000013a4b
[ 4005.057830] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4005.057832] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4005.057834] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4005.057836] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4005.057838] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4005.057840] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4005.057841] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4005.057843] Call Trace:
[ 4005.057844]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 4005.057853]  [<ffffffff80221ed0>] ? __ticket_spin_trylock+0xb/0x19
[ 4005.057856]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4005.057861]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4005.057864]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4005.057869]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4005.057873]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4005.057876]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4005.057884]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4005.057888]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4005.057892]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4005.057894]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4005.057897]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4005.057899]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4005.057902]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4005.057904]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4005.057907]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4005.057909]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4005.057912]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4005.057915]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4005.057917]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4005.057921]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4005.057925]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4005.057928]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4005.057931]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4005.057933]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4005.057935]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4005.057938]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4005.057940]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4005.057942]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4005.057945]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4005.057948]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4005.057950]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4005.057957]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4005.057963]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4005.057966]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4005.057971]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4005.057977]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4005.057980]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4005.057985]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4005.057988]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4005.057993]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4005.057999]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4005.058002]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4005.058004]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4005.058007]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4005.058010]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4005.058012]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4005.058015]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4005.058018]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4005.058019]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4005.058023]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4005.058027]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4005.058030]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4005.058033]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4005.058036]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4005.058046]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4005.058049]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4005.058051]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4005.058054]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4005.058056]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4005.058059]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4070.556255] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4070.556299] CPU 6:
[ 4070.556300] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4070.556339] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4070.556340] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4070.556346] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000206
[ 4070.556348] RAX: 00000000eec7b224 RBX: 0000000000000006 RCX: 00000000eec7b212
[ 4070.556350] RDX: 00000000000009e5 RSI: 0000000000000001 RDI: 0000000000000001
[ 4070.556351] RBP: ffffffff8020bbd3 R08: ffffffffeec7b1e6 R09: 0000000000013a4b
[ 4070.556353] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4070.556355] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4070.556357] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4070.556359] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4070.556361] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4070.556363] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4070.556364] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4070.556366] Call Trace:
[ 4070.556367]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 4070.556375]  [<ffffffff803939a9>] ? _raw_spin_lock+0x7d/0x105
[ 4070.556377]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4070.556382]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4070.556385]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4070.556390]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4070.556394]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4070.556397]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4070.556405]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4070.556409]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4070.556413]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4070.556416]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4070.556419]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4070.556421]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4070.556424]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4070.556426]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4070.556428]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4070.556431]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4070.556434]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4070.556437]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4070.556439]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4070.556443]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4070.556447]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4070.556449]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4070.556452]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4070.556455]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4070.556457]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4070.556459]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4070.556462]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4070.556464]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4070.556467]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4070.556469]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4070.556472]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4070.556478]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4070.556485]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4070.556487]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4070.556493]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4070.556499]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4070.556502]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4070.556507]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4070.556510]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4070.556515]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4070.556521]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4070.556523]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4070.556526]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4070.556529]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4070.556531]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4070.556534]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4070.556536]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4070.556539]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4070.556540]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4070.556544]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4070.556548]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4070.556551]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4070.556554]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4070.556558]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4070.556567]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4070.556570]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4070.556572]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4070.556575]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4070.556577]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4070.556580]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4133.531555] Xorg          D 0000000000000000     0  7152      1
[ 4133.531558]  ffff8801bf120000 0000000000000046 0000000000000000 ffffffff8053a015
[ 4133.531561]  0000000000011a00 000000000000cc78 ffff8801bc4fbec0 ffff8801bc4fc250
[ 4133.531565]  0000000500000046 ffff880028197a18 ffff880028192c78 ffff8801bc4fc250
[ 4133.531568] Call Trace:
[ 4133.531573]  [<ffffffff8053a015>] ? __schedule+0x131/0x941
[ 4133.531576]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4133.531578]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4133.531581]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4133.531584]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4133.531586]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4133.531589]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4133.531592]  [<ffffffff8024b7c1>] ? synchronize_rcu+0x49/0x50
[ 4133.531595]  [<ffffffff8024b649>] ? wakeme_after_rcu+0x0/0x9
[ 4133.531597]  [<ffffffff8053cc1b>] ? _spin_unlock+0x17/0x20
[ 4133.531603]  [<ffffffffa00e9136>] ? evdev_release+0x58/0x9f [evdev]
[ 4133.531606]  [<ffffffff802b5cfe>] ? __fput+0xe7/0x1b3
[ 4133.531609]  [<ffffffff802b319e>] ? filp_close+0x5b/0x62
[ 4133.531611]  [<ffffffff802b3244>] ? sys_close+0x9f/0xdd
[ 4133.531614]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4133.531616] INFO: lockdep is turned off.
[ 4136.054116] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4136.054156] CPU 6:
[ 4136.054157] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4136.054198] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4136.054199] RIP: 0010:[<ffffffff803903cb>]  [<ffffffff803903cb>] delay_tsc+0x52/0x57
[ 4136.054204] RSP: 0018:ffff8800281a3540  EFLAGS: 00000296
[ 4136.054206] RAX: 0000000000000012 RBX: ffff8800281b2730 RCX: 00000000b491091e
[ 4136.054208] RDX: 00000000b491091e RSI: 0000000000000001 RDI: 0000000000000001
[ 4136.054209] RBP: ffffffff8020bbd3 R08: ffffffffb491091e R09: 0000000000013a4b
[ 4136.054211] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34c0
[ 4136.054213] R13: 00000000855541cc R14: 00000000b4683e4e R15: ffffffff8021d470
[ 4136.054215] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4136.054217] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4136.054219] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4136.054221] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4136.054222] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4136.054224] Call Trace:
[ 4136.054225]  <IRQ>  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4136.054232]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4136.054235]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4136.054239]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4136.054243]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4136.054246]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4136.054253]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4136.054257]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4136.054260]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4136.054263]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4136.054265]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4136.054268]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4136.054270]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4136.054273]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4136.054275]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4136.054277]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4136.054280]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4136.054283]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4136.054285]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4136.054289]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4136.054292]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4136.054295]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4136.054298]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4136.054301]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4136.054303]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4136.054305]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4136.054308]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4136.054310]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4136.054313]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4136.054315]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4136.054318]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4136.054324]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4136.054330]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4136.054333]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4136.054339]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4136.054345]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4136.054347]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4136.054353]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4136.054356]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4136.054361]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4136.054367]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4136.054370]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4136.054372]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4136.054375]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4136.054377]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4136.054380]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4136.054382]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4136.054385]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4136.054386]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4136.054390]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4136.054394]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4136.054397]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4136.054400]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4136.054403]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4136.054412]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4136.054415]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4136.054417]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4136.054420]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4136.054422]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4136.054425]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4201.552295] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4201.552335] CPU 6:
[ 4201.552336] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4201.552375] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4201.552377] RIP: 0010:[<ffffffff803939a9>]  [<ffffffff803939a9>] _raw_spin_lock+0x7d/0x105
[ 4201.552381] RSP: 0018:ffff8800281a3558  EFLAGS: 00000287
[ 4201.552383] RAX: 0000000000000012 RBX: ffff8800281b2730 RCX: 000000007a674b19
[ 4201.552385] RDX: 000000007a674b19 RSI: 0000000000000001 RDI: 0000000000000001
[ 4201.552387] RBP: ffffffff8020bbd3 R08: 000000007a674b19 R09: 0000000000013a4b
[ 4201.552389] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 4201.552390] R13: 00000000064e1b4e R14: 0000000000000001 R15: ffffffff8021d470
[ 4201.552393] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4201.552395] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4201.552396] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4201.552398] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4201.552400] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4201.552401] Call Trace:
[ 4201.552402]  <IRQ>  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4201.552409]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4201.552412]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4201.552416]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4201.552420]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4201.552423]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4201.552430]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4201.552434]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4201.552437]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4201.552440]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4201.552442]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4201.552444]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4201.552447]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4201.552449]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4201.552452]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4201.552454]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4201.552457]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4201.552459]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4201.552462]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4201.552465]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4201.552469]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4201.552472]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4201.552475]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4201.552477]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4201.552480]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4201.552482]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4201.552484]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4201.552487]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4201.552489]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4201.552492]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4201.552494]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4201.552500]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4201.552506]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4201.552509]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4201.552515]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4201.552521]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4201.552523]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4201.552529]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4201.552532]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4201.552537]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4201.552543]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4201.552545]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4201.552548]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4201.552551]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4201.552553]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4201.552556]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4201.552558]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4201.552560]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4201.552562]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4201.552566]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4201.552569]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4201.552573]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4201.552575]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4201.552578]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4201.552588]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4201.552590]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4201.552593]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4201.552595]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4201.552598]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4201.552600]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4253.511078] Xorg          D 0000000000000000     0  7152      1
[ 4253.511081]  ffff8801bf120000 0000000000000046 0000000000000000 ffffffff8053a015
[ 4253.511085]  0000000000011a00 000000000000cc78 ffff8801bc4fbec0 ffff8801bc4fc250
[ 4253.511088]  0000000500000046 ffff880028197a18 ffff880028192c78 ffff8801bc4fc250
[ 4253.511091] Call Trace:
[ 4253.511095]  [<ffffffff8053a015>] ? __schedule+0x131/0x941
[ 4253.511098]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4253.511100]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4253.511103]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4253.511105]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4253.511108]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4253.511111]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4253.511114]  [<ffffffff8024b7c1>] ? synchronize_rcu+0x49/0x50
[ 4253.511116]  [<ffffffff8024b649>] ? wakeme_after_rcu+0x0/0x9
[ 4253.511119]  [<ffffffff8053cc1b>] ? _spin_unlock+0x17/0x20
[ 4253.511124]  [<ffffffffa00e9136>] ? evdev_release+0x58/0x9f [evdev]
[ 4253.511127]  [<ffffffff802b5cfe>] ? __fput+0xe7/0x1b3
[ 4253.511129]  [<ffffffff802b319e>] ? filp_close+0x5b/0x62
[ 4253.511131]  [<ffffffff802b3244>] ? sys_close+0x9f/0xdd
[ 4253.511134]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4253.511135] INFO: lockdep is turned off.
[ 4253.511141] killall5      D 0000000000000000     0 20049  20037
[ 4253.511144]  ffff8801bf0dbec0 0000000000000046 0000000000000000 ffffffff802850d0
[ 4253.511148]  0000000000011a00 000000000000cc78 ffff8801b8ffaf10 ffff8801b8ffb2a0
[ 4253.511151]  0000000400000000 ffff88002817da18 ffff880028178c78 ffff8801b8ffb2a0
[ 4253.511154] Call Trace:
[ 4253.511158]  [<ffffffff802850d0>] ? find_get_page+0x0/0xc1
[ 4253.511161]  [<ffffffff8028da00>] ? ____pagevec_lru_add+0x142/0x172
[ 4253.511164]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4253.511166]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4253.511169]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4253.511171]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4253.511174]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4253.511176]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4253.511179]  [<ffffffff8028dafd>] ? lru_add_drain_per_cpu+0x0/0x5
[ 4253.511181]  [<ffffffff8024a56b>] ? flush_work+0xd4/0xe9
[ 4253.511183]  [<ffffffff8024a4cc>] ? flush_work+0x35/0xe9
[ 4253.511186]  [<ffffffff8022c29e>] ? __wake_up_common+0x44/0x73
[ 4253.511188]  [<ffffffff80249fa4>] ? wq_barrier_func+0x0/0x9
[ 4253.511191]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4253.511193]  [<ffffffff8024a8af>] ? schedule_on_each_cpu+0xc6/0x102
[ 4253.511196]  [<ffffffff8029c604>] ? sys_mlockall+0x31/0xbf
[ 4253.511199]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4253.511200] INFO: lockdep is turned off.
[ 4267.050770] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4267.050809] CPU 6:
[ 4267.050811] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4267.050850] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4267.050852] RIP: 0010:[<ffffffff80221ed0>]  [<ffffffff80221ed0>] __ticket_spin_trylock+0xb/0x19
[ 4267.050857] RSP: 0018:ffff8800281a3550  EFLAGS: 00000297
[ 4267.050859] RAX: 000000000000c2c1 RBX: ffff8800281b2730 RCX: 000000004049ac98
[ 4267.050861] RDX: 000000000000c3c1 RSI: 0000000000000001 RDI: ffff8800281b2730
[ 4267.050863] RBP: ffffffff8020bbd3 R08: 000000004049ac98 R09: 0000000000013a4b
[ 4267.050865] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 4267.050867] R13: 0000000026a24493 R14: 0000000000000001 R15: ffffffff8021d470
[ 4267.050869] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4267.050871] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4267.050872] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4267.050874] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4267.050876] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4267.050877] Call Trace:
[ 4267.050879]  <IRQ>  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 4267.050885]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4267.050888]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4267.050892]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4267.050897]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4267.050900]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4267.050907]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4267.050911]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4267.050914]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4267.050916]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4267.050919]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4267.050921]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4267.050923]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4267.050926]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4267.050928]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4267.050931]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4267.050934]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4267.050936]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4267.050938]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4267.050942]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4267.050946]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4267.050948]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4267.050951]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4267.050954]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4267.050956]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4267.050959]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4267.050961]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4267.050963]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4267.050966]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4267.050968]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4267.050971]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4267.050977]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4267.050983]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4267.050986]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4267.050992]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4267.050998]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4267.051000]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4267.051006]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4267.051009]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4267.051014]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4267.051020]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4267.051022]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4267.051025]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4267.051028]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4267.051030]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4267.051033]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4267.051036]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4267.051038]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4267.051039]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4267.051043]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4267.051047]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4267.051050]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4267.051053]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4267.051056]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4267.051066]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4267.051068]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4267.051071]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4267.051073]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4267.051075]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4267.051078]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4332.548525] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4332.548569] CPU 6:
[ 4332.548570] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4332.548609] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4332.548611] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4332.548616] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000292
[ 4332.548618] RAX: 00000000060ea447 RBX: ffff8800281b2730 RCX: 00000000060ea42f
[ 4332.548620] RDX: 0000000000000a89 RSI: 0000000000000001 RDI: 0000000000000001
[ 4332.548622] RBP: ffffffff8020bbd3 R08: 00000000060ea42f R09: 0000000000013a4b
[ 4332.548623] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4332.548625] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4332.548627] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4332.548629] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4332.548631] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4332.548633] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4332.548634] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4332.548636] Call Trace:
[ 4332.548637]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 4332.548645]  [<ffffffff803939b1>] ? _raw_spin_lock+0x85/0x105
[ 4332.548647]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4332.548652]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4332.548655]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4332.548660]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4332.548664]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4332.548668]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4332.548675]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4332.548679]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4332.548683]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4332.548686]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4332.548689]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4332.548691]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4332.548694]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4332.548696]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4332.548698]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4332.548701]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4332.548705]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4332.548708]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4332.548710]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4332.548714]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4332.548717]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4332.548720]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4332.548723]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4332.548725]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4332.548728]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4332.548730]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4332.548733]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4332.548735]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4332.548737]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4332.548740]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4332.548743]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4332.548749]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4332.548755]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4332.548758]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4332.548764]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4332.548770]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4332.548772]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4332.548778]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4332.548780]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4332.548786]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4332.548792]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4332.548794]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4332.548797]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4332.548800]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4332.548802]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4332.548805]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4332.548807]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4332.548810]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4332.548811]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4332.548816]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4332.548819]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4332.548823]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4332.548825]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4332.548829]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4332.548839]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4332.548841]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4332.548843]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4332.548846]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4332.548848]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4332.548851]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4373.491588] Xorg          D 0000000000000000     0  7152      1
[ 4373.491591]  ffff8801bf120000 0000000000000046 0000000000000000 ffffffff8053a015
[ 4373.491595]  0000000000011a00 000000000000cc78 ffff8801bc4fbec0 ffff8801bc4fc250
[ 4373.491598]  0000000500000046 ffff880028197a18 ffff880028192c78 ffff8801bc4fc250
[ 4373.491602] Call Trace:
[ 4373.491606]  [<ffffffff8053a015>] ? __schedule+0x131/0x941
[ 4373.491609]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4373.491612]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4373.491614]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4373.491617]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4373.491619]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4373.491622]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4373.491626]  [<ffffffff8024b7c1>] ? synchronize_rcu+0x49/0x50
[ 4373.491628]  [<ffffffff8024b649>] ? wakeme_after_rcu+0x0/0x9
[ 4373.491631]  [<ffffffff8053cc1b>] ? _spin_unlock+0x17/0x20
[ 4373.491637]  [<ffffffffa00e9136>] ? evdev_release+0x58/0x9f [evdev]
[ 4373.491640]  [<ffffffff802b5cfe>] ? __fput+0xe7/0x1b3
[ 4373.491642]  [<ffffffff802b319e>] ? filp_close+0x5b/0x62
[ 4373.491644]  [<ffffffff802b3244>] ? sys_close+0x9f/0xdd
[ 4373.491648]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4373.491649] INFO: lockdep is turned off.
[ 4373.491656] killall5      D 0000000000000000     0 20049  20037
[ 4373.491659]  ffff8801bf0dbec0 0000000000000046 0000000000000000 ffffffff802850d0
[ 4373.491663]  0000000000011a00 000000000000cc78 ffff8801b8ffaf10 ffff8801b8ffb2a0
[ 4373.491666]  0000000400000000 ffff88002817da18 ffff880028178c78 ffff8801b8ffb2a0
[ 4373.491669] Call Trace:
[ 4373.491673]  [<ffffffff802850d0>] ? find_get_page+0x0/0xc1
[ 4373.491676]  [<ffffffff8028da00>] ? ____pagevec_lru_add+0x142/0x172
[ 4373.491679]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4373.491681]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4373.491684]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4373.491686]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4373.491689]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4373.491691]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4373.491694]  [<ffffffff8028dafd>] ? lru_add_drain_per_cpu+0x0/0x5
[ 4373.491696]  [<ffffffff8024a56b>] ? flush_work+0xd4/0xe9
[ 4373.491699]  [<ffffffff8024a4cc>] ? flush_work+0x35/0xe9
[ 4373.491701]  [<ffffffff8022c29e>] ? __wake_up_common+0x44/0x73
[ 4373.491703]  [<ffffffff80249fa4>] ? wq_barrier_func+0x0/0x9
[ 4373.491706]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4373.491708]  [<ffffffff8024a8af>] ? schedule_on_each_cpu+0xc6/0x102
[ 4373.491711]  [<ffffffff8029c604>] ? sys_mlockall+0x31/0xbf
[ 4373.491714]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4373.491715] INFO: lockdep is turned off.
[ 4398.046541] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4398.046581] CPU 6:
[ 4398.046582] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4398.046621] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4398.046623] RIP: 0010:[<ffffffff8039039f>]  [<ffffffff8039039f>] delay_tsc+0x26/0x57
[ 4398.046628] RSP: 0018:ffff8800281a34f8  EFLAGS: 00000202
[ 4398.046629] RAX: 00000ab1cbde457f RBX: 0000000000000006 RCX: 00000000cbde457f
[ 4398.046631] RDX: 00000000cbde457f RSI: 0000000000000001 RDI: 0000000000000001
[ 4398.046633] RBP: ffffffff8020bbd3 R08: ffffffffcbde4544 R09: 0000000000013a4b
[ 4398.046635] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4398.046637] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4398.046639] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4398.046641] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4398.046643] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4398.046644] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4398.046646] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4398.046648] Call Trace:
[ 4398.046649]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 4398.046654]  [<ffffffff803939a9>] ? _raw_spin_lock+0x7d/0x105
[ 4398.046657]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4398.046661]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4398.046663]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4398.046668]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4398.046672]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4398.046675]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4398.046682]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4398.046686]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4398.046689]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4398.046692]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4398.046694]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4398.046696]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4398.046699]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4398.046702]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4398.046704]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4398.046706]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4398.046709]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4398.046711]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4398.046714]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4398.046717]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4398.046721]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4398.046724]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4398.046727]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4398.046729]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4398.046732]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4398.046734]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4398.046737]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4398.046739]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4398.046741]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4398.046744]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4398.046747]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4398.046753]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4398.046759]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4398.046762]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4398.046767]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4398.046773]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4398.046776]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4398.046782]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4398.046784]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4398.046790]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4398.046795]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4398.046798]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4398.046801]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4398.046803]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4398.046806]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4398.046808]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4398.046811]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4398.046813]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4398.046815]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4398.046819]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4398.046822]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4398.046826]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4398.046828]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4398.046831]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4398.046841]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4398.046844]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4398.046846]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4398.046848]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4398.046851]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4398.046854]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4463.544802] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4463.544841] CPU 6:
[ 4463.544843] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4463.544881] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4463.544883] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4463.544887] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 4463.544888] RAX: 0000000091b7e77b RBX: 0000000000000006 RCX: 0000000091b7e758
[ 4463.544890] RDX: 0000000000000ada RSI: 0000000000000001 RDI: 0000000000000001
[ 4463.544892] RBP: ffffffff8020bbd3 R08: ffffffff91b7e740 R09: 0000000000013a4b
[ 4463.544894] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4463.544896] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4463.544898] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4463.544900] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4463.544901] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4463.544903] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4463.544905] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4463.544906] Call Trace:
[ 4463.544908]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 4463.544913]  [<ffffffff803939a9>] ? _raw_spin_lock+0x7d/0x105
[ 4463.544916]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4463.544920]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4463.544923]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4463.544927]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4463.544931]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4463.544934]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4463.544941]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4463.544945]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4463.544948]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4463.544950]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4463.544953]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4463.544955]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4463.544958]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4463.544960]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4463.544962]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4463.544965]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4463.544968]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4463.544970]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4463.544972]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4463.544976]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4463.544980]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4463.544982]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4463.544985]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4463.544988]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4463.544990]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4463.544992]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4463.544995]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4463.544997]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4463.545000]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4463.545002]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4463.545005]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4463.545011]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4463.545017]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4463.545020]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4463.545026]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4463.545031]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4463.545034]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4463.545040]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4463.545042]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4463.545048]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4463.545053]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4463.545056]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4463.545059]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4463.545061]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4463.545064]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4463.545066]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4463.545069]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4463.545071]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4463.545072]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4463.545077]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4463.545080]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4463.545083]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4463.545086]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4463.545089]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4463.545098]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4463.545101]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4463.545103]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4463.545106]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4463.545108]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4463.545111]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4493.472921] Xorg          D 0000000000000000     0  7152      1
[ 4493.472924]  ffff8801bf120000 0000000000000046 0000000000000000 ffffffff8053a015
[ 4493.472928]  0000000000011a00 000000000000cc78 ffff8801bc4fbec0 ffff8801bc4fc250
[ 4493.472931]  0000000500000046 ffff880028197a18 ffff880028192c78 ffff8801bc4fc250
[ 4493.472935] Call Trace:
[ 4493.472938]  [<ffffffff8053a015>] ? __schedule+0x131/0x941
[ 4493.472941]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4493.472944]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4493.472946]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4493.472949]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4493.472951]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4493.472954]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4493.472957]  [<ffffffff8024b7c1>] ? synchronize_rcu+0x49/0x50
[ 4493.472959]  [<ffffffff8024b649>] ? wakeme_after_rcu+0x0/0x9
[ 4493.472962]  [<ffffffff8053cc1b>] ? _spin_unlock+0x17/0x20
[ 4493.472967]  [<ffffffffa00e9136>] ? evdev_release+0x58/0x9f [evdev]
[ 4493.472970]  [<ffffffff802b5cfe>] ? __fput+0xe7/0x1b3
[ 4493.472972]  [<ffffffff802b319e>] ? filp_close+0x5b/0x62
[ 4493.472974]  [<ffffffff802b3244>] ? sys_close+0x9f/0xdd
[ 4493.472977]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4493.472979] INFO: lockdep is turned off.
[ 4493.472985] killall5      D 0000000000000000     0 20049  20037
[ 4493.472988]  ffff8801bf0dbec0 0000000000000046 0000000000000000 ffffffff802850d0
[ 4493.472991]  0000000000011a00 000000000000cc78 ffff8801b8ffaf10 ffff8801b8ffb2a0
[ 4493.472994]  0000000400000000 ffff88002817da18 ffff880028178c78 ffff8801b8ffb2a0
[ 4493.472998] Call Trace:
[ 4493.473000]  [<ffffffff802850d0>] ? find_get_page+0x0/0xc1
[ 4493.473004]  [<ffffffff8028da00>] ? ____pagevec_lru_add+0x142/0x172
[ 4493.473006]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4493.473008]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4493.473011]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4493.473013]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4493.473016]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4493.473018]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4493.473021]  [<ffffffff8028dafd>] ? lru_add_drain_per_cpu+0x0/0x5
[ 4493.473023]  [<ffffffff8024a56b>] ? flush_work+0xd4/0xe9
[ 4493.473026]  [<ffffffff8024a4cc>] ? flush_work+0x35/0xe9
[ 4493.473028]  [<ffffffff8022c29e>] ? __wake_up_common+0x44/0x73
[ 4493.473030]  [<ffffffff80249fa4>] ? wq_barrier_func+0x0/0x9
[ 4493.473033]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4493.473035]  [<ffffffff8024a8af>] ? schedule_on_each_cpu+0xc6/0x102
[ 4493.473038]  [<ffffffff8029c604>] ? sys_mlockall+0x31/0xbf
[ 4493.473041]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4493.473042] INFO: lockdep is turned off.
[ 4529.043293] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4529.043332] CPU 6:
[ 4529.043334] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4529.043373] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4529.043374] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4529.043379] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000206
[ 4529.043380] RAX: 00000000579aece4 RBX: 0000000000000006 RCX: 00000000579aecd2
[ 4529.043382] RDX: 0000000000000b03 RSI: 0000000000000001 RDI: 0000000000000001
[ 4529.043384] RBP: ffffffff8020bbd3 R08: 00000000579aeca9 R09: 0000000000013a4b
[ 4529.043386] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4529.043388] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4529.043390] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4529.043392] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4529.043394] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4529.043395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4529.043397] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4529.043399] Call Trace:
[ 4529.043400]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 4529.043405]  [<ffffffff803903cb>] ? delay_tsc+0x52/0x57
[ 4529.043408]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4529.043412]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4529.043415]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4529.043420]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4529.043424]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4529.043427]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4529.043434]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4529.043438]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4529.043441]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4529.043443]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4529.043446]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4529.043448]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4529.043451]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4529.043453]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4529.043456]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4529.043458]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4529.043461]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4529.043463]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4529.043466]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4529.043469]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4529.043473]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4529.043476]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4529.043479]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4529.043481]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4529.043484]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4529.043486]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4529.043488]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4529.043491]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4529.043493]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4529.043496]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4529.043498]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4529.043504]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4529.043511]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4529.043513]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4529.043519]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4529.043525]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4529.043528]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4529.043533]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4529.043536]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4529.043541]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4529.043547]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4529.043550]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4529.043552]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4529.043555]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4529.043558]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4529.043560]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4529.043563]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4529.043565]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4529.043566]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4529.043570]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4529.043574]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4529.043577]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4529.043580]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4529.043583]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4529.043592]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4529.043595]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4529.043597]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4529.043600]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4529.043602]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4529.043605]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4594.542000] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4594.542039] CPU 6:
[ 4594.542041] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4594.542079] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4594.542081] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4594.542085] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000292
[ 4594.542087] RAX: 000000001d86bf79 RBX: ffff8800281b2730 RCX: 000000001d86bf61
[ 4594.542089] RDX: 0000000000000b2c RSI: 0000000000000001 RDI: 0000000000000001
[ 4594.542090] RBP: ffffffff8020bbd3 R08: 000000001d86bf61 R09: 0000000000013a4b
[ 4594.542092] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4594.542094] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4594.542096] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4594.542098] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4594.542100] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4594.542102] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4594.542103] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4594.542105] Call Trace:
[ 4594.542106]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 4594.542113]  [<ffffffff80221ed7>] ? __ticket_spin_trylock+0x12/0x19
[ 4594.542116]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4594.542120]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4594.542123]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4594.542127]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4594.542131]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4594.542134]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4594.542141]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4594.542145]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4594.542148]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4594.542150]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4594.542153]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4594.542155]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4594.542158]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4594.542160]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4594.542162]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4594.542165]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4594.542168]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4594.542170]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4594.542172]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4594.542176]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4594.542180]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4594.542182]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4594.542185]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4594.542188]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4594.542190]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4594.542192]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4594.542195]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4594.542197]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4594.542199]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4594.542202]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4594.542205]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4594.542211]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4594.542217]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4594.542220]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4594.542225]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4594.542231]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4594.542234]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4594.542239]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4594.542242]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4594.542247]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4594.542253]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4594.542256]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4594.542258]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4594.542261]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4594.542264]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4594.542266]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4594.542269]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4594.542271]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4594.542272]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4594.542276]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4594.542280]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4594.542283]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4594.542286]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4594.542289]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4594.542298]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4594.542301]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4594.542303]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4594.542306]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4594.542308]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4594.542311]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4613.455028] Xorg          D 0000000000000000     0  7152      1
[ 4613.455031]  ffff8801bf120000 0000000000000046 0000000000000000 ffffffff8053a015
[ 4613.455034]  0000000000011a00 000000000000cc78 ffff8801bc4fbec0 ffff8801bc4fc250
[ 4613.455038]  0000000500000046 ffff880028197a18 ffff880028192c78 ffff8801bc4fc250
[ 4613.455041] Call Trace:
[ 4613.455047]  [<ffffffff8053a015>] ? __schedule+0x131/0x941
[ 4613.455049]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4613.455052]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4613.455054]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4613.455058]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4613.455060]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4613.455064]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4613.455068]  [<ffffffff8024b7c1>] ? synchronize_rcu+0x49/0x50
[ 4613.455070]  [<ffffffff8024b649>] ? wakeme_after_rcu+0x0/0x9
[ 4613.455073]  [<ffffffff8053cc1b>] ? _spin_unlock+0x17/0x20
[ 4613.455082]  [<ffffffffa00e9136>] ? evdev_release+0x58/0x9f [evdev]
[ 4613.455085]  [<ffffffff802b5cfe>] ? __fput+0xe7/0x1b3
[ 4613.455088]  [<ffffffff802b319e>] ? filp_close+0x5b/0x62
[ 4613.455090]  [<ffffffff802b3244>] ? sys_close+0x9f/0xdd
[ 4613.455094]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4613.455095] INFO: lockdep is turned off.
[ 4613.455103] killall5      D 0000000000000000     0 20049  20037
[ 4613.455105]  ffff8801bf0dbec0 0000000000000046 0000000000000000 ffffffff802850d0
[ 4613.455109]  0000000000011a00 000000000000cc78 ffff8801b8ffaf10 ffff8801b8ffb2a0
[ 4613.455112]  0000000400000000 ffff88002817da18 ffff880028178c78 ffff8801b8ffb2a0
[ 4613.455116] Call Trace:
[ 4613.455119]  [<ffffffff802850d0>] ? find_get_page+0x0/0xc1
[ 4613.455123]  [<ffffffff8028da00>] ? ____pagevec_lru_add+0x142/0x172
[ 4613.455125]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4613.455127]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4613.455130]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4613.455133]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4613.455135]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4613.455137]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4613.455140]  [<ffffffff8028dafd>] ? lru_add_drain_per_cpu+0x0/0x5
[ 4613.455142]  [<ffffffff8024a56b>] ? flush_work+0xd4/0xe9
[ 4613.455145]  [<ffffffff8024a4cc>] ? flush_work+0x35/0xe9
[ 4613.455147]  [<ffffffff8022c29e>] ? __wake_up_common+0x44/0x73
[ 4613.455150]  [<ffffffff80249fa4>] ? wq_barrier_func+0x0/0x9
[ 4613.455152]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4613.455155]  [<ffffffff8024a8af>] ? schedule_on_each_cpu+0xc6/0x102
[ 4613.455158]  [<ffffffff8029c604>] ? sys_mlockall+0x31/0xbf
[ 4613.455160]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4613.455162] INFO: lockdep is turned off.
[ 4660.039940] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4660.039984] CPU 6:
[ 4660.039985] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4660.040024] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4660.040026] RIP: 0010:[<ffffffff80390379>]  [<ffffffff80390379>] delay_tsc+0x0/0x57
[ 4660.040032] RSP: 0018:ffff8800281a3550  EFLAGS: 00000206
[ 4660.040034] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 00000000e35344d6
[ 4660.040035] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 4660.040037] RBP: ffffffff8020bbd3 R08: ffffffffe35344d6 R09: 0000000000013a4b
[ 4660.040039] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 4660.040041] R13: 0000000048506dde R14: 0000000000000001 R15: ffffffff8021d470
[ 4660.040043] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4660.040045] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4660.040047] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4660.040049] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4660.040050] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4660.040052] Call Trace:
[ 4660.040053]  <IRQ>  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4660.040062]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4660.040065]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4660.040070]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4660.040074]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4660.040077]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4660.040085]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4660.040089]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4660.040092]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4660.040095]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4660.040098]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4660.040100]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4660.040102]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4660.040105]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4660.040107]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4660.040110]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4660.040113]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4660.040116]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4660.040118]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4660.040122]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4660.040126]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4660.040129]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4660.040132]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4660.040134]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4660.040137]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4660.040139]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4660.040141]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4660.040144]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4660.040146]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4660.040149]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4660.040152]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4660.040158]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4660.040164]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4660.040167]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4660.040173]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4660.040179]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4660.040182]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4660.040187]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4660.040190]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4660.040195]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4660.040201]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4660.040204]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4660.040206]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4660.040209]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4660.040211]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4660.040214]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4660.040217]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4660.040220]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4660.040221]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4660.040225]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4660.040229]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4660.040233]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4660.040236]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4660.040240]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4660.040250]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4660.040253]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4660.040255]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4660.040258]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4660.040260]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4660.040263]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4725.537911] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4725.537951] CPU 6:
[ 4725.537952] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4725.537991] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4725.537993] RIP: 0010:[<ffffffff803903dc>]  [<ffffffff803903dc>] __delay+0x0/0xa
[ 4725.537998] RSP: 0018:ffff8800281a3550  EFLAGS: 00000216
[ 4725.537999] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 00000000a9211a35
[ 4725.538001] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 4725.538003] RBP: ffffffff8020bbd3 R08: ffffffffa9211a35 R09: 0000000000013a4b
[ 4725.538005] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 4725.538006] R13: 0000000068adec60 R14: 0000000000000001 R15: ffffffff8021d470
[ 4725.538009] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4725.538011] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4725.538012] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4725.538014] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4725.538016] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4725.538017] Call Trace:
[ 4725.538018]  <IRQ>  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4725.538025]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4725.538028]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4725.538032]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4725.538036]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4725.538039]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4725.538046]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4725.538050]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4725.538053]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4725.538055]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4725.538057]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4725.538060]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4725.538062]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4725.538065]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4725.538067]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4725.538069]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4725.538072]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4725.538075]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4725.538077]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4725.538081]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4725.538084]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4725.538087]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4725.538090]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4725.538092]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4725.538095]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4725.538097]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4725.538100]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4725.538102]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4725.538104]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4725.538107]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4725.538110]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4725.538116]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4725.538122]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4725.538125]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4725.538130]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4725.538136]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4725.538139]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4725.538144]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4725.538147]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4725.538152]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4725.538158]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4725.538161]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4725.538163]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4725.538166]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4725.538169]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4725.538171]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4725.538174]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4725.538176]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4725.538177]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4725.538182]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4725.538185]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4725.538188]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4725.538191]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4725.538194]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4725.538204]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4725.538206]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4725.538208]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4725.538211]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4725.538213]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4725.538216]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4733.441024] Xorg          D 0000000000000000     0  7152      1
[ 4733.441027]  ffff8801bf120000 0000000000000046 0000000000000000 ffffffff8053a015
[ 4733.441030]  0000000000011a00 000000000000cc78 ffff8801bc4fbec0 ffff8801bc4fc250
[ 4733.441034]  0000000500000046 ffff880028197a18 ffff880028192c78 ffff8801bc4fc250
[ 4733.441037] Call Trace:
[ 4733.441041]  [<ffffffff8053a015>] ? __schedule+0x131/0x941
[ 4733.441044]  [<ffffffff8053a8d7>] ? schedule+0x9/0x1d
[ 4733.441046]  [<ffffffff8053abae>] ? schedule_timeout+0x1f/0x166
[ 4733.441049]  [<ffffffff8053a9dd>] ? wait_for_common+0x39/0x14f
[ 4733.441052]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4733.441054]  [<ffffffff8053aa85>] ? wait_for_common+0xe1/0x14f
[ 4733.441057]  [<ffffffff80231dfb>] ? default_wake_function+0x0/0x9
[ 4733.441061]  [<ffffffff8024b7c1>] ? synchronize_rcu+0x49/0x50
[ 4733.441063]  [<ffffffff8024b649>] ? wakeme_after_rcu+0x0/0x9
[ 4733.441066]  [<ffffffff8053cc1b>] ? _spin_unlock+0x17/0x20
[ 4733.441072]  [<ffffffffa00e9136>] ? evdev_release+0x58/0x9f [evdev]
[ 4733.441075]  [<ffffffff802b5cfe>] ? __fput+0xe7/0x1b3
[ 4733.441077]  [<ffffffff802b319e>] ? filp_close+0x5b/0x62
[ 4733.441079]  [<ffffffff802b3244>] ? sys_close+0x9f/0xdd
[ 4733.441082]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4733.441084] INFO: lockdep is turned off.
[ 4791.036883] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4791.036927] CPU 6:
[ 4791.036928] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4791.036968] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4791.036969] RIP: 0010:[<ffffffff80390379>]  [<ffffffff80390379>] delay_tsc+0x0/0x57
[ 4791.036976] RSP: 0018:ffff8800281a3550  EFLAGS: 00000206
[ 4791.036977] RAX: 0000000000000000 RBX: ffff8800281b2730 RCX: 000000006f17bae6
[ 4791.036979] RDX: 000000000000c300 RSI: 0000000000000001 RDI: 0000000000000001
[ 4791.036981] RBP: ffffffff8020bbd3 R08: 000000006f17bae6 R09: 0000000000013a4b
[ 4791.036983] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a34d0
[ 4791.036985] R13: 0000000088b84b41 R14: 0000000000000001 R15: ffffffff8021d470
[ 4791.036987] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4791.036989] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4791.036990] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4791.036992] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4791.036994] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4791.036996] Call Trace:
[ 4791.036997]  <IRQ>  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4791.037005]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4791.037008]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4791.037013]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4791.037017]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4791.037020]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4791.037028]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4791.037032]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4791.037037]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4791.037040]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4791.037042]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4791.037044]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4791.037047]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4791.037050]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4791.037052]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4791.037054]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4791.037057]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4791.037060]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4791.037063]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4791.037066]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4791.037070]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4791.037073]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4791.037076]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4791.037078]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4791.037081]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4791.037083]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4791.037086]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4791.037088]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4791.037090]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4791.037093]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4791.037096]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4791.037102]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4791.037109]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4791.037112]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4791.037117]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4791.037123]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4791.037126]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4791.037131]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4791.037134]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4791.037139]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4791.037145]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4791.037148]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4791.037151]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4791.037153]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4791.037156]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4791.037158]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4791.037162]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4791.037165]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4791.037166]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4791.037170]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4791.037174]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4791.037178]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4791.037181]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4791.037184]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4791.037194]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4791.037197]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4791.037199]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4791.037202]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4791.037204]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4791.037207]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4856.534855] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4856.534895] CPU 6:
[ 4856.534897] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4856.534936] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4856.534938] RIP: 0010:[<ffffffff80210d95>]  [<ffffffff80210d95>] native_read_tsc+0x0/0x11
[ 4856.534943] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000202
[ 4856.534944] RAX: 00000bcf34e590d0 RBX: 0000000000000006 RCX: 0000000034e590d0
[ 4856.534946] RDX: 0000000034e590d0 RSI: 0000000000000001 RDI: 0000000000000001
[ 4856.534948] RBP: ffffffff8020bbd3 R08: 0000000034e590b5 R09: 0000000000013a4b
[ 4856.534950] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4856.534951] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4856.534953] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4856.534955] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4856.534957] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4856.534959] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4856.534961] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4856.534962] Call Trace:
[ 4856.534963]  <IRQ>  [<ffffffff8039039f>] ? delay_tsc+0x26/0x57
[ 4856.534969]  [<ffffffff803903dc>] ? __delay+0x0/0xa
[ 4856.534972]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4856.534976]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4856.534979]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4856.534983]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4856.534987]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4856.534990]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4856.534997]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4856.535001]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4856.535004]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4856.535007]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4856.535009]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4856.535011]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4856.535014]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4856.535017]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4856.535019]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4856.535021]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4856.535024]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4856.535027]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4856.535029]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4856.535032]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4856.535036]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4856.535039]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4856.535042]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4856.535044]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4856.535047]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4856.535049]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4856.535051]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4856.535054]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4856.535056]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4856.535059]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4856.535061]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4856.535067]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4856.535074]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4856.535076]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4856.535082]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4856.535088]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4856.535091]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4856.535096]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4856.535099]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4856.535104]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4856.535110]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4856.535113]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4856.535115]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4856.535118]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4856.535120]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4856.535123]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4856.535125]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4856.535128]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4856.535129]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4856.535133]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4856.535137]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4856.535140]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4856.535142]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4856.535146]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4856.535155]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4856.535158]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4856.535160]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4856.535163]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4856.535165]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4856.535168]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b
[ 4922.032827] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4922.032871] CPU 6:
[ 4922.032872] Modules linked in: tun binfmt_misc ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc kvm_intel kvm pci_slot cpufreq_powersave cpufreq_ondemand cpufreq_conservative cpufreq_userspace snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device psmouse snd pcspkr serio_raw evdev wmi rtc_cmos soundcore snd_page_alloc sg sr_mod cdrom hid_belkin usbhid mvsas pata_marvell pata_acpi libsas ata_piix ahci scsi_transport_sas sky2 igb dca floppy fuse
[ 4922.032911] Pid: 12397, comm: kvm Not tainted 2.6.30-rc1 #6 System Product Name
[ 4922.032913] RIP: 0010:[<ffffffff80210d97>]  [<ffffffff80210d97>] native_read_tsc+0x2/0x11
[ 4922.032918] RSP: 0018:ffff8800281a34f0  EFLAGS: 00000292
[ 4922.032920] RAX: 00000000fab36795 RBX: ffff8800281b2730 RCX: 00000000fab3676c
[ 4922.032922] RDX: 0000000000000bf7 RSI: 0000000000000001 RDI: 0000000000000001
[ 4922.032924] RBP: ffffffff8020bbd3 R08: fffffffffab3676c R09: 0000000000013a4b
[ 4922.032925] R10: ffffffff8023a5e7 R11: ffffffff80390379 R12: ffff8800281a3470
[ 4922.032927] R13: 0000000000000006 R14: 0000000000000001 R15: ffffffff8021d470
[ 4922.032929] FS:  00007fa315516950(0000) GS:ffff8800281a0000(0000) knlGS:0000000000000000
[ 4922.032931] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[ 4922.032933] CR2: 00000000b7ff1000 CR3: 000000015cec1000 CR4: 00000000000026e0
[ 4922.032935] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4922.032937] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4922.032938] Call Trace:
[ 4922.032940]  <IRQ>  [<ffffffff80390393>] ? delay_tsc+0x1a/0x57
[ 4922.032947]  [<ffffffff803903cf>] ? delay_tsc+0x56/0x57
[ 4922.032950]  [<ffffffff803939c4>] ? _raw_spin_lock+0x98/0x105
[ 4922.032954]  [<ffffffffa024041d>] ? ipt_do_table+0x102/0x5f1 [ip_tables]
[ 4922.032958]  [<ffffffff8048d4e0>] ? skb_checksum+0x4c/0x257
[ 4922.032963]  [<ffffffffa025f1fc>] ? manip_pkt+0x80/0xf4 [nf_nat]
[ 4922.032967]  [<ffffffffa025f432>] ? nf_nat_icmp_reply_translation+0x1c2/0x245 [nf_nat]
[ 4922.032970]  [<ffffffff8048f632>] ? __alloc_skb+0x6b/0x13d
[ 4922.032978]  [<ffffffffa0246cfe>] ? nf_conntrack_in+0x45e/0x5ac [nf_conntrack]
[ 4922.032982]  [<ffffffffa01602b1>] ? nf_nat_fn+0xc1/0x14d [iptable_nat]
[ 4922.032986]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4922.032989]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4922.032991]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4922.032993]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
[ 4922.032996]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
[ 4922.032999]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
[ 4922.033001]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
[ 4922.033003]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
[ 4922.033007]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
[ 4922.033009]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
[ 4922.033012]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
[ 4922.033015]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
[ 4922.033019]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]
[ 4922.033022]  [<ffffffff804b37a0>] ? rt_intern_hash+0x46f/0x48a
[ 4922.033025]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4922.033027]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4922.033030]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4922.033032]  [<ffffffff804b7d94>] ? ip_forward_finish+0x0/0x3b
[ 4922.033035]  [<ffffffff804b8070>] ? ip_forward+0x2a1/0x321
[ 4922.033037]  [<ffffffff804b69c6>] ? ip_rcv_finish+0x31e/0x338
[ 4922.033039]  [<ffffffff804b6c15>] ? ip_rcv+0x235/0x27a
[ 4922.033042]  [<ffffffff804957ee>] ? netif_receive_skb+0x4b6/0x4f3
[ 4922.033045]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4922.033051]  [<ffffffffa0228a24>] ? br_handle_frame_finish+0x110/0x148 [bridge]
[ 4922.033057]  [<ffffffffa022cdf9>] ? br_nf_pre_routing_finish+0x300/0x30f [bridge]
[ 4922.033060]  [<ffffffff804b01ff>] ? nf_hook_slow+0xf3/0x104
[ 4922.033066]  [<ffffffffa022caf9>] ? br_nf_pre_routing_finish+0x0/0x30f [bridge]
[ 4922.033072]  [<ffffffffa022d51e>] ? br_nf_pre_routing+0x716/0x730 [bridge]
[ 4922.033074]  [<ffffffff804b00d0>] ? nf_iterate+0x41/0x7d
[ 4922.033080]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4922.033083]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
[ 4922.033088]  [<ffffffffa0228914>] ? br_handle_frame_finish+0x0/0x148 [bridge]
[ 4922.033094]  [<ffffffffa0228bf8>] ? br_handle_frame+0x19c/0x1c0 [bridge]
[ 4922.033097]  [<ffffffff804956d2>] ? netif_receive_skb+0x39a/0x4f3
[ 4922.033099]  [<ffffffff8049548d>] ? netif_receive_skb+0x155/0x4f3
[ 4922.033102]  [<ffffffff804958aa>] ? process_backlog+0x7f/0xb4
[ 4922.033105]  [<ffffffff80493b65>] ? net_rx_action+0xa8/0x1d5
[ 4922.033107]  [<ffffffff80493c4f>] ? net_rx_action+0x192/0x1d5
[ 4922.033110]  [<ffffffff8023f041>] ? __do_softirq+0xac/0x173
[ 4922.033112]  [<ffffffff8020c1cc>] ? call_softirq+0x1c/0x28
[ 4922.033114]  <EOI>  [<ffffffff8020d3be>] ? do_softirq+0x3a/0x82
[ 4922.033118]  [<ffffffff80495e89>] ? netif_rx_ni+0x19/0x1d
[ 4922.033121]  [<ffffffffa026d4ae>] ? tun_chr_aio_write+0x347/0x3a6 [tun]
[ 4922.033125]  [<ffffffffa026d167>] ? tun_chr_aio_write+0x0/0x3a6 [tun]
[ 4922.033128]  [<ffffffff802b480d>] ? do_sync_readv_writev+0xc0/0x107
[ 4922.033131]  [<ffffffff8024d8bf>] ? autoremove_wake_function+0x0/0x2e
[ 4922.033141]  [<ffffffffa01eea62>] ? kvm_vcpu_ioctl+0x5eb/0x5fe [kvm]
[ 4922.033143]  [<ffffffff802b4699>] ? rw_copy_check_uvector+0x6d/0xe4
[ 4922.033146]  [<ffffffff802b4f22>] ? do_readv_writev+0xb2/0x18b
[ 4922.033148]  [<ffffffff802b5a80>] ? fget_light+0x46/0xd9
[ 4922.033151]  [<ffffffff802b5121>] ? sys_writev+0x45/0x93
[ 4922.033154]  [<ffffffff8020afc2>] ? system_call_fastpath+0x16/0x1b

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 20:55                                           ` Stephen Hemminger
@ 2009-04-15 21:07                                             ` Eric Dumazet
  2009-04-15 21:55                                               ` Jan Engelhardt
                                                                 ` (2 more replies)
  0 siblings, 3 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-15 21:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Patrick McHardy, Jeff Chua, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Stephen Hemminger wrote:
> Looks like there is some recursive path into ip_tables that makes the
> per-cpu spinlock break.  I get lockups with KVM networking.
> 
> Suggestions?

Well, it seems the original patch was not so bad after all:

http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html

So change the per-cpu spinlocks to per-cpu rwlocks,

and use read_lock() in ipt_do_table() to allow recursion...
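
A minimal sketch of that idea, with illustrative names only (example_lock,
example_pkt_path, example_sum_all are not from any posted patch): one rwlock
per cpu, read_lock() on the local cpu in the packet path so it can nest, and
write_lock_bh() on every cpu's lock in the slow path.

#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(rwlock_t, example_lock);

static void example_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		rwlock_init(&per_cpu(example_lock, cpu));
}

/* Packet fast path: take only the local CPU's lock, for reading.
 * Callers already run with bottom halves disabled, and read_lock()
 * may nest on the same CPU, so a recursive entry (e.g. REJECT emitting
 * an ICMP error that re-enters the table) does not deadlock. */
static void example_pkt_path(void)
{
	read_lock(&__get_cpu_var(example_lock));
	/* ... walk this CPU's copy of the rules, bump its counters ... */
	read_unlock(&__get_cpu_var(example_lock));
}

/* Slow path (counter read, rule replace): take each CPU's lock for
 * writing, one at a time, to get a stable view of that CPU's data. */
static void example_sum_all(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		write_lock_bh(&per_cpu(example_lock, cpu));
		/* ... snapshot or fold this CPU's counters ... */
		write_unlock_bh(&per_cpu(example_lock, cpu));
	}
}

The trade-off: readers on different CPUs never contend with each other, while
the rare writer pays the cost of taking all N locks.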



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 21:07                                             ` Eric Dumazet
@ 2009-04-15 21:55                                               ` Jan Engelhardt
  2009-04-16 12:12                                                 ` Patrick McHardy
  2009-04-15 21:57                                               ` [PATCH] netfilter: use per-cpu rwlock rather than RCU (v4) Stephen Hemminger
  2009-04-15 23:48                                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) David Miller
  2 siblings, 1 reply; 254+ messages in thread
From: Jan Engelhardt @ 2009-04-15 21:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Patrick McHardy, Jeff Chua, paulmck,
	David Miller, paulus, mingo, torvalds, laijs, r000n,
	linux-kernel, netfilter-devel, netdev, benh


On Wednesday 2009-04-15 23:07, Eric Dumazet wrote:
>Stephen Hemminger wrote:
>> Looks like there is some recursive path into ip_tables that makes the
>> per-cpu spinlock break.  I get lockups with KVM networking.
>> 
>> Suggestions?
>
>Well, it seems the original patch was not so bad after all:
>
>http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
>
>So change the per-cpu spinlocks to per-cpu rwlocks,
>
>and use read_lock() in ipt_do_table() to allow recursion...
>
iptables cannot quite recurse into itself due to the comefrom stuff.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu rwlock rather than RCU (v4)
  2009-04-15 21:07                                             ` Eric Dumazet
  2009-04-15 21:55                                               ` Jan Engelhardt
@ 2009-04-15 21:57                                               ` Stephen Hemminger
  2009-04-15 23:48                                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) David Miller
  2 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-15 21:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Jeff Chua, paulmck, David Miller, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Yet another alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive global rwlock used in earlier
kernels.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu: the fast path locks on the current cpu and
updates counters, while the slow case acquires the locks on all cpus.
This version uses RCU for the table->private reference but a per-cpu
rwlock for the counters.

The mutex that was added to xt_table for 2.6.30 is unnecessary, since
xt[af].mutex is already held.
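
As a condensed, illustrative sketch of the fast path just described, as it
would sit inside net/ipv4/netfilter/ip_tables.c (the function name and the
omission of the usual hook/skb parameters and rule traversal are mine; the
diff below is the authoritative version):

static DEFINE_PER_CPU(rwlock_t, ip_tables_lock);

static unsigned int do_table_sketch(struct xt_table *table)
{
	struct xt_table_info *private;
	void *table_base;

	rcu_read_lock_bh();
	/* RCU still protects the pointer to the rule blob ... */
	private = rcu_dereference(table->private);

	/* ... while the local CPU's rwlock covers that CPU's counters. */
	read_lock(&__get_cpu_var(ip_tables_lock));
	table_base = private->entries[smp_processor_id()];
	/* ... get_entry(table_base, ...), walk the rules, ADD_COUNTER()
	 * on each match, exactly as in the real ipt_do_table() ... */
	read_unlock(&__get_cpu_var(ip_tables_lock));

	rcu_read_unlock_bh();
	return NF_ACCEPT;
}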

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |    5 -
 net/ipv4/netfilter/arp_tables.c    |  122 +++++++++++--------------------------
 net/ipv4/netfilter/ip_tables.c     |  122 ++++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    |  118 ++++++++++-------------------------
 net/netfilter/x_tables.c           |   26 -------
 5 files changed, 112 insertions(+), 281 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-15 08:44:01.449318844 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-15 14:55:27.387131493 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-15 08:44:01.441318723 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-15 14:34:42.688131823 -0700
@@ -297,6 +297,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(rwlock_t, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -341,7 +343,9 @@ ipt_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	read_lock(&__get_cpu_var(ip_tables_lock));
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,9 +440,10 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
+	read_unlock(&__get_cpu_var(ip_tables_lock));
 	rcu_read_unlock_bh();
 
+
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
 #else
@@ -902,75 +907,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	write_lock_bh(&per_cpu(ip_tables_lock, curcpu));
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	write_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		write_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		write_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
 }
 
@@ -979,7 +934,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +942,11 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1312,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1386,7 +1333,7 @@ do_add_counters(struct net *net, void __
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1437,25 +1384,25 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	write_lock_bh(&per_cpu(ip_tables_lock, cpu));
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	write_unlock_bh(&per_cpu(ip_tables_lock, cpu));
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2219,10 @@ static struct pernet_operations ip_table
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		rwlock_init(&per_cpu(ip_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
--- a/net/netfilter/x_tables.c	2009-04-15 08:44:01.424319035 -0700
+++ b/net/netfilter/x_tables.c	2009-04-15 14:41:36.060944273 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -682,26 +668,21 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
 	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	newinfo->initial_entries = private->initial_entries;
 
-	synchronize_net();
-	return oldinfo;
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +715,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-15 08:44:01.430318746 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-15 14:43:56.471981079 -0700
@@ -329,6 +329,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(rwlock_t, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -367,7 +369,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	read_lock(&__get_cpu_var(ip6_tables_lock));
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,6 +470,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
+	read_unlock(&__get_cpu_var(ip6_tables_lock));
 	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -931,73 +936,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	write_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	write_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		write_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		write_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
 }
 
@@ -1006,7 +963,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +971,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1342,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1415,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	write_lock(&__get_cpu_var(ip6_tables_lock));
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	write_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2249,10 @@ static struct pernet_operations ip6_tabl
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		rwlock_init(&per_cpu(ip6_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-15 08:44:01.435318846 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-15 14:36:16.762944338 -0700
@@ -231,6 +231,8 @@ static inline struct arpt_entry *get_ent
 	return (struct arpt_entry *)(base + offset);
 }
 
+static DEFINE_PER_CPU(rwlock_t, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -255,7 +257,9 @@ unsigned int arpt_do_table(struct sk_buf
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	read_lock(&__get_cpu_var(arp_tables_lock));
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +277,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,7 +333,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
+	read_unlock(&__get_cpu_var(arp_tables_lock));
 	rcu_read_unlock_bh();
 
 	if (hotdrop)
@@ -716,74 +721,25 @@ static void get_counters(const struct xt
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	write_lock_bh(&per_cpu(arp_tables_lock, curcpu));
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	write_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		write_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		write_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
 }
 
@@ -792,7 +748,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +757,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1101,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1173,7 +1122,7 @@ static int do_add_counters(struct net *n
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1224,25 +1173,25 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	write_lock_bh(&per_cpu(arp_tables_lock, cpu));
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	write_unlock_bh(&per_cpu(arp_tables_lock, cpu));
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);
@@ -1923,7 +1872,10 @@ static struct pernet_operations arp_tabl
 
 static int __init arp_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		rwlock_init(&per_cpu(arp_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&arp_tables_net_ops);
 	if (ret < 0)

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 21:07                                             ` Eric Dumazet
  2009-04-15 21:55                                               ` Jan Engelhardt
  2009-04-15 21:57                                               ` [PATCH] netfilter: use per-cpu rwlock rather than RCU (v4) Stephen Hemminger
@ 2009-04-15 23:48                                               ` David Miller
  2009-04-16  0:01                                                 ` Stephen Hemminger
                                                                   ` (2 more replies)
  2 siblings, 3 replies; 254+ messages in thread
From: David Miller @ 2009-04-15 23:48 UTC (permalink / raw)
  To: dada1
  Cc: shemminger, kaber, jeff.chua.linux, paulmck, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 15 Apr 2009 23:07:29 +0200

> Well, it seems original patch was not so bad after all
> 
> http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
> 
> So change per-cpu spinlocks to per-cpu rwlocks 
> 
> and use read_lock() in ipt_do_table() to allow recursion...

Grumble, one more barrier to getting rid of rwlocks in the whole
tree. :-/

I really think we should entertain the idea where we don't RCU quiesce
when adding rules.  That was dismissed as not workable because the new
rule must be "visible" as soon as we return to userspace but let's get
real, effectively it will be.

If there are any stale object reference issues, we can use RCU object
destruction to handle that kind of thing.

I almost cringed when the per-spinlock idea was proposed, but per-cpu
rwlocks just takes things too far for my tastes.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 23:48                                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) David Miller
@ 2009-04-16  0:01                                                 ` Stephen Hemminger
  2009-04-16  0:05                                                   ` David Miller
  2009-04-16  0:10                                                   ` Linus Torvalds
  2009-04-16  0:02                                                 ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Linus Torvalds
  2009-04-16  6:26                                                 ` Eric Dumazet
  2 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-16  0:01 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, kaber, jeff.chua.linux, paulmck, paulus, mingo, torvalds,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh

On Wed, 15 Apr 2009 16:48:11 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 15 Apr 2009 23:07:29 +0200
> 
> > Well, it seems original patch was not so bad after all
> > 
> > http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
> > 
> > So change per-cpu spinlocks to per-cpu rwlocks 
> > 
> > and use read_lock() in ipt_do_table() to allow recursion...
> 
> Grumble, one more barrier to getting rid of rwlocks in the whole
> tree. :-/

Hey, we are reinventing your brwlock ;-<

The other option is to use a spinlock over a smaller area (only the counters),
and another mechanism to synchronize on replace.

> I really think we should entertain the idea where we don't RCU quiesce
> when adding rules.  That was dismissed as not workable because the new
> rule must be "visible" as soon as we return to userspace but let's get
> real, effectively it will be.

The counters are the bigger problem; otherwise we could just free the table
info via rcu.  Do we really have to support a replace where the counter
values coming out to user space are always exactly accurate, or is it
allowed to replace a rule and maybe lose some counter ticks (worst case
NCPU-1)?

> If there are any stale object reference issues, we can use RCU object
> destruction to handle that kind of thing.

The problem is pulling the counter values out of the object in the
replace case. It could be changed to use some form of counting-semaphore-like
mechanism, but that gets expensive.
 
> I almost cringed when the per-spinlock idea was proposed, but per-cpu
> rwlocks just takes things too far for my tastes.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 23:48                                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) David Miller
  2009-04-16  0:01                                                 ` Stephen Hemminger
@ 2009-04-16  0:02                                                 ` Linus Torvalds
  2009-04-16  6:26                                                 ` Eric Dumazet
  2 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-16  0:02 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, shemminger, kaber, jeff.chua.linux, paulmck, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh



On Wed, 15 Apr 2009, David Miller wrote:
> 
> I really think we should entertain the idea where we don't RCU quiesce
> when adding rules.  That was dismissed as not workable because the new
> rule must be "visible" as soon as we return to userspace but let's get
> real, effectively it will be.

I never understood that dismissal.

The new rule _will_ be visible as we return to user space. It's just that 
old packets may still be in flight in other queues.

But that is true even _without_ the "synchronize_net()". The old packets 
just had to make it slightly further in the queueing - but as far as user 
space is concerned, there is absolutely _zero_ difference between the two. 
In both cases it may see packets queued with the old rules. 

> I almost cringed when the per-spinlock idea was proposed, but per-cpu
> rwlocks just takes things too far for my tastes.

I really personally would prefer the RCU approach too. I don't think 
rwlocks are any more cringe-worthy than spinlocks, although it is true 
that they tend to be slightly more expensive.

The pure RCU "just get rid of the unnecessary 'synchronize_net()'" approach 
seems to be clearly superior to either.

		Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16  0:01                                                 ` Stephen Hemminger
@ 2009-04-16  0:05                                                   ` David Miller
  2009-04-16 12:28                                                     ` Patrick McHardy
  2009-04-16  0:10                                                   ` Linus Torvalds
  1 sibling, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-16  0:05 UTC (permalink / raw)
  To: shemminger
  Cc: dada1, kaber, jeff.chua.linux, paulmck, paulus, mingo, torvalds,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Wed, 15 Apr 2009 17:01:11 -0700

> The counters are the bigger problem; otherwise we could just free the table
> info via rcu.  Do we really have to support a replace where the counter
> values coming out to user space are always exactly accurate, or is it
> allowed to replace a rule and maybe lose some counter ticks (worst case
> NCPU-1)?

I say this case doesn't matter until someone can prove that it's
any different from the IPTABLES replace operation system call
executing a few microseconds earlier or later.

There really is no difference, and we're making complexity out of
nothing just to ensure something which isn't actually guaranteed right
now.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16  0:01                                                 ` Stephen Hemminger
  2009-04-16  0:05                                                   ` David Miller
@ 2009-04-16  0:10                                                   ` Linus Torvalds
  2009-04-16  0:45                                                     ` [PATCH] netfilter: use per-cpu spinlock and RCU (v5) Stephen Hemminger
  2009-04-16 13:11                                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Patrick McHardy
  1 sibling, 2 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-16  0:10 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, dada1, kaber, jeff.chua.linux, paulmck, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh



On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> 
> The counters are the bigger problem; otherwise we could just free the table
> info via rcu.  Do we really have to support a replace where the counter
> values coming out to user space are always exactly accurate, or is it
> allowed to replace a rule and maybe lose some counter ticks (worst case
> NCPU-1)?

Why not just read the counters from the old one at RCU free time (they are 
guaranteed to be stable at that point, since we're all done with those 
entries), and apply them at that point to the current setup?

			Linus
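
A rough sketch of that idea, with made-up names ('struct ruleset', its
helpers, and the single aggregate hit counter are illustrative; only the
RCU and per-cpu primitives are real kernel APIs): the replace path
publishes the new set with rcu_assign_pointer() and retires the old one
with call_rcu(); once the grace period ends the old per-cpu counters can
no longer change, so the callback sums them and folds the total into the
successor.

/* Rough sketch only; 'struct ruleset' and its helpers are made-up names,
 * and per-rule counters are collapsed into one aggregate hit counter to
 * keep the example short. */
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct ruleset {
        struct rcu_head rcu;
        struct ruleset *successor;      /* set just before retirement */
        unsigned long *hits;            /* from alloc_percpu(unsigned long) */
        spinlock_t fold_lock;           /* protects 'folded'; the callback runs in
                                         * softirq, so process-context users of this
                                         * lock need spin_lock_bh() */
        unsigned long long folded;      /* counts inherited from retired sets */
};

static void ruleset_fold_rcu(struct rcu_head *head)
{
        struct ruleset *old = container_of(head, struct ruleset, rcu);
        unsigned long long sum = 0;
        int cpu;

        /* The grace period has ended: no CPU can still be bumping
         * old->hits, so the per-cpu values are stable. */
        for_each_possible_cpu(cpu)
                sum += *per_cpu_ptr(old->hits, cpu);

        spin_lock(&old->successor->fold_lock);
        old->successor->folded += sum;
        spin_unlock(&old->successor->fold_lock);

        free_percpu(old->hits);
        kfree(old);
}

/* Replace path: publish 'repl' with rcu_assign_pointer(), then retire 'old'. */
static void ruleset_retire(struct ruleset *old, struct ruleset *repl)
{
        old->successor = repl;
        call_rcu(&old->rcu, ruleset_fold_rcu);
}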

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu spinlock and RCU (v5)
  2009-04-16  0:10                                                   ` Linus Torvalds
@ 2009-04-16  0:45                                                     ` Stephen Hemminger
  2009-04-16  5:01                                                       ` Eric Dumazet
  2009-04-16 13:11                                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Patrick McHardy
  1 sibling, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-16  0:45 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, dada1, kaber, jeff.chua.linux, paulmck, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

This is an alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive rwlock in earlier versions.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu: the fast path locks on the current cpu
and updates counters.  The slow case involves acquiring the locks on
all CPUs. This version uses RCU for the table->base reference
but a per-cpu lock for the counters.

The mutex that was added to xt_table for 2.6.30 is unnecessary since
xt[af].mutex is already held.

This version does not do coarse locking or synchronize_net() during
the __do_replace function, so there is a small race which allows for
some of the old counter values to be incorrect (Ncpu - 1 in the worst case). The
scenario would be replacing a rule set while the same rules are in flight on another
CPU.  The other CPU might still be looking at the old rules (and
updating those counters) after the counter values have been captured.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
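
A minimal sketch of the locking pattern described above, separate from the
patch itself (the lock, counter and function names are hypothetical): the
packet fast path takes only its own CPU's lock, while the user-triggered
slow path walks every CPU's lock with bottom halves disabled.

/* Illustrative sketch only; pkt_count_lock, pkt_count and the helpers
 * are hypothetical names, not anything from the patch. */
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

static DEFINE_PER_CPU(spinlock_t, pkt_count_lock);
static DEFINE_PER_CPU(unsigned long, pkt_count);

/* Fast path: runs in the packet path (softirq, bottom halves already
 * disabled), so it only ever touches this CPU's lock and counter. */
static void fast_path_hit(void)
{
        spin_lock(&__get_cpu_var(pkt_count_lock));
        __get_cpu_var(pkt_count)++;
        spin_unlock(&__get_cpu_var(pkt_count_lock));
}

/* Slow path: a user-triggered read takes each CPU's lock in turn; the
 * _bh variant keeps a local softirq from deadlocking on its own lock. */
static unsigned long collect_counts(void)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                spin_lock_bh(&per_cpu(pkt_count_lock, cpu));
                sum += per_cpu(pkt_count, cpu);
                spin_unlock_bh(&per_cpu(pkt_count_lock, cpu));
        }
        return sum;
}

static int __init pkt_count_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                spin_lock_init(&per_cpu(pkt_count_lock, cpu));
        return 0;
}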

---
 include/linux/netfilter/x_tables.h |   11 +--
 net/ipv4/netfilter/arp_tables.c    |  121 +++++++++++--------------------------
 net/ipv4/netfilter/ip_tables.c     |  121 ++++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    |  118 +++++++++++-------------------------
 net/netfilter/x_tables.c           |   45 +++++++------
 5 files changed, 137 insertions(+), 279 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-15 08:44:01.449318844 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-15 17:08:35.303217128 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -385,6 +382,12 @@ struct xt_table_info
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+	/* Slow death march */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
+
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
 	void *entries[1];
@@ -434,8 +437,6 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-15 08:44:01.441318723 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-15 17:09:49.600404319 -0700
@@ -297,6 +297,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -341,7 +343,7 @@ ipt_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -358,7 +360,9 @@ ipt_do_table(struct sk_buff *skb,
 			if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
 				goto no_match;
 
+			spin_lock(&__get_cpu_var(ip_tables_lock));
 			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+			spin_unlock(&__get_cpu_var(ip_tables_lock));
 
 			t = ipt_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
@@ -436,9 +440,9 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
 	rcu_read_unlock_bh();
 
+
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
 #else
@@ -902,75 +906,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
 	}
 }
 
@@ -979,7 +933,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +941,11 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1311,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1386,7 +1332,7 @@ do_add_counters(struct net *net, void __
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1437,25 +1383,25 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2218,10 @@ static struct pernet_operations ip_table
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
--- a/net/netfilter/x_tables.c	2009-04-15 08:44:01.424319035 -0700
+++ b/net/netfilter/x_tables.c	2009-04-15 17:10:24.967344496 -0700
@@ -66,6 +66,8 @@ static const char *const xt_prefix[NFPRO
 	[NFPROTO_IPV6]   = "ip6",
 };
 
+static void __xt_free_table_info(struct xt_table_info *);
+
 /* Registration hooks for targets. */
 int
 xt_register_target(struct xt_target *target)
@@ -602,7 +604,7 @@ struct xt_table_info *xt_alloc_table_inf
 							cpu_to_node(cpu));
 
 		if (newinfo->entries[cpu] == NULL) {
-			xt_free_table_info(newinfo);
+			__xt_free_table_info(newinfo);
 			return NULL;
 		}
 	}
@@ -611,7 +613,7 @@ struct xt_table_info *xt_alloc_table_inf
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
 
-void xt_free_table_info(struct xt_table_info *info)
+static void __xt_free_table_info(struct xt_table_info *info)
 {
 	int cpu;
 
@@ -623,21 +625,28 @@ void xt_free_table_info(struct xt_table_
 	}
 	kfree(info);
 }
-EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
+static void __xt_free_table_info_wq(struct work_struct *arg)
 {
-	unsigned int cpu;
+	struct xt_table_info *info
+		= container_of(arg, struct xt_table_info, work);
+	__xt_free_table_info(info);
+}
 
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
+static void __xt_free_table_info_rcu(struct rcu_head *arg)
+{
+	struct xt_table_info *info
+		= container_of(arg, struct xt_table_info, rcu);
 
+	INIT_WORK(&info->work, __xt_free_table_info_wq);
+	schedule_work(&info->work);
 }
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+	call_rcu(&info->rcu, __xt_free_table_info_rcu);
+}
+EXPORT_SYMBOL(xt_free_table_info);
 
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
@@ -682,26 +691,21 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
 	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	newinfo->initial_entries = private->initial_entries;
 
-	synchronize_net();
-	return oldinfo;
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +738,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-15 08:44:01.430318746 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-15 17:11:37.663345565 -0700
@@ -329,6 +329,8 @@ static void trace_packet(struct sk_buff 
 }
 #endif
 
+static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -367,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -384,9 +386,12 @@ ip6t_do_table(struct sk_buff *skb,
 			if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
 				goto no_match;
 
+
+			spin_lock(&__get_cpu_var(ip6_tables_lock));
 			ADD_COUNTER(e->counters,
 				    ntohs(ipv6_hdr(skb)->payload_len) +
 				    sizeof(struct ipv6hdr), 1);
+			spin_unlock(&__get_cpu_var(ip6_tables_lock));
 
 			t = ip6t_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
@@ -931,73 +936,25 @@ get_counters(const struct xt_table_info 
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu));
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu));
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu));
 	}
 }
 
@@ -1006,7 +963,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +971,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1342,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1415,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	spin_lock(&__get_cpu_var(ip6_tables_lock));
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock));
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2249,10 @@ static struct pernet_operations ip6_tabl
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip6_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-15 08:44:01.435318846 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-15 17:13:01.909334287 -0700
@@ -231,6 +231,8 @@ static inline struct arpt_entry *get_ent
 	return (struct arpt_entry *)(base + offset);
 }
 
+static DEFINE_PER_CPU(spinlock_t, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -255,7 +257,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,7 +275,10 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
+			spin_lock(&__get_cpu_var(arp_tables_lock));
 			ADD_COUNTER(e->counters, hdr_len, 1);
+			spin_unlock(&__get_cpu_var(arp_tables_lock));
 
 			t = arpt_get_target(e);
 
@@ -328,7 +333,6 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
 	rcu_read_unlock_bh();
 
 	if (hotdrop)
@@ -716,74 +720,25 @@ static void get_counters(const struct xt
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu));
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu));
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
 	}
 }
 
@@ -792,7 +747,6 @@ static struct xt_counters *alloc_counter
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +756,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1100,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1173,7 +1121,7 @@ static int do_add_counters(struct net *n
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1224,25 +1172,25 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	spin_lock_bh(&per_cpu(arp_tables_lock, cpu));
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	spin_unlock_bh(&per_cpu(arp_tables_lock, cpu));
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);
@@ -1923,7 +1871,10 @@ static struct pernet_operations arp_tabl
 
 static int __init arp_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(arp_tables_lock, cpu));
 
 	ret = register_pernet_subsys(&arp_tables_net_ops);
 	if (ret < 0)
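
One detail worth noting in the x_tables.c hunk above: xt_free_table_info()
now defers the free with call_rcu(), and the RCU callback in turn punts to
a workqueue. The usual reason for that extra hop is that RCU callbacks run
in softirq context, where vfree() of the vmalloc'ed per-cpu entry blobs is
not allowed. A stripped-down, hypothetical sketch of the chain ('struct
blob' and its helpers are made-up names):

/* Hypothetical sketch of the call_rcu() -> workqueue -> vfree() chain;
 * 'struct blob' and its helpers are made-up names. */
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

struct blob {
        void *data;                     /* vmalloc()ed payload */
        union {                         /* reused in two phases, like xt_table_info */
                struct rcu_head rcu;    /* while waiting for the grace period */
                struct work_struct work;/* once it is safe to actually free */
        };
};

static void blob_free_work(struct work_struct *w)
{
        struct blob *b = container_of(w, struct blob, work);

        vfree(b->data);                 /* process context: vfree() is fine here */
        kfree(b);
}

static void blob_free_rcu(struct rcu_head *head)
{
        struct blob *b = container_of(head, struct blob, rcu);

        /* RCU callbacks run in softirq context, where vfree() must not
         * be called, so hand the real work off to a workqueue. */
        INIT_WORK(&b->work, blob_free_work);
        schedule_work(&b->work);
}

static void blob_free(struct blob *b)
{
        call_rcu(&b->rcu, blob_free_rcu);       /* free once all readers are done */
}

Eric's OOM concern further down the thread is about exactly this deferral:
with rules loaded one by one, every replace queues another large
vmalloc'ed table until the grace periods and the workqueue catch up.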

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock and RCU (v5)
  2009-04-16  0:45                                                     ` [PATCH] netfilter: use per-cpu spinlock and RCU (v5) Stephen Hemminger
@ 2009-04-16  5:01                                                       ` Eric Dumazet
  2009-04-16 13:53                                                           ` Patrick McHardy
  0 siblings, 1 reply; 254+ messages in thread
From: Eric Dumazet @ 2009-04-16  5:01 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Linus Torvalds, David Miller, kaber, jeff.chua.linux, paulmck,
	paulus, mingo, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Stephen Hemminger wrote:
> This is an alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu: the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all CPUs. This version uses RCU for the table->base reference
> but a per-cpu lock for the counters.
> 
> The mutex that was added to xt_table for 2.6.30 is unnecessary since
> xt[af].mutex is already held.
> 
> This version does not do coarse locking or synchronize_net() during
> the __do_replace function, so there is a small race which allows for
> some of the old counter values to be incorrect (Ncpu - 1 in the worst case). The
> scenario would be replacing a rule set while the same rules are in flight on another
> CPU.  The other CPU might still be looking at the old rules (and
> updating those counters) after the counter values have been captured.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

This version is a regression over 2.6.2[0-9], because of two points

1) Many more atomic ops:

Because of the additional

> +			spin_lock(&__get_cpu_var(ip_tables_lock));
>  			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
> +			spin_unlock(&__get_cpu_var(ip_tables_lock));

added on each counter update.

On many setups, each packet coming in or out of the machine has
to update between 2 and 20 rule counters. So to avoid *one* atomic op
in read_unlock(), this v4 version adds 2 to 20 atomic ops...


I still do not see the problem in going from the previous version (2.6.2[0-8]), which had a central
rwlock that hurt performance on SMP because of cache line ping-pong, to the solution of
having one rwlock per cpu.

We wanted to reduce the cache line ping-pong first. This *is* the point that hurts,
by an order of magnitude.

We tried a full RCU solution; it took us three years and we failed.
Let's take an easy solution, before the whole replacement of x_tables by Patrick's new
infrastructure.

Then, if it appears the rwlock itself and its two atomic ops are *really* a problem,
we can go further, but I doubt modern cpus really care about atomic ops on an integer
already hot in L1 cache.

2) Second problem: potential OOM

Freeing old rules with call_rcu() and/or schedule_work() is going
to OOM pretty fast on small appliances with basic firewall setups that load
rules one by one, as the original topic reporter does.

We had reports from people running Linux with 4MB of available RAM (French provider free.fr on
their appliance box), and we had to use the SLAB_DESTROY_BY_RCU trick on conntrack
to avoid OOM for their setups. We don't want to use call_rcu() and queue 100 or 200 vfree() calls.


So I prefer your v3 version, even though I haven't tested it yet.

Thank you

> 
> ---
>  include/linux/netfilter/x_tables.h |   11 +--
>  net/ipv4/netfilter/arp_tables.c    |  121 +++++++++++--------------------------
>  net/ipv4/netfilter/ip_tables.c     |  121 ++++++++++---------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  118 +++++++++++-------------------------
>  net/netfilter/x_tables.c           |   45 +++++++------
>  5 files changed, 137 insertions(+), 279 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-04-15 08:44:01.449318844 -0700
> +++ b/include/linux/netfilter/x_tables.h	2009-04-15 17:08:35.303217128 -0700
> @@ -354,9 +354,6 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
>  
> -	/* Lock for the curtain */
> -	struct mutex lock;
> -
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
>  
> @@ -385,6 +382,12 @@ struct xt_table_info
>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>  	unsigned int underflow[NF_INET_NUMHOOKS];
>  
> +	/* Slow death march */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>  	void *entries[1];
> @@ -434,8 +437,6 @@ extern void xt_proto_fini(struct net *ne
>  
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> -				    struct xt_table_info *new);
>  
>  /*
>   * This helper is performance critical and must be inlined
> --- a/net/ipv4/netfilter/ip_tables.c	2009-04-15 08:44:01.441318723 -0700
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-04-15 17:09:49.600404319 -0700
> @@ -297,6 +297,8 @@ static void trace_packet(struct sk_buff 
>  }
>  #endif
>  
> +static DEFINE_PER_CPU(spinlock_t, ip_tables_lock);
> +
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
>  ipt_do_table(struct sk_buff *skb,
> @@ -341,7 +343,7 @@ ipt_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -358,7 +360,9 @@ ipt_do_table(struct sk_buff *skb,
>  			if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
>  				goto no_match;
>  
> +			spin_lock(&__get_cpu_var(ip_tables_lock));
>  			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
> +			spin_unlock(&__get_cpu_var(ip_tables_lock));
>  
>  			t = ipt_get_target(e);
>  			IP_NF_ASSERT(t->u.kernel.target);
> @@ -436,9 +440,9 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
>  	rcu_read_unlock_bh();
>  
> +
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
>  #else
> @@ -902,75 +906,25 @@ get_counters(const struct xt_table_info 
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu));
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
>  			  t->size,
>  			  set_entry_to_counter,
>  			  counters,
>  			  &i);
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu));
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -	}
> -
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -
> -static inline int
> -zero_entry_counter(struct ipt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
>  	}
>  }
>  
> @@ -979,7 +933,6 @@ static struct xt_counters * alloc_counte
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -988,30 +941,11 @@ static struct xt_counters * alloc_counte
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> +		return ERR_PTR(-ENOMEM);
>  
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1377,6 +1311,18 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
>  
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1386,7 +1332,7 @@ do_add_counters(struct net *net, void __
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
>  	const char *name;
> -	int size;
> +	int cpu, size;
>  	void *ptmp;
>  	struct xt_table *t;
>  	const struct xt_table_info *private;
> @@ -1437,25 +1383,25 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	cpu = raw_smp_processor_id();
> +	spin_lock_bh(&per_cpu(ip_tables_lock, cpu));
> +	loc_cpu_entry = private->entries[cpu];
> +	i = 0;
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, cpu));
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> @@ -2272,7 +2218,10 @@ static struct pernet_operations ip_table
>  
>  static int __init ip_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(ip_tables_lock, cpu));
>  
>  	ret = register_pernet_subsys(&ip_tables_net_ops);
>  	if (ret < 0)
> --- a/net/netfilter/x_tables.c	2009-04-15 08:44:01.424319035 -0700
> +++ b/net/netfilter/x_tables.c	2009-04-15 17:10:24.967344496 -0700
> @@ -66,6 +66,8 @@ static const char *const xt_prefix[NFPRO
>  	[NFPROTO_IPV6]   = "ip6",
>  };
>  
> +static void __xt_free_table_info(struct xt_table_info *);
> +
>  /* Registration hooks for targets. */
>  int
>  xt_register_target(struct xt_target *target)
> @@ -602,7 +604,7 @@ struct xt_table_info *xt_alloc_table_inf
>  							cpu_to_node(cpu));
>  
>  		if (newinfo->entries[cpu] == NULL) {
> -			xt_free_table_info(newinfo);
> +			__xt_free_table_info(newinfo);
>  			return NULL;
>  		}
>  	}
> @@ -611,7 +613,7 @@ struct xt_table_info *xt_alloc_table_inf
>  }
>  EXPORT_SYMBOL(xt_alloc_table_info);
>  
> -void xt_free_table_info(struct xt_table_info *info)
> +static void __xt_free_table_info(struct xt_table_info *info)
>  {
>  	int cpu;
>  
> @@ -623,21 +625,28 @@ void xt_free_table_info(struct xt_table_
>  	}
>  	kfree(info);
>  }
> -EXPORT_SYMBOL(xt_free_table_info);
>  
> -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> -			     struct xt_table_info *newinfo)
> +static void __xt_free_table_info_wq(struct work_struct *arg)
>  {
> -	unsigned int cpu;
> +	struct xt_table_info *info
> +		= container_of(arg, struct xt_table_info, work);
> +	__xt_free_table_info(info);
> +}
>  
> -	for_each_possible_cpu(cpu) {
> -		void *p = oldinfo->entries[cpu];
> -		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> -		newinfo->entries[cpu] = p;
> -	}
> +static void __xt_free_table_info_rcu(struct rcu_head *arg)
> +{
> +	struct xt_table_info *info
> +		= container_of(arg, struct xt_table_info, rcu);
>  
> +	INIT_WORK(&info->work, __xt_free_table_info_wq);
> +	schedule_work(&info->work);
>  }
> -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> +
> +void xt_free_table_info(struct xt_table_info *info)
> +{
> +	call_rcu(&info->rcu, __xt_free_table_info_rcu);
> +}
> +EXPORT_SYMBOL(xt_free_table_info);
>  
>  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
>  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
> @@ -682,26 +691,21 @@ xt_replace_table(struct xt_table *table,
>  	      struct xt_table_info *newinfo,
>  	      int *error)
>  {
> -	struct xt_table_info *oldinfo, *private;
> +	struct xt_table_info *private;
>  
>  	/* Do the substitution. */
> -	mutex_lock(&table->lock);
>  	private = table->private;
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> -		mutex_unlock(&table->lock);
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
> -	oldinfo = private;
>  	rcu_assign_pointer(table->private, newinfo);
> -	newinfo->initial_entries = oldinfo->initial_entries;
> -	mutex_unlock(&table->lock);
> +	newinfo->initial_entries = private->initial_entries;
>  
> -	synchronize_net();
> -	return oldinfo;
> +	return private;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
>  
> @@ -734,7 +738,6 @@ struct xt_table *xt_register_table(struc
>  
>  	/* Simplifies replace_table code. */
>  	table->private = bootstrap;
> -	mutex_init(&table->lock);
>  
>  	if (!xt_replace_table(table, 0, newinfo, &ret))
>  		goto unlock;



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 23:48                                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) David Miller
  2009-04-16  0:01                                                 ` Stephen Hemminger
  2009-04-16  0:02                                                 ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Linus Torvalds
@ 2009-04-16  6:26                                                 ` Eric Dumazet
  2009-04-16 14:33                                                   ` Paul E. McKenney
  2 siblings, 1 reply; 254+ messages in thread
From: Eric Dumazet @ 2009-04-16  6:26 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, kaber, jeff.chua.linux, paulmck, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Wed, 15 Apr 2009 23:07:29 +0200
> 
>> Well, it seems original patch was not so bad after all
>>
>> http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
>>
>> So change per-cpu spinlocks to per-cpu rwlocks 
>>
>> and use read_lock() in ipt_do_table() to allow recursion...
> 
> Grumble, one more barrier to getting rid of rwlocks in the whole
> tree. :-/
> 
> I really think we should entertain the idea where we don't RCU quiesce
> when adding rules.  That was dismissed as not workable because the new
> rule must be "visible" as soon as we return to userspace but let's get
> real, effectively it will be.

We had to RCU quiesce to be sure the old rules were no longer in use before
freeing them. The alternative is to defer freeing via call_rcu(), but that is
subject to OOM.

With 200 basic rules, the size of the rules table is about 40960 bytes per cpu
(88 pages taken in vmalloc virtual space on my 8-cpu machine):
0xfcaf8000-0xfcb03000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb04000-0xfcb0f000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb10000-0xfcb1b000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb1c000-0xfcb27000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb28000-0xfcb33000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb34000-0xfcb3f000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb40000-0xfcb4b000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
0xfcb4c000-0xfcb57000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc

This kind of huge monolithic object is hard to handle with RCU semantics,
which are more suitable for sets of small objects (struct file for example),
even if RCU can have a backlog of 10000 elements in its queue...

> 
> If there are any stale object reference issues, we can use RCU object
> destruction to handle that kind of thing.
> 
> I almost cringed when the per-spinlock idea was proposed, but per-cpu
> rwlocks just takes things too far for my tastes.


In my humble opinion, this is a reasonable compromise, and Stephen's patch
version 4 is OK for me.



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-15 21:55                                               ` Jan Engelhardt
@ 2009-04-16 12:12                                                 ` Patrick McHardy
  2009-04-16 12:24                                                     ` Jan Engelhardt
  0 siblings, 1 reply; 254+ messages in thread
From: Patrick McHardy @ 2009-04-16 12:12 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Eric Dumazet, Stephen Hemminger, Jeff Chua, paulmck,
	David Miller, paulus, mingo, torvalds, laijs, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Jan Engelhardt wrote:
> On Wednesday 2009-04-15 23:07, Eric Dumazet wrote:
>> Stephen Hemminger wrote:
>>> Looks like there is some recursive path into ip_tables that makes the
>>> per-cpu spinlock break.  I get lockup's with KVM networking.
>>>
>>> Suggestions?
>> Well, it seems original patch was not so bad after all
>>
>> http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
>>
>> So change per-cpu spinlocks to per-cpu rwlocks 
>>
>> and use read_lock() in ipt_do_table() to allow recursion...
>>
> iptables cannot quite recurse into itself due to the comefrom stuff.

Actually it can by using the REJECT target:

> [ 2106.068550]  [<ffffffff804b0195>] ? nf_hook_slow+0x89/0x104
> [ 2106.068552]  [<ffffffff804b8ed0>] ? dst_output+0x0/0xb
> [ 2106.068555]  [<ffffffff80393925>] ? _raw_spin_unlock+0x8b/0x92
> [ 2106.068557]  [<ffffffff804ba8c7>] ? __ip_local_out+0x98/0x9a
> [ 2106.068559]  [<ffffffff804ba8d2>] ? ip_local_out+0x9/0x1f
> [ 2106.068562]  [<ffffffff804babb4>] ? ip_push_pending_frames+0x2cc/0x33e
> [ 2106.068566]  [<ffffffff804dac79>] ? icmp_send+0x559/0x588
> [ 2106.068569]  [<ffffffff8022d3a0>] ? task_rq_lock+0x46/0x79
> [ 2106.068571]  [<ffffffff8023004f>] ? enqueue_task_fair+0x23b/0x293
> [ 2106.068575]  [<ffffffffa00f5083>] ? reject_tg+0x41/0x30e [ipt_REJECT]
> [ 2106.068578]  [<ffffffffa024084f>] ? ipt_do_table+0x534/0x5f1 [ip_tables]




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16 12:12                                                 ` Patrick McHardy
@ 2009-04-16 12:24                                                     ` Jan Engelhardt
  0 siblings, 0 replies; 254+ messages in thread
From: Jan Engelhardt @ 2009-04-16 12:24 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, Stephen Hemminger, Jeff Chua, paulmck,
	David Miller, paulus, mingo, torvalds, laijs, r000n,
	linux-kernel, netfilter-devel, netdev, benh


On Thursday 2009-04-16 14:12, Patrick McHardy wrote:
> Jan Engelhardt wrote:
>> On Wednesday 2009-04-15 23:07, Eric Dumazet wrote:
>>> Stephen Hemminger a écrit :
>>>> Looks like there is some recursive path into ip_tables that makes the
>>>> per-cpu spinlock break.  I get lockup's with KVM networking.
>>>>
>>>> Suggestions?
>>> Well, it seems original patch was not so bad after all
>>>
>>> http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
>>>
>>> So change per-cpu spinlocks to per-cpu rwlocks 
>>> and use read_lock() in ipt_do_table() to allow recursion...
>>>
>> iptables cannot quite recurse into itself due to the comefrom stuff.
>
> Actually it can by using the REJECT target:

Yes, but it has to return an absolute verdict (which REJECT does),
so it's not really a recursion, it's more like a goto without return.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16  0:05                                                   ` David Miller
@ 2009-04-16 12:28                                                     ` Patrick McHardy
  0 siblings, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-16 12:28 UTC (permalink / raw)
  To: David Miller
  Cc: shemminger, dada1, jeff.chua.linux, paulmck, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

David Miller wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Wed, 15 Apr 2009 17:01:11 -0700
> 
>> The counters are the bigger problem, otherwise we could just free table
>> info via rcu.  Do we really have to support: replace where the counter
>> values coming out to user space are always exactly accurate, or is it
>> allowed to replace a rule and maybe lose some counter ticks (worst case
>> NCPU-1).
> 
> I say this case doesn't matter until someone can prove that it's
> any different from the IPTABLES replace operation system call
> executing a few microseconds earlier or later.
> 
> There really is no difference, and we're making complexity out of
> nothing just to ensure something which isn't actually guaranteed right
> now.

Actually I believe it does work right now. Userspace maps the old
counter values to the replacement rules and the kernel adds them
up, so in the end we currently should always have accurate counters,
independent of the exact time when a replacement took place.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16 12:24                                                     ` Jan Engelhardt
@ 2009-04-16 12:31                                                       ` Patrick McHardy
  -1 siblings, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-16 12:31 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Eric Dumazet, Stephen Hemminger, Jeff Chua, paulmck,
	David Miller, paulus, mingo, torvalds, laijs, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Jan Engelhardt wrote:
> On Thursday 2009-04-16 14:12, Patrick McHardy wrote:
>> Jan Engelhardt wrote:
>>>> So change per-cpu spinlocks to per-cpu rwlocks 
>>>> and use read_lock() in ipt_do_table() to allow recursion...
>>>>
>>> iptables cannot quite recurse into itself due to the comefrom stuff.
>> Actually it can by using the REJECT target:
> 
> Yes, but it has to return an absolute verdict (which REJECT does),
> so it's not really a recursion, it's more like a goto without return.

It's recursion in the sense that we re-enter the same code path
while holding a lock. The verdict is issued *after* recursing.

A (quite ugly) workaround would be to have ipt_REJECT queue
the packets to a temporary queue and have ipt_do_table call
dst_output() after dropping the lock.
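
For illustration only, here is a minimal sketch of that deferral idea; the
per-cpu queue, the helper names and the init hook are all hypothetical and
not part of any posted patch, and the 2.6.29-era dst_output(skb) signature
is assumed:

#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <net/dst.h>

/* Hypothetical per-cpu queue of packets generated while the table lock is held. */
static DEFINE_PER_CPU(struct sk_buff_head, ipt_deferred_skbs);

static void __init ipt_deferred_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		skb_queue_head_init(&per_cpu(ipt_deferred_skbs, cpu));
}

/* A REJECT-like target would call this instead of dst_output(): just park the
 * skb.  Softirqs are already disabled here, so the unlocked queue ops are safe. */
static void ipt_defer_output(struct sk_buff *skb)
{
	__skb_queue_tail(&__get_cpu_var(ipt_deferred_skbs), skb);
}

/* ipt_do_table() would call this after dropping its per-cpu lock. */
static void ipt_flush_deferred(void)
{
	struct sk_buff_head *q = &__get_cpu_var(ipt_deferred_skbs);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(q)) != NULL)
		dst_output(skb);
}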




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16  0:10                                                   ` Linus Torvalds
  2009-04-16  0:45                                                     ` [PATCH] netfilter: use per-cpu spinlock and RCU (v5) Stephen Hemminger
@ 2009-04-16 13:11                                                     ` Patrick McHardy
  2009-04-16 22:33                                                       ` David Miller
  1 sibling, 1 reply; 254+ messages in thread
From: Patrick McHardy @ 2009-04-16 13:11 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, David Miller, dada1, jeff.chua.linux, paulmck,
	paulus, mingo, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Linus Torvalds wrote:
> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
>> The counters are the bigger problem, otherwise we could just free table
>> info via rcu.  Do we really have to support: replace where the counter
>> values coming out to user space are always exactly accurate, or is it
>> allowed to replace a rule and maybe lose some counter ticks (worst case
>> NCPU-1).
> 
> Why not just read the counters from the old one at RCU free time (they are
> guaranteed to be stable at that point, since we're all done with those 
> entries), and apply them at that point to the current setup?

We need the counters immediately to copy them to userspace, so waiting
for an asynchronous RCU free is not going to work.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock and RCU (v5)
  2009-04-16  5:01                                                       ` Eric Dumazet
@ 2009-04-16 13:53                                                           ` Patrick McHardy
  0 siblings, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-16 13:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Linus Torvalds, David Miller, jeff.chua.linux,
	paulmck, paulus, mingo, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Eric Dumazet wrote:
> Stephen Hemminger a écrit :
>> This is an alternative version of ip/ip6/arp tables locking using
>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>> update but still removes the expensive rwlock in earlier versions.
>>
>> The idea for this came from an earlier version done by Eric Dumazet.
>> Locking is done per-cpu, the fast path locks on the current cpu
>> and updates counters.  The slow case involves acquiring the locks on
>> all cpu's. This version uses RCU for the table->base reference
>> but per-cpu-lock for counters.

> This version is a regression over 2.6.2[0-9], because of two points
> 
> 1) Much more atomic ops :
> 
> Because of additional
> 
>> +			spin_lock(&__get_cpu_var(ip_tables_lock));
>>  			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
>> +			spin_unlock(&__get_cpu_var(ip_tables_lock));
> 
> added on each counter updates.
> 
> On many setups, each packet coming in or out of the machine has
> to update between 2 to 20 rule counters. So to avoid *one* atomic ops
> of read_unlock(), this v4 version adds 2 to 20 atomic ops...

I agree, this seems to be a step backwards.

> I still not see the problem between the previous version (2.6.2[0-8]) that had a central
>  rwlock, that hurted performance on SMP because of cache line ping pong, and the solution
> having one rwlock per cpu.
> 
> We wanted to reduce the cache line ping pong first. This *is* the hurting point,
> by an order of magnitude.

Dave doesn't seem to like the rwlock approach. I don't see a way to
do anything asynchronously like call_rcu() to improve this, so to
bring up one of Stephen's suggestions again:

>> >   * use on_each_cpu() somehow to do grace periood?

We could use this to replace the counters, presuming it is
indeed faster than waiting for a RCU grace period.

> 2) Second problem : Potential OOM
> 
> About freeing old rules with call_rcu() and/or schedule_work(), this is going
> to OOM pretty fast on small appliances with basic firewall setups loading
> rules one by one, as done by original topic reporter.
> 
> We had reports from guys using linux with 4MB of available ram (French provider free.fr on
> their applicance box), and we had to use SLAB_DESTROY_BY_RCU thing on conntrack
>  to avoid OOM for their setups. We dont want to use call_rcu() and queue 100 or 200 vfree().

Agreed.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16  6:26                                                 ` Eric Dumazet
@ 2009-04-16 14:33                                                   ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-16 14:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, shemminger, kaber, jeff.chua.linux, paulus, mingo,
	torvalds, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Thu, Apr 16, 2009 at 08:26:58AM +0200, Eric Dumazet wrote:
> David Miller a écrit :
> > From: Eric Dumazet <dada1@cosmosbay.com>
> > Date: Wed, 15 Apr 2009 23:07:29 +0200
> > 
> >> Well, it seems original patch was not so bad after all
> >>
> >> http://lists.netfilter.org/pipermail/netfilter-devel/2006-January/023175.html
> >>
> >> So change per-cpu spinlocks to per-cpu rwlocks 
> >>
> >> and use read_lock() in ipt_do_table() to allow recursion...
> > 
> > Grumble, one more barrier to getting rid of rwlocks in the whole
> > tree. :-/
> > 
> > I really think we should entertain the idea where we don't RCU quiesce
> > when adding rules.  That was dismissed as not workable because the new
> > rule must be "visible" as soon as we return to userspace but let's get
> > real, effectively it will be.
> 
> We had to RCU quiesce to be sure old rules were not any more used before
> freeing them. Alternative is to defer freeing via call_rcu() but
> subject to OOM.
> 
> With 200 basic rules, size of rules table is about 40960 bytes per cpu.
> (88 pages taken on vmalloc virtual space on my 8 cpus machine)
> 0xfcaf8000-0xfcb03000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb04000-0xfcb0f000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb10000-0xfcb1b000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb1c000-0xfcb27000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb28000-0xfcb33000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb34000-0xfcb3f000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb40000-0xfcb4b000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 0xfcb4c000-0xfcb57000   45056 xt_alloc_table_info+0xa8/0xd0 pages=10 vmalloc
> 
> This kind of monolithic huge object is hard to handle with RCU semantic,
> more suitable for handling set of small objects (struct file for example),
> even if RCU can have a backoff of 10000 elements in its queue...

To be honest, the per-CPU-locking approach looks pretty good to me for
this particular case.  That said, the problem you mention above does
have some straightforward solutions.

One solution to consider would be to do the call_rcu(), but to keep a
counter of the number of calls, perhaps something like the following:

	call_rcu(...);
	if (++count > 50) {
		synchronize_rcu();
		count = 0;
	}

Of course, you might (or might not) need to atomically increment count,
and you of course would want to replace the "50" with some symbolic
constant, or perhaps even a variable whose value might be determined by
the size of the object and/or the amount of memory available.

Would this help?
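
As a concrete sketch of that pattern applied to freeing old xt_table_info
objects (the rcu_head field, the helper names and the batch constant are all
hypothetical, and the pending-free counter is assumed to be protected by the
xt[af].mutex the callers already hold):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/netfilter/x_tables.h>

/* Purely illustrative threshold; a real value might depend on object size
 * and available memory, as noted above. */
#define XT_RCU_FREE_BATCH	50

static unsigned int xt_pending_frees;

/* Assumes a struct rcu_head named "rcu" embedded in struct xt_table_info,
 * which the mainline structure does not have. */
static void xt_table_info_rcu_free(struct rcu_head *head)
{
	struct xt_table_info *info = container_of(head, struct xt_table_info, rcu);

	xt_free_table_info(info);
}

/* Defer the free, but force a grace period every so often so that a burst of
 * rule replacements cannot queue an unbounded amount of vmalloc'ed memory. */
static void xt_schedule_table_free(struct xt_table_info *oldinfo)
{
	call_rcu(&oldinfo->rcu, xt_table_info_rcu_free);
	if (++xt_pending_frees > XT_RCU_FREE_BATCH) {
		synchronize_rcu();
		xt_pending_frees = 0;
	}
}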

> > If there are any stale object reference issues, we can use RCU object
> > destruction to handle that kind of thing.
> > 
> > I almost cringed when the per-spinlock idea was proposed, but per-cpu
> > rwlocks just takes things too far for my tastes.
> 
> In my humble opinion, this is a reasonnable compromise, and Stephen patch
> version 4 is ok for me.

Again, the per-CPU-locking approach looks good to me, as well.

But if it turns out that we really do need an RCU implementation with
really short grace periods (tens of microseconds typical latency on
mid-range multiprocessors, those with SGI Altix systems would suffer
a bit more), then it can be done.  It would need to be yet another
implementation of RCU for the following reasons:

o	High update-side overhead (broadcast IPIs via
	smp_call_function()).  This is not a problem in this case,
	but would be a showstopper for (say) dcache.  I don't know
	of any way of fixing this.

o	Defeats power-conservation measures by waking up every CPU
	at every grace period.	(Might be fixable, for example, by using
	the same dyntick tricks used by preemptable and hierarchical RCU.
	But not recommended for first implementation.)

o	Poor update-side scalability.  (Definitely fixable, but the
	fix should be to the underlying smp_call_function() primitives.)

o	No ability to share grace periods among concurrent
	synchronize_rcu() primitives.  (Definitely fixable, but not
	recommended until needed.  Unlikely to be needed -- after all,
	if your grace period completes in 10 microseconds, just how many
	concurrent updates do you expect there to be???)

o	No call_rcu() style primitive.  (Definitely fixable, but not
	recommended until needed.  Besides, if the grace period only
	takes a few tens of microseconds, why exactly do you need an
	asynchronous interface?)

If this is needed, one good starting point would be Mathieu Desnoyers's
user-level RCU primitive.  The main change would be replacing the POSIX
signals with smp_call_function().

Yet again, I don't see a real problem with the per-CPU locking approach
in this case, but anything to prevent Dave Miller from having to suffer
the pain of hashed locks!  (Can't say that I have ever used hashed locks
myself, but I could imagine that they might impose cache-thrashing and
deadlock issues.)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock and RCU (v5)
  2009-04-16 13:53                                                           ` Patrick McHardy
@ 2009-04-16 14:47                                                             ` Paul E. McKenney
  -1 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-16 14:47 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, Stephen Hemminger, Linus Torvalds, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Thu, Apr 16, 2009 at 03:53:15PM +0200, Patrick McHardy wrote:
> Eric Dumazet wrote:
>> Stephen Hemminger a écrit :
>>> This is an alternative version of ip/ip6/arp tables locking using
>>> per-cpu locks.  This avoids the overhead of synchronize_net() during
>>> update but still removes the expensive rwlock in earlier versions.
>>>
>>> The idea for this came from an earlier version done by Eric Dumazet.
>>> Locking is done per-cpu, the fast path locks on the current cpu
>>> and updates counters.  The slow case involves acquiring the locks on
>>> all cpu's. This version uses RCU for the table->base reference
>>> but per-cpu-lock for counters.
>
>> This version is a regression over 2.6.2[0-9], because of two points
>> 1) Much more atomic ops :
>> Because of additional
>>> +			spin_lock(&__get_cpu_var(ip_tables_lock));
>>>  			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
>>> +			spin_unlock(&__get_cpu_var(ip_tables_lock));
>> added on each counter updates.
>> On many setups, each packet coming in or out of the machine has
>> to update between 2 to 20 rule counters. So to avoid *one* atomic ops
>> of read_unlock(), this v4 version adds 2 to 20 atomic ops...
>
> I agree, this seems to be a step backwards.
>
>> I still not see the problem between the previous version (2.6.2[0-8]) that 
>> had a central
>>  rwlock, that hurted performance on SMP because of cache line ping pong, 
>> and the solution
>> having one rwlock per cpu.
>> We wanted to reduce the cache line ping pong first. This *is* the hurting 
>> point,
>> by an order of magnitude.
>
> Dave doesn't seem to like the rwlock approach.

Well, we don't really need an rwlock, especially given that we really
don't want two "readers" incrementing the same counter concurrently.

A safer approach would be to maintain a flag in the task structure
tracking which (if any) of the per-CPU locks you hold.  Also maintain
a recursion-depth counter.  If the flag says you don't already hold
the lock, set it and acquire the lock.  Either way, increment the
recursion-depth counter:

	if (current->netfilter_lock_held != cur_cpu) {
		BUG_ON(current->netfilter_lock_held != CPU_NONE);
		spin_lock(per_cpu(..., cur_cpu));
		current->netfilter_lock_held = cur_cpu;
	}
	current->netfilter_lock_nesting++;

And reverse the process to unlock:

	if (--current->netfilter_lock_nesting == 0) {
		spin_unlock(per_cpu(..., cur_cpu));
		current->netfilter_lock_held = CPU_NONE;
	}
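
Filled out as helpers, that recipe might look like the sketch below;
netfilter_lock_held, netfilter_lock_nesting and CPU_NONE are hypothetical
additions to struct task_struct (they do not exist today), and callers are
assumed to already run with bottom halves disabled, as ipt_do_table() does:

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>

#define CPU_NONE	(-1)	/* hypothetical "no per-cpu lock held" marker */

/* Each instance would need spin_lock_init() at boot time. */
static DEFINE_PER_CPU(spinlock_t, nf_table_lock);

static void nf_table_lock_acquire(int cur_cpu)
{
	if (current->netfilter_lock_held != cur_cpu) {	/* hypothetical field */
		BUG_ON(current->netfilter_lock_held != CPU_NONE);
		spin_lock(&per_cpu(nf_table_lock, cur_cpu));
		current->netfilter_lock_held = cur_cpu;
	}
	current->netfilter_lock_nesting++;		/* hypothetical field */
}

static void nf_table_lock_release(int cur_cpu)
{
	if (--current->netfilter_lock_nesting == 0) {
		spin_unlock(&per_cpu(nf_table_lock, cur_cpu));
		current->netfilter_lock_held = CPU_NONE;
	}
}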

>                                                I don't see a way to
> do anything asynchronously like call_rcu() to improve this, so to
> bring up one of Stephen's suggestions again:
>
>>> >   * use on_each_cpu() somehow to do grace periood?
>
> We could use this to replace the counters, presuming it is
> indeed faster than waiting for a RCU grace period.

One way to accomplish this is to take Mathieu Desnoyers's user-level
RCU implementation and drop it into the kernel, replacing the POSIX
signal handling with on_each_cpu(), smp_call_function(), or whatever.

>> 2) Second problem : Potential OOM
>> About freeing old rules with call_rcu() and/or schedule_work(), this is 
>> going
>> to OOM pretty fast on small appliances with basic firewall setups loading
>> rules one by one, as done by original topic reporter.
>> We had reports from guys using linux with 4MB of available ram (French 
>> provider free.fr on
>> their applicance box), and we had to use SLAB_DESTROY_BY_RCU thing on 
>> conntrack
>>  to avoid OOM for their setups. We dont want to use call_rcu() and queue 
>> 100 or 200 vfree().
>
> Agreed.

This is not a real problem; it can be handled by doing a synchronize_rcu()
every so often, as noted in a prior email elsewhere in this thread:

	call_rcu(...);
	if (++count > 50) {
		synchronize_rcu();
		count = 0;
	}

This choice of constant would reduce the grace-period pain to 2% of the
full effect, which should be acceptable, at least if I remember the
original problem report of 0.2 seconds growing to 6.0 seconds -- this
would give you:

	(6.0-0.2)/50+0.2 = .316

I would argue that the extra ~100 milliseconds is an OK penalty for a deprecated
feature.  But of course the per-CPU lock approach should avoid even that
penalty, albeit at some per-packet penalty.  However, my guess is that
this per-packet penalty is not measurable at the system level.

And if the penalty of a single uncontended lock -is- measurable, I will
be very quick to offer my congratulations, at least once I get my jaw
off my keyboard.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 14:47                                                             ` Paul E. McKenney
@ 2009-04-16 16:10                                                               ` Eric Dumazet
  -1 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-16 16:10 UTC (permalink / raw)
  To: paulmck
  Cc: Patrick McHardy, Stephen Hemminger, Linus Torvalds, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Paul E. McKenney a écrit :

> Well, we don't really need an rwlock, especially given that we really
> don't want two "readers" incrementing the same counter concurrently.
> 
> A safer approach would be to maintain a flag in the task structure
> tracking which (if any) of the per-CPU locks you hold.  Also maintain
> a recursion-depth counter.  If the flag says you don't already hold
> the lock, set it and acquire the lock.  Either way, increment the
> recursion-depth counter:
> 
> 	if (current->netfilter_lock_held != cur_cpu) {
> 		BUG_ON(current->netfilter_lock_held != CPU_NONE);
> 		spin_lock(per_cpu(..., cur_cpu));
> 		current->netfilter_lock_held = cur_cpu;
> 	}
> 	current->netfilter_lock_nesting++;
> 
> And reverse the process to unlock:
> 
> 	if (--current->netfilter_lock_nesting == 0) {
> 		spin_unlock(per_cpu(..., cur_cpu));
> 		current->netfilter_lock_held = CPU_NONE;
> 	}
> 

Yes, you are right, we can avoid the rwlock and use a 'recursive' lock
or spin_trylock() instead.

We can keep one counter right next to the spinlock;
there is no need to add one or two fields to every "thread_info".

struct rec_lock {
	spinlock_t lock;
	int        count;
};
static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);


I also considered using regular spinlocks and spin_trylock() to "detect"
the recursive case without a global counter.

lock :
local_bh_disable();
int locked = spin_trylock(&__get_cpu_var(arp_tables_lock));

unlock:

if (likely(locked))
	spin_unlock(&__get_cpu_var(arp_tables_lock));
local_bh_enable();

But we would lose some runtime features, and I don't feel comfortable with
this trylock version. What do other people think?
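
Spelled out as stand-alone helpers (the function names are hypothetical and
for discussion only; the patch below open-codes the same pattern directly in
ipt_do_table() and friends), the recursive per-cpu lock amounts to:

#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/interrupt.h>

struct rec_lock {
	spinlock_t lock;
	int	   count;	/* recursion depth, only touched by the owning cpu */
};
static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);

static void __init rec_lock_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		spin_lock_init(&per_cpu(ip_tables_lock, cpu).lock);
}

/* Outermost call takes the per-cpu spinlock; nested calls only bump the count. */
static void rec_lock_enter(void)
{
	struct rec_lock *rl;

	local_bh_disable();
	rl = &__get_cpu_var(ip_tables_lock);
	if (rl->count++ == 0)
		spin_lock(&rl->lock);
}

static void rec_lock_exit(void)
{
	struct rec_lock *rl = &__get_cpu_var(ip_tables_lock);

	if (--rl->count == 0)
		spin_unlock(&rl->lock);
	local_bh_enable();
}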


Here is the resulting patch, based on Stephen's v4.

(Not sure we *need* a recursive spinlock for arp_tables, but it seems
better to have a uniform implementation.)


[PATCH] netfilter: use per-cpu recursive spinlock (v6)

Yet another alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive rwlock in earlier versions.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  The slow case involves acquiring the locks on
all cpu's.

The mutex that was added for 2.6.30 in xt_table is unnecessary, since
the existing xt[af].mutex is already held on these paths.

We have to use recursive spinlocks because netfilter can sometimes
nest several calls to ipt_do_table() for a given cpu.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/netfilter/x_tables.h |    5 -
 net/ipv4/netfilter/arp_tables.c    |  131 +++++++++------------------
 net/ipv4/netfilter/ip_tables.c     |  130 +++++++++-----------------
 net/ipv6/netfilter/ip6_tables.c    |  127 +++++++++-----------------
 net/netfilter/x_tables.c           |   26 -----
 5 files changed, 138 insertions(+), 281 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7b1a652..1ff1a76 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d..9f935f2 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -231,6 +231,12 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
 	return (struct arpt_entry *)(base + offset);
 }
 
+struct rec_lock {
+	spinlock_t lock;
+	int	   count; /* recursion count */
+};
+static DEFINE_PER_CPU(struct rec_lock, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -246,6 +252,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	void *table_base;
 	const struct xt_table_info *private;
 	struct xt_target_param tgpar;
+	struct rec_lock *rl;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
 		return NF_DROP;
@@ -255,7 +262,12 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	rl = &__get_cpu_var(arp_tables_lock);
+	if (likely(rl->count++ == 0))
+		spin_lock(&rl->lock);
+
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,7 +341,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
+	if (likely(--rl->count == 0))
+		spin_unlock(&rl->lock);
 	rcu_read_unlock_bh();
 
 	if (hotdrop)
@@ -716,74 +730,25 @@ static void get_counters(const struct xt_table_info *t,
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
 	}
 }
 
@@ -792,7 +757,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +766,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1110,19 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1173,7 +1131,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1224,25 +1182,25 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);
@@ -1923,7 +1881,10 @@ static struct pernet_operations arp_tables_net_ops = {
 
 static int __init arp_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(arp_tables_lock, cpu).lock);
 
 	ret = register_pernet_subsys(&arp_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b6..1368b6d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -297,6 +297,12 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
+struct rec_lock {
+	spinlock_t lock;
+	int	   count; /* recursion count */
+};
+static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -317,6 +323,7 @@ ipt_do_table(struct sk_buff *skb,
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
+	struct rec_lock *rl;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -341,7 +348,12 @@ ipt_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	rl = &__get_cpu_var(ip_tables_lock);
+	if (likely(rl->count++ == 0))
+		spin_lock(&rl->lock);
+
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,7 +448,8 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
+	if (likely(--rl->count == 0))
+		spin_unlock(&rl->lock);
 	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -902,75 +915,25 @@ get_counters(const struct xt_table_info *t,
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
 	}
 }
 
@@ -979,7 +942,6 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +950,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1320,18 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1386,7 +1341,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1437,25 +1392,25 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2227,10 @@ static struct pernet_operations ip_tables_net_ops = {
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip_tables_lock, cpu).lock);
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 800ae85..5b03479 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,6 +329,12 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
+struct rec_lock {
+	spinlock_t lock;
+	int	   count; /* recursion count */
+};
+static DEFINE_PER_CPU(struct rec_lock, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -347,6 +353,7 @@ ip6t_do_table(struct sk_buff *skb,
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
+	struct rec_lock *rl;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -367,7 +374,12 @@ ip6t_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	rl = &__get_cpu_var(ip_tables_lock);
+	if (likely(rl->count++ == 0))
+		spin_lock(&rl->lock);
+
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -467,6 +479,8 @@ ip6t_do_table(struct sk_buff *skb,
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
 	rcu_read_unlock_bh();
+	if (likely(--rl->count == 0))
+		spin_unlock(&rl->lock);
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -931,73 +945,25 @@ get_counters(const struct xt_table_info *t,
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
 	}
 }
 
@@ -1006,7 +972,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +980,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1351,19 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1424,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	spin_lock(&__get_cpu_var(ip6_tables_lock).lock);
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock).lock);
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2258,10 @@ static struct pernet_operations ip6_tables_net_ops = {
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip6_tables_lock, cpu).lock);
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 509a956..adc1b11 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -682,26 +668,21 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
 	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	newinfo->initial_entries = private->initial_entries;
 
-	synchronize_net();
-	return oldinfo;
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +715,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;


^ permalink raw reply related	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
@ 2009-04-16 16:10                                                               ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-16 16:10 UTC (permalink / raw)
  To: paulmck
  Cc: Patrick McHardy, Stephen Hemminger, Linus Torvalds, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Paul E. McKenney a écrit :

> Well, we don't really need an rwlock, especially given that we really
> don't want two "readers" incrementing the same counter concurrently.
> 
> A safer approach would be to maintain a flag in the task structure
> tracking which (if any) of the per-CPU locks you hold.  Also maintain
> a recursion-depth counter.  If the flag says you don't already hold
> the lock, set it and acquire the lock.  Either way, increment the
> recursion-depth counter:
> 
> 	if (current->netfilter_lock_held != cur_cpu) {
> 		BUG_ON(current->netfilter_lock_held != CPU_NONE);
> 		spin_lock(per_cpu(..., cur_cpu));
> 		current->netfilter_lock_held = cur_cpu;
> 	}
> 	current->netfilter_lock_nesting++;
> 
> And reverse the process to unlock:
> 
> 	if (--current->netfilter_lock_nesting == 0) {
> 		spin_unlock(per_cpu(..., cur_cpu));
> 		current->netfilter_lock_held = CPU_NONE;
> 	}
> 

Yes, you are right, we can avoid the rwlock and use a 'recursive' lock
or spin_trylock() instead.

We can use one counter close to the spinlock, so there is
no need to add one or two fields to every "thread_info".

struct rec_lock {
	spinlock_t lock;
	int        count;
};
static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);
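
Something like this, as a rough sketch (rec_lock_acquire()/rec_lock_release()
are made-up helper names, and it only mirrors what the patch below does inline;
bottom halves must be disabled around the whole section so the per-cpu count is
never touched by two contexts on the same cpu):

static inline void rec_lock_acquire(struct rec_lock *rl)
{
	/* first entry on this cpu takes the lock, nested entries only bump count */
	if (rl->count++ == 0)
		spin_lock(&rl->lock);
}

static inline void rec_lock_release(struct rec_lock *rl)
{
	/* last exit on this cpu drops the lock */
	if (--rl->count == 0)
		spin_unlock(&rl->lock);
}

The fast path would then call rec_lock_acquire(&__get_cpu_var(ip_tables_lock))
inside the existing rcu_read_lock_bh() section, and rec_lock_release() on the
way out.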


I also considered using regular spinlocks and spin_trylock() to "detect"
the recurse case without a global counter.

lock :
local_bh_disable();
int locked = spin_trylock(&__get_cpu_var(arp_tables_lock));

unlock:

if (likely(locked))
	spin_unlock(&__get_cpu_var(arp_tables_lock));
local_bh_enable();

But we would lose some runtime features, and I don't feel comfortable with
this trylock version. What do other people think?


Here is the resulting patch, based on Stephen v4

(Not sure we *need* a recursive spinlock for arp_tables, but it seems
better to have a uniform implementation)


[PATCH] netfilter: use per-cpu recursive spinlock (v6)

Yet another alternative version of ip/ip6/arp tables locking using
per-cpu locks.  This avoids the overhead of synchronize_net() during
update but still removes the expensive rwlock in earlier versions.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  The slow case involves acquiring the locks on
all cpu's.

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

We have to use recursive spinlocks because netfilter can sometimes
nest several calls to ipt_do_table() for a given cpu.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/netfilter/x_tables.h |    5 -
 net/ipv4/netfilter/arp_tables.c    |  131 +++++++++------------------
 net/ipv4/netfilter/ip_tables.c     |  130 +++++++++-----------------
 net/ipv6/netfilter/ip6_tables.c    |  127 +++++++++-----------------
 net/netfilter/x_tables.c           |   26 -----
 5 files changed, 138 insertions(+), 281 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7b1a652..1ff1a76 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
 
 /*
  * This helper is performance critical and must be inlined
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d..9f935f2 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -231,6 +231,12 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
 	return (struct arpt_entry *)(base + offset);
 }
 
+struct rec_lock {
+	spinlock_t lock;
+	int	   count; /* recursion count */
+};
+static DEFINE_PER_CPU(struct rec_lock, arp_tables_lock);
+
 unsigned int arpt_do_table(struct sk_buff *skb,
 			   unsigned int hook,
 			   const struct net_device *in,
@@ -246,6 +252,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	void *table_base;
 	const struct xt_table_info *private;
 	struct xt_target_param tgpar;
+	struct rec_lock *rl;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
 		return NF_DROP;
@@ -255,7 +262,12 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	rl = &__get_cpu_var(arp_tables_lock);
+	if (likely(rl->count++ == 0))
+		spin_lock(&rl->lock);
+
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,7 +341,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
+	if (likely(--rl->count == 0))
+		spin_unlock(&rl->lock);
 	rcu_read_unlock_bh();
 
 	if (hotdrop)
@@ -716,74 +730,25 @@ static void get_counters(const struct xt_table_info *t,
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
-	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
 	}
 }
 
@@ -792,7 +757,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +766,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1110,19 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1173,7 +1131,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1224,25 +1182,25 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 
 	xt_table_unlock(t);
 	module_put(t->me);
@@ -1923,7 +1881,10 @@ static struct pernet_operations arp_tables_net_ops = {
 
 static int __init arp_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(arp_tables_lock, cpu).lock);
 
 	ret = register_pernet_subsys(&arp_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b6..1368b6d 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -297,6 +297,12 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
+struct rec_lock {
+	spinlock_t lock;
+	int	   count; /* recursion count */
+};
+static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -317,6 +323,7 @@ ipt_do_table(struct sk_buff *skb,
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
+	struct rec_lock *rl;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -341,7 +348,12 @@ ipt_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	rl = &__get_cpu_var(ip_tables_lock);
+	if (likely(rl->count++ == 0))
+		spin_lock(&rl->lock);
+
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,7 +448,8 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
+	if (likely(--rl->count == 0))
+		spin_unlock(&rl->lock);
 	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
@@ -902,75 +915,25 @@ get_counters(const struct xt_table_info *t,
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
 	}
 }
 
@@ -979,7 +942,6 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +950,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1320,18 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1386,7 +1341,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 	struct xt_counters *paddc;
 	unsigned int num_counters;
 	const char *name;
-	int size;
+	int cpu, size;
 	void *ptmp;
 	struct xt_table *t;
 	const struct xt_table_info *private;
@@ -1437,25 +1392,25 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	cpu = raw_smp_processor_id();
+	spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
+	loc_cpu_entry = private->entries[cpu];
+	i = 0;
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2272,7 +2227,10 @@ static struct pernet_operations ip_tables_net_ops = {
 
 static int __init ip_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip_tables_lock, cpu).lock);
 
 	ret = register_pernet_subsys(&ip_tables_net_ops);
 	if (ret < 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 800ae85..5b03479 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,6 +329,12 @@ static void trace_packet(struct sk_buff *skb,
 }
 #endif
 
+struct rec_lock {
+	spinlock_t lock;
+	int	   count; /* recursion count */
+};
+static DEFINE_PER_CPU(struct rec_lock, ip6_tables_lock);
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ip6t_do_table(struct sk_buff *skb,
@@ -347,6 +353,7 @@ ip6t_do_table(struct sk_buff *skb,
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
+	struct rec_lock *rl;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -367,7 +374,12 @@ ip6t_do_table(struct sk_buff *skb,
 
 	rcu_read_lock_bh();
 	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
+	rl = &__get_cpu_var(ip6_tables_lock);
+	if (likely(rl->count++ == 0))
+		spin_lock(&rl->lock);
+
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -467,6 +479,8 @@ ip6t_do_table(struct sk_buff *skb,
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
 	rcu_read_unlock_bh();
+	if (likely(--rl->count == 0))
+		spin_unlock(&rl->lock);
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -931,73 +945,25 @@ get_counters(const struct xt_table_info *t,
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
+	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
-	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
+		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
 	}
 }
 
@@ -1006,7 +972,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +980,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1351,19 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1424,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
-	i = 0;
+	local_bh_disable();
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	spin_lock(&__get_cpu_var(ip6_tables_lock).lock);
+	loc_cpu_entry = private->entries[smp_processor_id()];
+	i = 0;
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	spin_unlock(&__get_cpu_var(ip6_tables_lock).lock);
+	local_bh_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
@@ -2298,7 +2258,10 @@ static struct pernet_operations ip6_tables_net_ops = {
 
 static int __init ip6_tables_init(void)
 {
-	int ret;
+	int cpu, ret;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu(ip6_tables_lock, cpu).lock);
 
 	ret = register_pernet_subsys(&ip6_tables_net_ops);
 	if (ret < 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 509a956..adc1b11 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -682,26 +668,21 @@ xt_replace_table(struct xt_table *table,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
 	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	newinfo->initial_entries = private->initial_entries;
 
-	synchronize_net();
-	return oldinfo;
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +715,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;


^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 16:10                                                               ` Eric Dumazet
@ 2009-04-16 16:20                                                                 ` Eric Dumazet
  -1 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-16 16:20 UTC (permalink / raw)
  To: paulmck
  Cc: Patrick McHardy, Stephen Hemminger, Linus Torvalds, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Eric Dumazet wrote:

> I also considered using regular spinlocks and spin_trylock() to "detect"
> the recurse case without a global counter.
> 
> lock :
> local_bh_disable();
> int locked = spin_trylock(&__get_cpu_var(arp_tables_lock));
> 
> unlock:
> 
> if (likely(locked))
> 	spin_unlock(&__get_cpu_var(arp_tables_lock));
> local_bh_enable();
> 
> But we would lose some runtime features, and I don't feel comfortable with
> this trylock version. What do other people think?
> 

Oh well, this won't work of course, forget about this trylock thing :)


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 16:10                                                               ` Eric Dumazet
  (?)
  (?)
@ 2009-04-16 16:37                                                               ` Linus Torvalds
  2009-04-16 16:59                                                                 ` Patrick McHardy
  -1 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-16 16:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Patrick McHardy, Stephen Hemminger, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Thu, 16 Apr 2009, Eric Dumazet wrote:
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */
> +};
> +static DEFINE_PER_CPU(struct rec_lock, arp_tables_lock);

What the _fuck_ are you doing?

Stop sending these shit-for-brains crazy patches out. That's not a lock, 
that's a messy way of saying "I don't know what the hell I'm doing, but 
I'll mess things up".

Don't do recursive locks (or your own locking primitives in general), but 
goddammit, if you have to, at least know what the hell you're doing. Your 
thing is a piece of shit.

A recursive lock needs an owner, or it's not a lock at all. It's some 
random data structure that contains a lock that may or may not be taken, 
and that may actually _work_ depending on the exact patterns of taking the 
lock, but that's not an excuse.

The fact that code "happens to work by mistake" (and I'm not saying that
yours does - but it might, just because of the per-cpu'ness of it, and I'm
not even going to look at crap like that closely enough to try to prove it one
way or the other) does not make that code acceptable.

Because even if it works today, it's just a bug waiting to happen. The 
thing you did is _not_ a generic recursive lock, and it does _not_ work in 
general. Don't call it a "rec_lock". Don't write code that accesses it 
without any comments as if it was simple. Just DON'T.

Guys, this whole discussion has just been filled with crazy crap. Can 
somebody even explain why we care so deeply about some counters for 
something that we just _deleted_ and that have random values anyway?

I can see the counters being interesting while a firewall is active, but I 
sure don't see what's so wonderfully interesting after-the-fact about a 
counter on something that NO LONGER EXISTS that it has to be somehow 
"exactly right".

And it's certainly not interesting enough to merit this kind of random 
fragile crazy code.

Please. Get a grip, people!

Show of hands, here: tell me a single use that really _requires_ those 
exact counters of a netfilter rule that got deleted and is no longer 
active?

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 16:37                                                               ` Linus Torvalds
@ 2009-04-16 16:59                                                                 ` Patrick McHardy
  0 siblings, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-16 16:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, paulmck, Stephen Hemminger, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Linus Torvalds wrote:
> Guys, this whole discussion has just been filled with crazy crap. Can 
> somebody even explain why we care so deeply about some counters for 
> something that we just _deleted_ and that have random values anyway?
> 
> I can see the counters being interesting while a firewall is active, but I 
> sure don't see what's so wonderfully interesting after-the-fact about a 
> counter on something that NO LONGER EXISTS that it has to be somehow 
> "exactly right".

They're copied to userspace after replacing the ruleset, associated with
the rules that are still active after the change and then added to the
current counters in a second operation. The end result is that the
counters are accurate for rules not changed.

> Show of hands, here: tell me a single use that really _requires_ those 
> exact counters of a netfilter rule that got deleted and is no longer 
> active?

People use netfilter for accounting quite a lot. Having dynamic updates
is also not uncommon, so this might actually matter.



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 16:10                                                               ` Eric Dumazet
@ 2009-04-16 17:58                                                                 ` Paul E. McKenney
  -1 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-16 17:58 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, Linus Torvalds, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Thu, Apr 16, 2009 at 06:10:30PM +0200, Eric Dumazet wrote:
> Paul E. McKenney wrote:
> 
> > Well, we don't really need an rwlock, especially given that we really
> > don't want two "readers" incrementing the same counter concurrently.
> > 
> > A safer approach would be to maintain a flag in the task structure
> > tracking which (if any) of the per-CPU locks you hold.  Also maintain
> > a recursion-depth counter.  If the flag says you don't already hold
> > the lock, set it and acquire the lock.  Either way, increment the
> > recursion-depth counter:
> > 
> > 	if (current->netfilter_lock_held != cur_cpu) {
> > 		BUG_ON(current->netfilter_lock_held != CPU_NONE);
> > 		spin_lock(per_cpu(..., cur_cpu));
> > 		current->netfilter_lock_held = cur_cpu;
> > 	}
> > 	current->netfilter_lock_nesting++;
> > 
> > And reverse the process to unlock:
> > 
> > 	if (--current->netfilter_lock_nesting == 0) {
> > 		spin_unlock(per_cpu(..., cur_cpu));
> > 		current->netfilter_lock_held = CPU_NONE;
> > 	}
> > 
> 
> Yes, you are right, we can avoid the rwlock and use a 'recursive' lock
> or spin_trylock() instead.
> 
> We can use one counter close to the spinlock, so there is
> no need to add one or two fields to every "thread_info".
> 
> struct rec_lock {
> 	spinlock_t lock;
> 	int        count;
> };
> static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);

Yep, much better approach!

> I also considered using regular spinlocks and spin_trylock() to "detect"
> the recurse case without a global counter.
> 
> lock :
> local_bh_disable();
> int locked = spin_trylock(&__get_cpu_var(arp_tables_lock));

Hmmm...

What happens if some other CPU is actually holding the lock?  For
example, the updater?

> unlock:
> 
> if (likely(locked))
> 	spin_unlock(&__get_cpu_var(arp_tables_lock));
> local_bh_enable();
> 
> But we would lose some runtime features, and I don't feel comfortable with
> this trylock version. What do other people think?

I do not believe that it actually works.

I much prefer your earlier idea of associating a counter with the lock.

But an owner field is also required, please see below.  Or please let me
know what I am missing.

							Thanx, Paul

> Here is the resulting patch, based on Stephen v4
> 
> (Not sure we *need* a recursive spinlock for arp_tables, but it seems
> better to have a uniform implementation)
> 
> 
> [PATCH] netfilter: use per-cpu recursive spinlock (v6)
> 
> Yet another alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> We have to use recursive spinlocks because netfilter can sometimes
> nest several calls to ipt_do_table() for a given cpu.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
>  include/linux/netfilter/x_tables.h |    5 -
>  net/ipv4/netfilter/arp_tables.c    |  131 +++++++++------------------
>  net/ipv4/netfilter/ip_tables.c     |  130 +++++++++-----------------
>  net/ipv6/netfilter/ip6_tables.c    |  127 +++++++++-----------------
>  net/netfilter/x_tables.c           |   26 -----
>  5 files changed, 138 insertions(+), 281 deletions(-)
> 
> diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
> index 7b1a652..1ff1a76 100644
> --- a/include/linux/netfilter/x_tables.h
> +++ b/include/linux/netfilter/x_tables.h
> @@ -354,9 +354,6 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
>  
> -	/* Lock for the curtain */
> -	struct mutex lock;
> -
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
>  
> @@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
>  
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> -				    struct xt_table_info *new);
>  
>  /*
>   * This helper is performance critical and must be inlined
> diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
> index 5ba533d..9f935f2 100644
> --- a/net/ipv4/netfilter/arp_tables.c
> +++ b/net/ipv4/netfilter/arp_tables.c
> @@ -231,6 +231,12 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
>  	return (struct arpt_entry *)(base + offset);
>  }
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */

We also need an owner field:

	int owner;

> +};
> +static DEFINE_PER_CPU(struct rec_lock, arp_tables_lock);
> +
>  unsigned int arpt_do_table(struct sk_buff *skb,
>  			   unsigned int hook,
>  			   const struct net_device *in,
> @@ -246,6 +252,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  	void *table_base;
>  	const struct xt_table_info *private;
>  	struct xt_target_param tgpar;
> +	struct rec_lock *rl;
>  
>  	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
>  		return NF_DROP;
> @@ -255,7 +262,12 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> +	rl = &__get_cpu_var(arp_tables_lock);
> +	if (likely(rl->count++ == 0))
> +		spin_lock(&rl->lock);

But if some other CPU holds the lock, this code would fail to wait for
that other CPU to release the lock, right?  It also might corrupt the
rl->count field due to two CPUs accessing it concurrently non-atomically.

I suggest the following, preferably in a function or macro or something:

	cur_cpu = smp_processor_id();
	if (likely(rl->owner != cur_cpu)) {
		spin_lock(&rl->lock);
		rl->owner = smp_processor_id();
		rl->count = 1;
	} else {
		rl->count++;
	}

And the inverse for unlock.
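
Spelled out as a pair, it might look like this (only a sketch: the helper names
are invented, rl->owner is assumed to start out as -1 meaning "no cpu", and the
owner/count fields are only ever touched with bottom halves disabled on the
owning cpu):

static inline void rec_lock_enter(struct rec_lock *rl)
{
	int cur_cpu = smp_processor_id();

	if (rl->owner != cur_cpu) {
		/* not ours: wait for whoever holds it, possibly a remote cpu */
		spin_lock(&rl->lock);
		rl->owner = cur_cpu;
		rl->count = 1;
	} else {
		/* nested call on the same cpu */
		rl->count++;
	}
}

static inline void rec_lock_exit(struct rec_lock *rl)
{
	if (--rl->count == 0) {
		rl->owner = -1;	/* give up ownership before dropping the lock */
		spin_unlock(&rl->lock);
	}
}

Since only the owning cpu ever writes rl->owner, the inequality test needs no
atomics; a remote cpu that grabs the lock (e.g. for the counter summation)
simply leaves owner at -1 and forces the local cpu onto the slow spin_lock()
path.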

Or am I missing something subtle?

> +
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  	back = get_entry(table_base, private->underflow[hook]);
> @@ -273,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  
>  			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
>  				(2 * skb->dev->addr_len);
> +
>  			ADD_COUNTER(e->counters, hdr_len, 1);
>  
>  			t = arpt_get_target(e);
> @@ -328,7 +341,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> +	if (likely(--rl->count == 0))
> +		spin_unlock(&rl->lock);
>  	rcu_read_unlock_bh();
>  
>  	if (hotdrop)
> @@ -716,74 +730,25 @@ static void get_counters(const struct xt_table_info *t,
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
>  	ARPT_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> +	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
>  		ARPT_ENTRY_ITERATE(t->entries[cpu],
>  				   t->size,
>  				   add_entry_to_counter,
>  				   counters,
>  				   &i);
> -	}
> -}
> -
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct arpt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	ARPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -static inline int
> -zero_entry_counter(struct arpt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
>  	}
>  }
>  
> @@ -792,7 +757,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	 * (other than comefrom, which userspace doesn't care
> @@ -802,30 +766,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> +		return ERR_PTR(-ENOMEM);
>  
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int copy_entries_to_user(unsigned int total_size,
> @@ -1165,6 +1110,19 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct arpt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  			   int compat)
>  {
> @@ -1173,7 +1131,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
>  	const char *name;
> -	int size;
> +	int cpu, size;
>  	void *ptmp;
>  	struct xt_table *t;
>  	const struct xt_table_info *private;
> @@ -1224,25 +1182,25 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[smp_processor_id()];
> +	cpu = raw_smp_processor_id();
> +	spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
> +	loc_cpu_entry = private->entries[cpu];
> +	i = 0;
>  	ARPT_ENTRY_ITERATE(loc_cpu_entry,
>  			   private->size,
>  			   add_counter_to_entry,
>  			   paddc,
>  			   &i);
> -	preempt_enable();
> +	spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  
>  	xt_table_unlock(t);
>  	module_put(t->me);
> @@ -1923,7 +1881,10 @@ static struct pernet_operations arp_tables_net_ops = {
>  
>  static int __init arp_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(arp_tables_lock, cpu).lock);
>  
>  	ret = register_pernet_subsys(&arp_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
> index 810c0b6..1368b6d 100644
> --- a/net/ipv4/netfilter/ip_tables.c
> +++ b/net/ipv4/netfilter/ip_tables.c
> @@ -297,6 +297,12 @@ static void trace_packet(struct sk_buff *skb,
>  }
>  #endif
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */
> +};
> +static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);
> +
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
>  ipt_do_table(struct sk_buff *skb,
> @@ -317,6 +323,7 @@ ipt_do_table(struct sk_buff *skb,
>  	struct xt_table_info *private;
>  	struct xt_match_param mtpar;
>  	struct xt_target_param tgpar;
> +	struct rec_lock *rl;
>  
>  	/* Initialization */
>  	ip = ip_hdr(skb);
> @@ -341,7 +348,12 @@ ipt_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> +	rl = &__get_cpu_var(ip_tables_lock);
> +	if (likely(rl->count++ == 0))
> +		spin_lock(&rl->lock);
> +
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -436,7 +448,8 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> +	if (likely(--rl->count == 0))
> +		spin_unlock(&rl->lock);
>  	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
> @@ -902,75 +915,25 @@ get_counters(const struct xt_table_info *t,
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
>  			  t->size,
>  			  set_entry_to_counter,
>  			  counters,
>  			  &i);
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -	}
> -
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -
> -static inline int
> -zero_entry_counter(struct ipt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
>  	}
>  }
>  
> @@ -979,7 +942,6 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -988,30 +950,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> +		return ERR_PTR(-ENOMEM);
>  
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1377,6 +1320,18 @@ do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
>  
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1386,7 +1341,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
>  	const char *name;
> -	int size;
> +	int cpu, size;
>  	void *ptmp;
>  	struct xt_table *t;
>  	const struct xt_table_info *private;
> @@ -1437,25 +1392,25 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	cpu = raw_smp_processor_id();
> +	spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
> +	loc_cpu_entry = private->entries[cpu];
> +	i = 0;
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> @@ -2272,7 +2227,10 @@ static struct pernet_operations ip_tables_net_ops = {
>  
>  static int __init ip_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(ip_tables_lock, cpu).lock);
>  
>  	ret = register_pernet_subsys(&ip_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
> index 800ae85..5b03479 100644
> --- a/net/ipv6/netfilter/ip6_tables.c
> +++ b/net/ipv6/netfilter/ip6_tables.c
> @@ -329,6 +329,12 @@ static void trace_packet(struct sk_buff *skb,
>  }
>  #endif
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */
> +};
> +static DEFINE_PER_CPU(struct rec_lock, ip6_tables_lock);
> +
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
>  ip6t_do_table(struct sk_buff *skb,
> @@ -347,6 +353,7 @@ ip6t_do_table(struct sk_buff *skb,
>  	struct xt_table_info *private;
>  	struct xt_match_param mtpar;
>  	struct xt_target_param tgpar;
> +	struct rec_lock *rl;
>  
>  	/* Initialization */
>  	indev = in ? in->name : nulldevname;
> @@ -367,7 +374,12 @@ ip6t_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> +	rl = &__get_cpu_var(ip6_tables_lock);
> +	if (likely(rl->count++ == 0))
> +		spin_lock(&rl->lock);
> +
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -467,6 +479,8 @@ ip6t_do_table(struct sk_buff *skb,
>  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
>  #endif
>  	rcu_read_unlock_bh();
> +	if (likely(--rl->count == 0))
> +		spin_unlock(&rl->lock);
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -931,73 +945,25 @@ get_counters(const struct xt_table_info *t,
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
>  	IP6T_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> +	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
>  		IP6T_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -	}
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ip6t_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IP6T_ENTRY_ITERATE(t->entries[cpu],
> -			   t->size,
> -			   add_counter_to_entry,
> -			   counters,
> -			   &i);
> -	local_bh_enable();
> -}
> -
> -static inline int
> -zero_entry_counter(struct ip6t_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				   zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
>  	}
>  }
>  
> @@ -1006,7 +972,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -1015,30 +980,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> +		return ERR_PTR(-ENOMEM);
>  
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1405,6 +1351,19 @@ do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ip6t_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		int compat)
> @@ -1465,25 +1424,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
> +	local_bh_disable();
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	spin_lock(&__get_cpu_var(ip6_tables_lock).lock);
> +	loc_cpu_entry = private->entries[smp_processor_id()];
> +	i = 0;
>  	IP6T_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	spin_unlock(&__get_cpu_var(ip6_tables_lock).lock);
> +	local_bh_enable();
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> @@ -2298,7 +2258,10 @@ static struct pernet_operations ip6_tables_net_ops = {
>  
>  static int __init ip6_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(ip6_tables_lock, cpu).lock);
>  
>  	ret = register_pernet_subsys(&ip6_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
> index 509a956..adc1b11 100644
> --- a/net/netfilter/x_tables.c
> +++ b/net/netfilter/x_tables.c
> @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
>  }
>  EXPORT_SYMBOL(xt_free_table_info);
>  
> -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> -			     struct xt_table_info *newinfo)
> -{
> -	unsigned int cpu;
> -
> -	for_each_possible_cpu(cpu) {
> -		void *p = oldinfo->entries[cpu];
> -		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> -		newinfo->entries[cpu] = p;
> -	}
> -
> -}
> -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> -
>  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
>  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
>  				    const char *name)
> @@ -682,26 +668,21 @@ xt_replace_table(struct xt_table *table,
>  	      struct xt_table_info *newinfo,
>  	      int *error)
>  {
> -	struct xt_table_info *oldinfo, *private;
> +	struct xt_table_info *private;
>  
>  	/* Do the substitution. */
> -	mutex_lock(&table->lock);
>  	private = table->private;
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> -		mutex_unlock(&table->lock);
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
> -	oldinfo = private;
>  	rcu_assign_pointer(table->private, newinfo);
> -	newinfo->initial_entries = oldinfo->initial_entries;
> -	mutex_unlock(&table->lock);
> +	newinfo->initial_entries = private->initial_entries;
>  
> -	synchronize_net();
> -	return oldinfo;
> +	return private;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
>  
> @@ -734,7 +715,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
>  
>  	/* Simplifies replace_table code. */
>  	table->private = bootstrap;
> -	mutex_init(&table->lock);
>  
>  	if (!xt_replace_table(table, 0, newinfo, &ret))
>  		goto unlock;
> 

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
@ 2009-04-16 17:58                                                                 ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-16 17:58 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, Linus Torvalds, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Thu, Apr 16, 2009 at 06:10:30PM +0200, Eric Dumazet wrote:
> Paul E. McKenney wrote:
> 
> > Well, we don't really need an rwlock, especially given that we really
> > don't want two "readers" incrementing the same counter concurrently.
> > 
> > A safer approach would be to maintain a flag in the task structure
> > tracking which (if any) of the per-CPU locks you hold.  Also maintain
> > a recursion-depth counter.  If the flag says you don't already hold
> > the lock, set it and acquire the lock.  Either way, increment the
> > recursion-depth counter:
> > 
> > 	if (current->netfilter_lock_held != cur_cpu) {
> > 		BUG_ON(current->netfilter_lock_held != CPU_NONE);
> > 		spin_lock(per_cpu(..., cur_cpu));
> > 		current->netfilter_lock_held = cur_cpu;
> > 	}
> > 	current->netfilter_lock_nesting++;
> > 
> > And reverse the process to unlock:
> > 
> > 	if (--current->netfilter_lock_nesting == 0) {
> > 		spin_unlock(per_cpu(..., cur_cpu));
> > 		current->netfilter_lock_held = CPU_NONE;
> > 	}
> > 
> 
> Yes, you are right, we can avoid rwlock, but use a 'recursive' lock
> or spin_trylock()
> 
> We can use one counter close to the spinlock, 
> no need to add one or two fields to every "thread_info"
> 
> struct rec_lock {
> 	spinlock_t lock;
> 	int        count;
> };
> static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);

Yep, much better approach!

> I also considered using regular spinlocks and spin_trylock() to "detect"
> the recurse case without a global counter.
> 
> lock :
> local_bh_disable();
> int locked = spin_trylock(&__get_cpu_var(arp_tables_lock));

Hmmm...

What happens if some other CPU is actually holding the lock?  For
example, the updater?

> unlock:
> 
> if (likely(locked))
> 	spin_unlock(&__get_cpu_var(arp_tables_lock));
> local_bh_enable();
> 
> But we would lose some runtime features, and I don't feel comfortable about
> this trylock version. What do other people think?

I do not believe that it actually works.

I much prefer your earlier idea of associating a counter with the lock.

But an owner field is also required; please see below.  Or please let me
know what I am missing.

							Thanx, Paul

> Here is the resulting patch, based on Stephen v4
> 
> (Not sure we *need* recursive spinlock for the arp_tables, but it seems
> better to have a uniform implementation)
> 
> 
> [PATCH] netfilter: use per-cpu recursive spinlock (v6)
> 
> Yet another alternative version of ip/ip6/arp tables locking using
> per-cpu locks.  This avoids the overhead of synchronize_net() during
> update but still removes the expensive rwlock in earlier versions.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  The slow case involves acquiring the locks on
> all cpu's.
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> We have to use recursive spinlocks because netfilter can sometimes
> nest several calls to ipt_do_table() for a given cpu.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
>  include/linux/netfilter/x_tables.h |    5 -
>  net/ipv4/netfilter/arp_tables.c    |  131 +++++++++------------------
>  net/ipv4/netfilter/ip_tables.c     |  130 +++++++++-----------------
>  net/ipv6/netfilter/ip6_tables.c    |  127 +++++++++-----------------
>  net/netfilter/x_tables.c           |   26 -----
>  5 files changed, 138 insertions(+), 281 deletions(-)
> 
> diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
> index 7b1a652..1ff1a76 100644
> --- a/include/linux/netfilter/x_tables.h
> +++ b/include/linux/netfilter/x_tables.h
> @@ -354,9 +354,6 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
>  
> -	/* Lock for the curtain */
> -	struct mutex lock;
> -
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
>  
> @@ -434,8 +431,6 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
>  
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> -				    struct xt_table_info *new);
>  
>  /*
>   * This helper is performance critical and must be inlined
> diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
> index 5ba533d..9f935f2 100644
> --- a/net/ipv4/netfilter/arp_tables.c
> +++ b/net/ipv4/netfilter/arp_tables.c
> @@ -231,6 +231,12 @@ static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
>  	return (struct arpt_entry *)(base + offset);
>  }
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */

We also need an owner field:

	int owner;

> +};
> +static DEFINE_PER_CPU(struct rec_lock, arp_tables_lock);
> +
>  unsigned int arpt_do_table(struct sk_buff *skb,
>  			   unsigned int hook,
>  			   const struct net_device *in,
> @@ -246,6 +252,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  	void *table_base;
>  	const struct xt_table_info *private;
>  	struct xt_target_param tgpar;
> +	struct rec_lock *rl;
>  
>  	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
>  		return NF_DROP;
> @@ -255,7 +262,12 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> +	rl = &__get_cpu_var(arp_tables_lock);
> +	if (likely(rl->count++ == 0))
> +		spin_lock(&rl->lock);

But if some other CPU holds the lock, this code would fail to wait for
that other CPU to release the lock, right?  It also might corrupt the
rl->count field due to two CPUs accessing it concurrently non-atomically.

I suggest the following, preferably in a function or macro or something:

	cur_cpu = smp_processor_id();
	if (likely(rl->owner != cur_cpu)) {
		spin_lock(&rl->lock);
		rl->owner = smp_processor_id();
		rl->count = 1;
	} else {
		rl->count++;
	}

And the inverse for unlock.

Or am I missing something subtle?
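
For concreteness, here is how I would wrap the above plus its inverse into
a pair of helpers.  This is only a sketch: the helper names are mine, the
struct follows Eric's rec_lock with the extra owner field, and NR_CPUS is
just standing in for "no owner":

	/* Sketch: recursion-safe per-cpu lock with an owner field.
	 * The caller must keep BHs disabled so smp_processor_id() is stable.
	 */
	struct rec_lock {
		spinlock_t lock;
		int	   count;	/* recursion depth, touched by owner only */
		int	   owner;	/* CPU currently holding it, or NR_CPUS */
	};

	static inline void rec_lock_acquire(struct rec_lock *rl)
	{
		int cur_cpu = smp_processor_id();

		if (likely(rl->owner != cur_cpu)) {
			spin_lock(&rl->lock);
			rl->owner = cur_cpu;
			rl->count = 1;
		} else {
			rl->count++;		/* already ours: just nest */
		}
	}

	static inline void rec_lock_release(struct rec_lock *rl)
	{
		if (--rl->count == 0) {
			rl->owner = NR_CPUS;	/* clear before dropping the lock */
			spin_unlock(&rl->lock);
		}
	}

The counter-summing slow path on other CPUs would still take rl->lock with
a plain spin_lock_bh()/spin_unlock_bh() pair and never touch count or owner.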

> +
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  	back = get_entry(table_base, private->underflow[hook]);
> @@ -273,6 +285,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  
>  			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
>  				(2 * skb->dev->addr_len);
> +
>  			ADD_COUNTER(e->counters, hdr_len, 1);
>  
>  			t = arpt_get_target(e);
> @@ -328,7 +341,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> +	if (likely(--rl->count == 0))
> +		spin_unlock(&rl->lock);
>  	rcu_read_unlock_bh();
>  
>  	if (hotdrop)
> @@ -716,74 +730,25 @@ static void get_counters(const struct xt_table_info *t,
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
>  	ARPT_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> +	spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu).lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
>  		ARPT_ENTRY_ITERATE(t->entries[cpu],
>  				   t->size,
>  				   add_entry_to_counter,
>  				   counters,
>  				   &i);
> -	}
> -}
> -
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct arpt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	ARPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -static inline int
> -zero_entry_counter(struct arpt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
>  	}
>  }
>  
> @@ -792,7 +757,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	 * (other than comefrom, which userspace doesn't care
> @@ -802,30 +766,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> +		return ERR_PTR(-ENOMEM);
>  
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int copy_entries_to_user(unsigned int total_size,
> @@ -1165,6 +1110,19 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct arpt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  			   int compat)
>  {
> @@ -1173,7 +1131,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
>  	const char *name;
> -	int size;
> +	int cpu, size;
>  	void *ptmp;
>  	struct xt_table *t;
>  	const struct xt_table_info *private;
> @@ -1224,25 +1182,25 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[smp_processor_id()];
> +	cpu = raw_smp_processor_id();
> +	spin_lock_bh(&per_cpu(arp_tables_lock, cpu).lock);
> +	loc_cpu_entry = private->entries[cpu];
> +	i = 0;
>  	ARPT_ENTRY_ITERATE(loc_cpu_entry,
>  			   private->size,
>  			   add_counter_to_entry,
>  			   paddc,
>  			   &i);
> -	preempt_enable();
> +	spin_unlock_bh(&per_cpu(arp_tables_lock, cpu).lock);
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  
>  	xt_table_unlock(t);
>  	module_put(t->me);
> @@ -1923,7 +1881,10 @@ static struct pernet_operations arp_tables_net_ops = {
>  
>  static int __init arp_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(arp_tables_lock, cpu).lock);
>  
>  	ret = register_pernet_subsys(&arp_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
> index 810c0b6..1368b6d 100644
> --- a/net/ipv4/netfilter/ip_tables.c
> +++ b/net/ipv4/netfilter/ip_tables.c
> @@ -297,6 +297,12 @@ static void trace_packet(struct sk_buff *skb,
>  }
>  #endif
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */
> +};
> +static DEFINE_PER_CPU(struct rec_lock, ip_tables_lock);
> +
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
>  ipt_do_table(struct sk_buff *skb,
> @@ -317,6 +323,7 @@ ipt_do_table(struct sk_buff *skb,
>  	struct xt_table_info *private;
>  	struct xt_match_param mtpar;
>  	struct xt_target_param tgpar;
> +	struct rec_lock *rl;
>  
>  	/* Initialization */
>  	ip = ip_hdr(skb);
> @@ -341,7 +348,12 @@ ipt_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> +	rl = &__get_cpu_var(ip_tables_lock);
> +	if (likely(rl->count++ == 0))
> +		spin_lock(&rl->lock);
> +
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -436,7 +448,8 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> +	if (likely(--rl->count == 0))
> +		spin_unlock(&rl->lock);
>  	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
> @@ -902,75 +915,25 @@ get_counters(const struct xt_table_info *t,
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
>  			  t->size,
>  			  set_entry_to_counter,
>  			  counters,
>  			  &i);
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu).lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -	}
> -
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -
> -static inline int
> -zero_entry_counter(struct ipt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
>  	}
>  }
>  
> @@ -979,7 +942,6 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -988,30 +950,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> +		return ERR_PTR(-ENOMEM);
>  
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1377,6 +1320,18 @@ do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
>  
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1386,7 +1341,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
>  	const char *name;
> -	int size;
> +	int cpu, size;
>  	void *ptmp;
>  	struct xt_table *t;
>  	const struct xt_table_info *private;
> @@ -1437,25 +1392,25 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	cpu = raw_smp_processor_id();
> +	spin_lock_bh(&per_cpu(ip_tables_lock, cpu).lock);
> +	loc_cpu_entry = private->entries[cpu];
> +	i = 0;
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	spin_unlock_bh(&per_cpu(ip_tables_lock, cpu).lock);
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> @@ -2272,7 +2227,10 @@ static struct pernet_operations ip_tables_net_ops = {
>  
>  static int __init ip_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(ip_tables_lock, cpu).lock);
>  
>  	ret = register_pernet_subsys(&ip_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
> index 800ae85..5b03479 100644
> --- a/net/ipv6/netfilter/ip6_tables.c
> +++ b/net/ipv6/netfilter/ip6_tables.c
> @@ -329,6 +329,12 @@ static void trace_packet(struct sk_buff *skb,
>  }
>  #endif
>  
> +struct rec_lock {
> +	spinlock_t lock;
> +	int	   count; /* recursion count */
> +};
> +static DEFINE_PER_CPU(struct rec_lock, ip6_tables_lock);
> +
>  /* Returns one of the generic firewall policies, like NF_ACCEPT. */
>  unsigned int
>  ip6t_do_table(struct sk_buff *skb,
> @@ -347,6 +353,7 @@ ip6t_do_table(struct sk_buff *skb,
>  	struct xt_table_info *private;
>  	struct xt_match_param mtpar;
>  	struct xt_target_param tgpar;
> +	struct rec_lock *rl;
>  
>  	/* Initialization */
>  	indev = in ? in->name : nulldevname;
> @@ -367,7 +374,12 @@ ip6t_do_table(struct sk_buff *skb,
>  
>  	rcu_read_lock_bh();
>  	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> +	rl = &__get_cpu_var(ip6_tables_lock);
> +	if (likely(rl->count++ == 0))
> +		spin_lock(&rl->lock);
> +
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -467,6 +479,8 @@ ip6t_do_table(struct sk_buff *skb,
>  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
>  #endif
>  	rcu_read_unlock_bh();
> +	if (likely(--rl->count == 0))
> +		spin_unlock(&rl->lock);
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -931,73 +945,25 @@ get_counters(const struct xt_table_info *t,
>  	curcpu = raw_smp_processor_id();
>  
>  	i = 0;
> +	spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
>  	IP6T_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> +	spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu).lock);
>  
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		spin_lock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
>  		IP6T_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> -	}
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ip6t_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IP6T_ENTRY_ITERATE(t->entries[cpu],
> -			   t->size,
> -			   add_counter_to_entry,
> -			   counters,
> -			   &i);
> -	local_bh_enable();
> -}
> -
> -static inline int
> -zero_entry_counter(struct ip6t_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				   zero_entry_counter, NULL);
> +		spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu).lock);
>  	}
>  }
>  
> @@ -1006,7 +972,6 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -1015,30 +980,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> +		return ERR_PTR(-ENOMEM);
>  
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1405,6 +1351,19 @@ do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ip6t_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		int compat)
> @@ -1465,25 +1424,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
> -	i = 0;
> +	local_bh_disable();
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	spin_lock(&__get_cpu_var(ip6_tables_lock).lock);
> +	loc_cpu_entry = private->entries[smp_processor_id()];
> +	i = 0;
>  	IP6T_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	spin_unlock(&__get_cpu_var(ip6_tables_lock).lock);
> +	local_bh_enable();
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> @@ -2298,7 +2258,10 @@ static struct pernet_operations ip6_tables_net_ops = {
>  
>  static int __init ip6_tables_init(void)
>  {
> -	int ret;
> +	int cpu, ret;
> +
> +	for_each_possible_cpu(cpu)
> +		spin_lock_init(&per_cpu(ip6_tables_lock, cpu).lock);
>  
>  	ret = register_pernet_subsys(&ip6_tables_net_ops);
>  	if (ret < 0)
> diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
> index 509a956..adc1b11 100644
> --- a/net/netfilter/x_tables.c
> +++ b/net/netfilter/x_tables.c
> @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
>  }
>  EXPORT_SYMBOL(xt_free_table_info);
>  
> -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> -			     struct xt_table_info *newinfo)
> -{
> -	unsigned int cpu;
> -
> -	for_each_possible_cpu(cpu) {
> -		void *p = oldinfo->entries[cpu];
> -		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> -		newinfo->entries[cpu] = p;
> -	}
> -
> -}
> -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> -
>  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
>  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
>  				    const char *name)
> @@ -682,26 +668,21 @@ xt_replace_table(struct xt_table *table,
>  	      struct xt_table_info *newinfo,
>  	      int *error)
>  {
> -	struct xt_table_info *oldinfo, *private;
> +	struct xt_table_info *private;
>  
>  	/* Do the substitution. */
> -	mutex_lock(&table->lock);
>  	private = table->private;
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> -		mutex_unlock(&table->lock);
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
> -	oldinfo = private;
>  	rcu_assign_pointer(table->private, newinfo);
> -	newinfo->initial_entries = oldinfo->initial_entries;
> -	mutex_unlock(&table->lock);
> +	newinfo->initial_entries = private->initial_entries;
>  
> -	synchronize_net();
> -	return oldinfo;
> +	return private;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
>  
> @@ -734,7 +715,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
>  
>  	/* Simplifies replace_table code. */
>  	table->private = bootstrap;
> -	mutex_init(&table->lock);
>  
>  	if (!xt_replace_table(table, 0, newinfo, &ret))
>  		goto unlock;
> 

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 17:58                                                                 ` Paul E. McKenney
  (?)
@ 2009-04-16 18:41                                                                 ` Eric Dumazet
  2009-04-16 20:49                                                                   ` [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7) Stephen Hemminger
  2009-04-17  0:13                                                                     ` Paul E. McKenney
  -1 siblings, 2 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-16 18:41 UTC (permalink / raw)
  To: paulmck
  Cc: Patrick McHardy, Stephen Hemminger, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



Paul E. McKenney wrote:
> 
> But if some other CPU holds the lock, this code would fail to wait for
> that other CPU to release the lock, right?  It also might corrupt the
> rl->count field due to two CPUs accessing it concurrently non-atomically.

If another cpu holds the lock, this cpu will spin on its own lock.

Remember, other cpus don't touch rl->count. This is a private field, only touched
by the cpu that owns the per_cpu data. There is no possible 'corruption'.


So the owner of the per_cpu data does:

/*
 * disable preemption, get rl = &__get_cpu_var(arp_tables_lock);
 * then :
 */
lock_time:
if (rl->count++ == 0)
	spin_lock(&rl->lock);

unlock_time:
if (likely(--rl->count == 0))
	spin_unlock(&rl->lock);


while other cpus only do:

spin_lock(&rl->lock);
/* work on data */
spin_unlock(&rl->lock);

So they cannot corrupt the 'count' field.
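
Written out as helpers (names are mine, not from the patch), the owner-side
pair is simply this, with arp_tables_lock being the per-cpu rec_lock from
the v6 patch:

	/* Sketch of the count-only scheme: only the CPU that owns this
	 * per-cpu rec_lock ever touches ->count, so no atomics are needed.
	 * BHs stay disabled between the lock and the unlock.
	 */
	static inline void rec_lock_local(void)
	{
		struct rec_lock *rl;

		local_bh_disable();
		rl = &__get_cpu_var(arp_tables_lock);
		if (rl->count++ == 0)		/* outermost entry */
			spin_lock(&rl->lock);
	}

	static inline void rec_unlock_local(void)
	{
		struct rec_lock *rl = &__get_cpu_var(arp_tables_lock);

		if (--rl->count == 0)		/* outermost exit */
			spin_unlock(&rl->lock);
		local_bh_enable();
	}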

> 
> I suggest the following, preferably in a function or macro or something:
> 
> 	cur_cpu = smp_processor_id();
> 	if (likely(rl->owner != cur_cpu) {
> 	if (likely(rl->owner != cur_cpu)) {
> 		rl->owner = smp_processor_id();
> 		rl->count = 1;
> 	} else {
> 		rl->count++;
> 	}
> 
> And the inverse for unlock.
> 
> Or am I missing something subtle?

Apparently Linus missed it too, and reacted badly to my mail.
I don't know why we discuss this stuff on lkml either...

I'll stop working on this subject and consider drinking some hard stuff and
watching TV :)

See you


^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7)
  2009-04-16 18:41                                                                 ` Eric Dumazet
@ 2009-04-16 20:49                                                                   ` Stephen Hemminger
  2009-04-16 21:02                                                                     ` Linus Torvalds
  2009-04-17  0:13                                                                     ` Paul E. McKenney
  1 sibling, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-16 20:49 UTC (permalink / raw)
  To: Eric Dumazet, paulmck, Patrick McHardy, David Miller, Linus Torvalds
  Cc: jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

This version of x_tables (ip/ip6/arp) locking uses a per-cpu
rwlock that can be nested. It is sort of like earlier brwlock 
(fast reader, slow writer). The locking is isolated so future improvements
can concentrate on measuring/optimizing xt_table_info_lock. I tried
other versions based on recursive spin locks and sequence counters, and
for me the risk of inventing our own locking primitives is not worth it at this time.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu; the fast path locks on the current cpu
and updates counters.  This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2).

The mutex that was added for 2.6.30 in xt_table is unnecessary since
the xt[af].mutex is already held.

Lockdep reports bogus warnings on this, so using raw_write_lock
might be necessary.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |   34 +++++++++--
 net/ipv4/netfilter/arp_tables.c    |  110 ++++++++-----------------------------
 net/ipv4/netfilter/ip_tables.c     |  110 +++++++------------------------------
 net/ipv6/netfilter/ip6_tables.c    |  108 ++++++++----------------------------
 net/netfilter/x_tables.c           |   63 ++++++++++++++-------
 5 files changed, 144 insertions(+), 281 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-16 13:40:57.256734671 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-16 13:40:58.858044088 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,35 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+
+DECLARE_PER_CPU(rwlock_t, xt_info_locks);
+/**
+ * xt_table_info_lock - recursive read lock for xt table info
+ *
+ * Used for current CPU to read table and update counters.
+ * Allows recursive locking, so bottom half allowed
+ * but preempt disabled
+ */
+static inline void xt_table_info_lock(void)
+{
+	preempt_disable();
+	read_lock(&__get_cpu_var(xt_info_locks));
+	preempt_enable_no_resched();
+}
+
+/**
+ * xt_table_info_unlock - release recursive table info lock
+ *
+ * Used after read table and update counters.
+ */
+static inline void xt_table_info_unlock(void)
+{
+	read_unlock(&__get_cpu_var(xt_info_locks));
+}
+
+extern void xt_table_info_lock_all(void) 	__acquires(xt_table_info_all);
+extern void xt_table_info_unlock_all(void) 	__releases(xt_table_info_all);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-16 13:40:57.241798716 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-16 13:40:58.862043774 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_table_info_lock();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_table_info_unlock();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -918,60 +916,6 @@ get_counters(const struct xt_table_info 
 				  counters,
 				  &i);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_table_info_lock_all();
+	get_counters(private, counters);
+	xt_table_info_unlock_all();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_table_info_lock_all();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_table_info_unlock_all();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-16 13:40:57.174740286 -0700
+++ b/net/netfilter/x_tables.c	2009-04-16 13:40:58.880757376 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,6 +662,40 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(rwlock_t, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+/**
+ * xt_table_info_lock_all - lock xt table info for update
+ *
+ * Locks out all readers, and blocks bottom half
+ */
+void xt_table_info_lock_all(void)
+{
+	int i;
+
+	local_bh_disable();
+	for_each_possible_cpu(i)
+		write_lock(&per_cpu(xt_info_locks, i));
+
+}
+EXPORT_SYMBOL_GPL(xt_table_info_lock_all);
+
+/**
+ * xt_table_info_unlock_all - unlock xt table info after update
+ *
+ * Unlocks all readers, and unblocks bottom half
+ */
+void xt_table_info_unlock_all(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		write_unlock(&per_cpu(xt_info_locks, i));
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(xt_table_info_unlock_all);
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
@@ -685,22 +705,21 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	xt_table_info_lock_all();
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		xt_table_info_unlock_all();
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	table->private =  newinfo;
+	newinfo->initial_entries = private->initial_entries;
+	xt_table_info_unlock_all();
 
-	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -734,7 +753,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1149,6 +1167,9 @@ static int __init xt_init(void)
 {
 	int i, rv;
 
+	for_each_possible_cpu(i)
+		rwlock_init(&per_cpu(xt_info_locks, i));
+
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
 		return -ENOMEM;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-16 13:40:57.205741715 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-16 13:40:58.882756612 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_table_info_lock();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_table_info_unlock();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -949,64 +949,11 @@ get_counters(const struct xt_table_info 
 	}
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_table_info_lock_all();
+	get_counters(private, counters);
+	xt_table_info_unlock_all();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_table_info_lock_all();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_table_info_unlock_all();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-16 13:40:57.226788977 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-16 13:40:58.897816063 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_table_info_lock();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_table_info_unlock();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -734,65 +734,11 @@ static void get_counters(const struct xt
 	}
 }
 
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_table_info_lock_all();
+	get_counters(private, counters);
+	xt_table_info_unlock_all();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_table_info_lock_all();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
- unlock_up_free:
-	mutex_unlock(&t->lock);
 
+ unlock_up_free:
+	xt_table_info_unlock_all();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7)
  2009-04-16 20:49                                                                   ` [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7) Stephen Hemminger
@ 2009-04-16 21:02                                                                     ` Linus Torvalds
  2009-04-16 23:04                                                                       ` Ingo Molnar
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-16 21:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, paulmck, Patrick McHardy, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Thu, 16 Apr 2009, Stephen Hemminger wrote:
>
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> rwlock that can be nested. It is sort of like earlier brwlock 
> (fast reader, slow writer). The locking is isolated so future improvements
> can concentrate on measuring/optimizing xt_table_info_lock. I tried
> other versions based on recursive spin locks and sequence counters and 
> for me, the risk of inventing own locking primitives not worth it at this time.

This is still scary.

Do we guarantee that read-locks nest in the presence of a waiting writer 
on another CPU? Now, I know we used to (ie readers always nested happily 
with readers even if there were pending writers), and then we broke it. I 
don't know that we ever unbroke it.

IOW, at least at some point we deadlocked on this (due to trying to be 
fair, and not letting in readers while earlier writers were waiting):

	CPU#1			CPU#2

	read_lock

				write_lock
				.. spins with write bit set, waiting for
				   readers to go away ..

	recursive read_lock
	.. spins due to the write bit
	   being. BOOM: deadlock  ..

Now, I _think_ we avoid this, but somebody should double-check.
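
Spelled out in code (everything here is invented for illustration, with one
plain rwlock standing in for a single CPU's entry in the per-cpu
xt_info_locks array):

	static DEFINE_RWLOCK(xt_info_lock);

	void cpu1_packet_path(void)		/* softirq on CPU#1 */
	{
		read_lock(&xt_info_lock);	/* outer ipt_do_table() */
		/* a target re-enters the stack, the hooks run again */
		read_lock(&xt_info_lock);	/* nested ipt_do_table() */
		/* if new readers queue behind a waiting writer, we spin here forever */
		read_unlock(&xt_info_lock);
		read_unlock(&xt_info_lock);
	}

	void cpu2_update_path(void)		/* iptables-restore on CPU#2 */
	{
		write_lock(&xt_info_lock);	/* waits for CPU#1's outer reader */
		/* replace table, read counters */
		write_unlock(&xt_info_lock);
	}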

Also, I have yet to hear the answer to why we care about stale 
counters of dead rules so much that we couldn't just free them later with 
RCU.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16 13:11                                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Patrick McHardy
@ 2009-04-16 22:33                                                       ` David Miller
  2009-04-16 23:49                                                         ` Paul E. McKenney
  0 siblings, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-16 22:33 UTC (permalink / raw)
  To: kaber
  Cc: torvalds, shemminger, dada1, jeff.chua.linux, paulmck, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

From: Patrick McHardy <kaber@trash.net>
Date: Thu, 16 Apr 2009 15:11:31 +0200

> Linus Torvalds wrote:
>> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
>>> The counters are the bigger problem, otherwise we could just free
>>> table
>>> info via rcu.  Do we really have to support: replace where the counter
>>> values coming out to user space are always exactly accurate, or is it
>>> allowed to replace a rule and maybe lose some counter ticks (worst
>>> case
>>> NCPU-1).
>> Why not just read the counters fromt he old one at RCU free time (they
>> are guaranteed to be stable at that point, since we're all done with
>> those entries), and apply them at that point to the current setup?
> 
> We need the counters immediately to copy them to userspace, so waiting
> for an asynchronous RCU free is not going to work.

It just occurred to me that since all netfilter packet handling
goes through one place, we could have a sort-of "netfilter RCU"
of sorts to solve this problem.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7)
  2009-04-16 21:02                                                                     ` Linus Torvalds
@ 2009-04-16 23:04                                                                       ` Ingo Molnar
  0 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-16 23:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Eric Dumazet, paulmck, Patrick McHardy,
	David Miller, jeff.chua.linux, paulus, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, 16 Apr 2009, Stephen Hemminger wrote:
> >
> > This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> > rwlock that can be nested. It is sort of like earlier brwlock 
> > (fast reader, slow writer). The locking is isolated so future improvements
> > can concentrate on measuring/optimizing xt_table_info_lock. I tried
> > other versions based on recursive spin locks and sequence counters and 
> > for me, the risk of inventing own locking primitives not worth it at this time.
> 
> This is stil scary.
> 
> Do we guarantee that read-locks nest in the presense of a waiting 
> writer on another CPU? Now, I know we used to (ie readers always 
> nested happily with readers even if there were pending writers), 
> and then we broke it. I don't know that we ever unbroke it.
> 
> IOW, at least at some point we deadlocked on this (due to trying 
> to be fair, and not lettign in readers while earlier writers were 
> waiting):
> 
> 	CPU#1			CPU#2
> 
> 	read_lock
> 
> 				write_lock
> 				.. spins with write bit set, waiting for
> 				   readers to go away ..
> 
> 	recursive read_lock
> 	.. spins due to the write bit
> 	   being. BOOM: deadlock  ..
> 
> Now, I _think_ we avoid this, but somebody should double-check.

This is a narrow special-case where the spin-rwlock is safe, and the 
rwsem is unsafe.

But it should work for rwlocks - it always worked and the networking 
code always relied on that AFAIK.

Here's the x86 assembly code of the write-side slowpath:

ENTRY(__write_lock_failed)
        CFI_STARTPROC
        LOCK_PREFIX
        addl $RW_LOCK_BIAS,(%rdi)
1:      rep
        nop
        cmpl $RW_LOCK_BIAS,(%rdi)
        jne 1b
        LOCK_PREFIX
        subl $RW_LOCK_BIAS,(%rdi)
        jnz  __write_lock_failed
        ret
        CFI_ENDPROC

the fastpath decreased the value with RW_LOCK_BIAS, and when we 
enter this function we undo that effect by adding RW_LOCK_BIAS. Then 
we spin (without signalling our write-intent) passively until the 
count reaches RW_LOCK_BIAS. Then we try to lock it again and bring 
it to zero (meaning no other readers or writers - we got the lock).
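
In rough C, an illustrative rendition of that slowpath (assuming the usual
atomic helpers; the authoritative version is the assembly above):

	static void write_lock_failed_slowpath(atomic_t *count)
	{
		atomic_add(RW_LOCK_BIAS, count);	/* undo the fastpath subtract */
		for (;;) {
			while (atomic_read(count) != RW_LOCK_BIAS)
				cpu_relax();		/* passive spin, no write intent */
			if (atomic_sub_and_test(RW_LOCK_BIAS, count))
				return;			/* hit zero: lock acquired */
			atomic_add(RW_LOCK_BIAS, count); /* lost the race, go around */
		}
	}

The key point is visible here: while spinning, the writer only reads the
count and never re-asserts its write intent, so readers (including nested
ones) keep getting through.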

This is pretty much the most unfair strategy possible for writers - 
but this is how rwlocks always behaved - and they do so mostly for 
recursive use within networking.

This is why the tasklist_lock was always so susceptible to insane 
starvation symptoms on really large SMP systems, and this is why 
write_lock_irq(&tasklist_lock) was always a dangerous operation to 
do. (It can spin for a _long_ time with irqs off.)

It's not the most optimal of situations. Some patches are in the 
works to fix the irqs-off artifact (on ia64 - no x86 patches yet 
AFAICS) - but that's just papering it over.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16 22:33                                                       ` David Miller
@ 2009-04-16 23:49                                                         ` Paul E. McKenney
  2009-04-16 23:52                                                           ` [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8) Stephen Hemminger
  2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
  0 siblings, 2 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-16 23:49 UTC (permalink / raw)
  To: David Miller
  Cc: kaber, torvalds, shemminger, dada1, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Thu, 16 Apr 2009 15:11:31 +0200
> 
> > Linus Torvalds wrote:
> >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> >>> The counters are the bigger problem, otherwise we could just free
> >>> table
> >>> info via rcu.  Do we really have to support: replace where the counter
> >>> values coming out to user space are always exactly accurate, or is it
> >>> allowed to replace a rule and maybe lose some counter ticks (worst
> >>> case
> >>> NCPU-1).
> >> Why not just read the counters fromt he old one at RCU free time (they
> >> are guaranteed to be stable at that point, since we're all done with
> >> those entries), and apply them at that point to the current setup?
> > 
> > We need the counters immediately to copy them to userspace, so waiting
> > for an asynchronous RCU free is not going to work.
> 
> It just occurred to me that since all netfilter packet handling
> goes through one place, we could have a sort-of "netfilter RCU"
> of sorts to solve this problem.

OK, I am putting one together...

It will be needed sooner or later, though I suspect per-CPU locking
would work fine in this case.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-16 23:49                                                         ` Paul E. McKenney
@ 2009-04-16 23:52                                                           ` Stephen Hemminger
  2009-04-17  0:15                                                             ` Jeff Chua
                                                                               ` (2 more replies)
  2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
  1 sibling, 3 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-16 23:52 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, kaber, torvalds, dada1, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

This version of x_tables (ip/ip6/arp) locking uses a per-cpu
recursive lock that can be nested. It is sort of like existing kernel_lock,
rwlock_t and even old 2.4 brlock.

"Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
It needs to ensure that the rules are not being changed while a packet
is being processed.

"Writer" is used in two cases: first is replacing rules in which case
all packets in flight have to be processed before rules are swapped,
then counters are read from the old (stale) info. Second case is where
counters need to be read on the fly, in this case all CPU's are blocked
from further rule processing until values are aggregated.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu: the fast path locks on the current cpu
and updates counters.  This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2). 


The mutex that was added to xt_table for 2.6.30 is unnecessary since
xt[af].mutex is already held.

Future optimizations possible:
  - Lockdep doesn't really handle this well
  - hot plug CPU case: if the kernel is built with a large # of CPUs, skip
    the inactive ones; migrate values when CPU is removed.
  - reading counters could be incremental by CPU.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |   10 +--
 net/ipv4/netfilter/arp_tables.c    |  110 ++++++++----------------------------
 net/ipv4/netfilter/ip_tables.c     |  110 +++++++-----------------------------
 net/ipv6/netfilter/ip6_tables.c    |  108 ++++++++---------------------------
 net/netfilter/x_tables.c           |  113 ++++++++++++++++++++++++++++++-------
 5 files changed, 170 insertions(+), 281 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-16 15:09:53.082406828 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-16 15:10:17.154966874 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,11 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+extern void xt_table_info_lock(void);
+extern void xt_table_info_unlock(void);
+extern void xt_table_info_lock_all(void);
+extern void xt_table_info_unlock_all(void);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-16 15:09:53.066406011 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-16 15:10:17.155966637 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_table_info_lock();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_table_info_unlock();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -918,60 +916,6 @@ get_counters(const struct xt_table_info 
 				  counters,
 				  &i);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_table_info_lock_all();
+	get_counters(private, counters);
+	xt_table_info_unlock_all();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_table_info_lock_all();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_table_info_unlock_all();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-16 15:09:53.028984288 -0700
+++ b/net/netfilter/x_tables.c	2009-04-16 16:35:36.996150064 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,6 +662,90 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+struct xt_lock {
+	spinlock_t lock;
+	int   	   depth;	/* # readers - 1 */
+};
+
+static DEFINE_PER_CPU(struct xt_lock, xt_info_locks);
+
+static void xt_lock_init(struct xt_lock *lock)
+{
+	spin_lock_init(&lock->lock);
+	lock->depth = -1;
+}
+
+/**
+ * xt_table_info_lock - recursive read lock for xt table info
+ *
+ * Used for current CPU to read table and update counters.
+ * Allows recursive locking, on same CPU.
+ */
+void xt_table_info_lock(void)
+{
+	struct xt_lock *lock;
+
+	preempt_disable();
+	lock = &__get_cpu_var(xt_info_locks);
+	if (likely(++lock->depth == 0))
+		spin_lock(&lock->lock);
+}
+EXPORT_SYMBOL_GPL(xt_table_info_lock);
+
+/**
+ * xt_table_info_unlock - release recursive table info lock
+ *
+ * Used after read table and update counters.
+ */
+void xt_table_info_unlock(void)
+{
+	struct xt_lock *lock = &__get_cpu_var(xt_info_locks);
+
+	BUG_ON(lock->depth < 0);
+	if (likely(--lock->depth < 0))
+		spin_unlock(&lock->lock);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(xt_table_info_unlock);
+
+
+/**
+ * xt_table_info_lock_all - lock xt table info for update
+ *
+ * Locks out all readers, and blocks bottom half
+ */
+void xt_table_info_lock_all(void)
+{
+	int i;
+
+	local_bh_disable();
+	for_each_possible_cpu(i) {
+		struct xt_lock *lock = &per_cpu(xt_info_locks, i);
+		spin_lock(&lock->lock);
+		BUG_ON(lock->depth != -1);
+	}
+}
+EXPORT_SYMBOL_GPL(xt_table_info_lock_all);
+
+/**
+ * xt_table_info_unlock_all - unlock xt table info after update
+ *
+ * Unlocks all readers, and unblocks bottom half
+ */
+void xt_table_info_unlock_all(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct xt_lock *lock = &per_cpu(xt_info_locks, i);
+		BUG_ON(lock->depth != -1);
+		spin_unlock(&lock->lock);
+	}
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(xt_table_info_unlock_all);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
@@ -685,22 +755,21 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	xt_table_info_lock_all();
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		xt_table_info_unlock_all();
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	table->private =  newinfo;
+	newinfo->initial_entries = private->initial_entries;
+	xt_table_info_unlock_all();
 
-	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -734,7 +803,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1149,6 +1217,9 @@ static int __init xt_init(void)
 {
 	int i, rv;
 
+	for_each_possible_cpu(i)
+		xt_lock_init(&per_cpu(xt_info_locks, i));
+
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
 		return -ENOMEM;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-16 15:09:53.041965912 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-16 15:10:17.158972154 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_table_info_lock();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_table_info_unlock();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -949,64 +949,11 @@ get_counters(const struct xt_table_info 
 	}
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_table_info_lock_all();
+	get_counters(private, counters);
+	xt_table_info_unlock_all();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_table_info_lock_all();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_table_info_unlock_all();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-16 15:09:53.052406917 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-16 15:10:17.160029716 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_table_info_lock();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_table_info_unlock();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -734,65 +734,11 @@ static void get_counters(const struct xt
 	}
 }
 
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_table_info_lock_all();
+	get_counters(private, counters);
+	xt_table_info_unlock_all();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_table_info_lock_all();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
- unlock_up_free:
-	mutex_unlock(&t->lock);
 
+ unlock_up_free:
+	xt_table_info_unlock_all();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive spinlock  (v6)
  2009-04-16 18:41                                                                 ` Eric Dumazet
@ 2009-04-17  0:13                                                                     ` Paul E. McKenney
  2009-04-17  0:13                                                                     ` Paul E. McKenney
  1 sibling, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17  0:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, David Miller,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Thu, Apr 16, 2009 at 08:41:58PM +0200, Eric Dumazet wrote:
> 
> 
> Paul E. McKenney wrote:
> > 
> > But if some other CPU holds the lock, this code would fail to wait for
> > that other CPU to release the lock, right?  It also might corrupt the
> > rl->count field due to two CPUs accessing it concurrently non-atomically.
> 
> If another cpu holds the lock, this cpu will spin on its own lock.
> 
> Remember other cpus dont touch rl->count. This is a private field, only touched
> by the cpu on its own per_cpu data. There is no possible 'corruption'

Ah!!!  I must confess that I didn't make it that far into the code...

> So the owner of the per_cpu data does :
> 
> /*
>  * disable preemption, get rl = &__get_cpu_var(arp_tables_lock);
>  * then :
>  */
> lock_time :
> if (++rl->count == 0)
> 	spin_lock(&rl->lock);
> 
> unlock_time:
> if (likely(--rl->count == 0))
> 	spin_unlock(&rl->lock);
> 
> 
> while other cpus only do :
> 
> spin_lock(&rl->lock);
> /* work on data */
> spin_unlock(&rl->lock);
> 
> So they cannot corrupt 'count' stuff.

OK, that does function.  Hurts my head, though.  ;-)

> > I suggest the following, preferably in a function or macro or something:
> > 
> > 	cur_cpu = smp_processor_id();
> > 	if (likely(rl->owner != cur_cpu)) {
> > 		spin_lock(&rl->lock);
> > 		rl->owner = smp_processor_id();
> > 		rl->count = 1;
> > 	} else {
> > 		rl->count++;
> > 	}
> > 
> > And the inverse for unlock.
> > 
> > Or am I missing something subtle?
> 
> Apparently Linus missed it too, and reacted badly to my mail.
> I dont know why we discuss of this stuff on lkml either...

Encapsulating them so that they appear in the same place might (or might
not) have gotten the fact that you were not doing a recursive lock
through my head.  Even so, the name "rec_lock" might have overwhelmed
the code structure in my mind.  ;-)

> I stop working on this subject and consider drinking some hard stuff and
> watching tv :)

That -is- extreme!  ;-)

							Thanx, Paul

> See you
> 

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-16 23:52                                                           ` [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8) Stephen Hemminger
@ 2009-04-17  0:15                                                             ` Jeff Chua
  2009-04-17  5:55                                                             ` Peter Zijlstra
  2009-04-17  6:03                                                             ` Eric Dumazet
  2 siblings, 0 replies; 254+ messages in thread
From: Jeff Chua @ 2009-04-17  0:15 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, kaber, torvalds, dada1, paulus, mingo,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh

On Fri, Apr 17, 2009 at 7:52 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested. It is sort of like existing kernel_lock,
> rwlock_t and even old 2.4 brlock.

Tested and working. As fast as before.

Thanks,
Jeff.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-16 23:49                                                         ` Paul E. McKenney
  2009-04-16 23:52                                                           ` [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8) Stephen Hemminger
@ 2009-04-17  1:28                                                           ` Paul E. McKenney
  2009-04-17  2:19                                                             ` Mathieu Desnoyers
                                                                               ` (3 more replies)
  1 sibling, 4 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17  1:28 UTC (permalink / raw)
  To: David Miller
  Cc: kaber, torvalds, shemminger, dada1, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh, mathieu.desnoyers

On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > From: Patrick McHardy <kaber@trash.net>
> > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > 
> > > Linus Torvalds wrote:
> > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > >>> The counters are the bigger problem, otherwise we could just free
> > >>> table
> > >>> info via rcu.  Do we really have to support: replace where the counter
> > >>> values coming out to user space are always exactly accurate, or is it
> > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > >>> case
> > >>> NCPU-1).
> > >> Why not just read the counters fromt he old one at RCU free time (they
> > >> are guaranteed to be stable at that point, since we're all done with
> > >> those entries), and apply them at that point to the current setup?
> > > 
> > > We need the counters immediately to copy them to userspace, so waiting
> > > for an asynchronous RCU free is not going to work.
> > 
> > It just occurred to me that since all netfilter packet handling
> > goes through one place, we could have a sort-of "netfilter RCU"
> > of sorts to solve this problem.
> 
> OK, I am putting one together...
> 
> It will be needed sooner or later, though I suspect per-CPU locking
> would work fine in this case.

And here is a crude first cut.  Untested, probably does not even compile.

Straight conversion of Mathieu Desnoyers's user-space RCU implementation
at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
a little, but he must bear the bulk of the guilt).  Pick on srcu.h
and srcu.c out of sheer laziness.  User-space testing gives deep
sub-microsecond grace-period latencies, so should be fast enough, at
least if you don't mind two smp_call_function() invocations per grace
period and spinning on each instance of a per-CPU variable.

Again, I believe per-CPU locking should work fine for the netfilter
counters, but I guess "friends don't let friends use hashed locks".
(I would not know for sure, never having used them myself, except of
course to protect hash tables.)

Most definitely -not- for inclusion at this point.  Next step is to hack
up the relevant rcutorture code and watch it explode on contact.  ;-)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 include/linux/srcu.h |   30 ++++++++++++++++++++++++
 kernel/srcu.c        |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index aca0eee..4577cd0 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -50,4 +50,34 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
 void synchronize_srcu(struct srcu_struct *sp);
 long srcu_batches_completed(struct srcu_struct *sp);
 
+/* Single bit for grace-period index, low-order bits are nesting counter. */
+#define RCU_FGP_COUNT		1UL
+#define RCU_FGP_PARITY		(1UL << (sizeof(long) << 2))
+#define RCU_FGP_NEST_MASK	(RCU_FGP_PARITY - 1)
+
+extern long rcu_fgp_ctr;
+DECLARE_PER_CPU(long, rcu_fgp_active_readers);
+
+static inline void rcu_read_lock_fgp(void)
+{
+	long tmp;
+	long *uarp;
+
+	preempt_disable();
+	uarp = &__get_cpu_var(rcu_fgp_active_readers);
+	tmp = *uarp;
+	if (likely(!(tmp & RCU_FGP_NEST_MASK)))
+		*uarp = rcu_fgp_ctr;  /* Outermost rcu_read_lock(). */
+	else
+		*uarp = tmp + RCU_FGP_COUNT;  /* Nested rcu_read_lock(). */
+	barrier();
+}
+
+static inline void rcu_read_unlock_fgp(void)
+{
+	barrier();
+	__get_cpu_var(rcu_fgp_active_readers)--;
+	preempt_enable();
+}
+
 #endif
diff --git a/kernel/srcu.c b/kernel/srcu.c
index b0aeeaf..a5dc464 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -255,3 +255,66 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
 EXPORT_SYMBOL_GPL(srcu_read_unlock);
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+DEFINE_MUTEX(rcu_fgp_mutex);
+long rcu_fgp_ctr = RCU_FGP_COUNT;
+DEFINE_PER_CPU(long, rcu_fgp_active_readers);
+
+/*
+ * Determine if the specified counter value indicates that we need to
+ * wait on the corresponding CPU to exit an rcu_fgp read-side critical
+ * section.  Return non-zero if so.
+ *
+ * Assumes that rcu_fgp_mutex is held, and thus that rcu_fgp_ctr is
+ * unchanging.
+ */
+static inline int rcu_old_fgp_ongoing(long *value)
+{
+	long v = ACCESS_ONCE(*value);
+
+	return (v & RCU_FGP_NEST_MASK) &&
+	       ((v ^ rcu_fgp_ctr) & RCU_FGP_PARITY);
+}
+
+static void rcu_fgp_wait_for_quiescent_state(void)
+{
+	int cpu;
+	long *uarp;
+
+	for_each_online_cpu(cpu) {
+		uarp = &per_cpu(rcu_fgp_active_readers, cpu);
+		while (rcu_old_fgp_ongoing(uarp))
+			cpu_relax();
+	}
+}
+
+static void rcu_fgp_do_mb(void *unused)
+{
+	smp_mb();  /* Each CPU does a memory barrier. */
+}
+
+void synchronize_rcu_fgp(void)
+{
+	mutex_lock(&rcu_fgp_mutex);
+	
+	/* CPUs must see earlier change before parity flip. */
+	smp_call_function(rcu_fgp_do_mb, NULL, 1);
+
+	/*
+	 * We must flip twice to correctly handle tasks that stall
+	 * in rcu_read_lock_fgp() between the time that they fetch
+	 * rcu_fgp_ctr and the time that the store to their CPU's
+	 * rcu_fgp_active_readers.  No matter when they resume
+	 * execution, we will wait for them to get to the corresponding
+	 * rcu_read_unlock_fgp().
+	 */
+	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 0 -> 1 */
+	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
+	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 1 -> 0 */
+	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
+
+	/* Prevent CPUs from reordering out of prior RCU critical sections. */
+	smp_call_function(rcu_fgp_do_mb, NULL, 1);
+
+	mutex_unlock(&rcu_fgp_mutex);
+}

^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
@ 2009-04-17  2:19                                                             ` Mathieu Desnoyers
  2009-04-17  5:05                                                               ` Paul E. McKenney
  2009-04-17  4:50                                                             ` Stephen Hemminger
                                                                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-17  2:19 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

* Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > From: Patrick McHardy <kaber@trash.net>
> > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > 
> > > > Linus Torvalds wrote:
> > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > >>> The counters are the bigger problem, otherwise we could just free
> > > >>> table
> > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > >>> values coming out to user space are always exactly accurate, or is it
> > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > >>> case
> > > >>> NCPU-1).
> > > >> Why not just read the counters fromt he old one at RCU free time (they
> > > >> are guaranteed to be stable at that point, since we're all done with
> > > >> those entries), and apply them at that point to the current setup?
> > > > 
> > > > We need the counters immediately to copy them to userspace, so waiting
> > > > for an asynchronous RCU free is not going to work.
> > > 
> > > It just occurred to me that since all netfilter packet handling
> > > goes through one place, we could have a sort-of "netfilter RCU"
> > > of sorts to solve this problem.
> > 
> > OK, I am putting one together...
> > 
> > It will be needed sooner or later, though I suspect per-CPU locking
> > would work fine in this case.
> 
> And here is a crude first cut.  Untested, probably does not even compile.
> 
> Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> a little, but he must bear the bulk of the guilt).

I'm innocent, I swear ;-)

That should give very impressive performance results.

Please see comments below,

>   Pick on srcu.h
> and srcu.c out of sheer laziness.  User-space testing gives deep
> sub-microsecond grace-period latencies, so should be fast enough, at
> least if you don't mind two smp_call_function() invocations per grace
> period and spinning on each instance of a per-CPU variable.
> 
> Again, I believe per-CPU locking should work fine for the netfilter
> counters, but I guess "friends don't let friends use hashed locks".
> (I would not know for sure, never having used them myself, except of
> course to protect hash tables.)
> 
> Most definitely -not- for inclusion at this point.  Next step is to hack
> up the relevant rcutorture code and watch it explode on contact.  ;-)
> 
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---
> 
>  include/linux/srcu.h |   30 ++++++++++++++++++++++++
>  kernel/srcu.c        |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 93 insertions(+)
> 
> diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> index aca0eee..4577cd0 100644
> --- a/include/linux/srcu.h
> +++ b/include/linux/srcu.h
> @@ -50,4 +50,34 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
>  void synchronize_srcu(struct srcu_struct *sp);
>  long srcu_batches_completed(struct srcu_struct *sp);
>  
> +/* Single bit for grace-period index, low-order bits are nesting counter. */
> +#define RCU_FGP_COUNT		1UL
> +#define RCU_FGP_PARITY		(1UL << (sizeof(long) << 2))
> +#define RCU_FGP_NEST_MASK	(RCU_FGP_PARITY - 1)
> +
> +extern long rcu_fgp_ctr;
> +DECLARE_PER_CPU(long, rcu_fgp_active_readers);
> +
> +static inline void rcu_read_lock_fgp(void)
> +{
> +	long tmp;
> +	long *uarp;
> +
> +	preempt_disable();
> +	uarp = &__get_cpu_var(rcu_fgp_active_readers);

OK, so we are translating the original implementation from per-thread to
per-cpu, with preemption disabled. Fine with me if we can't afford the
per-thread unsigned long nor can afford to iterate on each thread when
waiting for an RCU quiescent state.

> +	tmp = *uarp;
> +	if (likely(!(tmp & RCU_FGP_NEST_MASK)))
> +		*uarp = rcu_fgp_ctr;  /* Outermost rcu_read_lock(). */

ACCESS_ONCE(rcu_fgp_ctr) could not hurt here I think. Given the
surrounding code, that does not seem like a necessity, but that would
express that it is really an atomic read.

> +	else
> +		*uarp = tmp + RCU_FGP_COUNT;  /* Nested rcu_read_lock(). */
> +	barrier();

I kind of expect an IPI with a smp_mb() to promote this barrier() to a
smp_mb() when the update side needs to wait for a quiescent state. I
guess a comment noting this here would not hurt.

> +}
> +
> +static inline void rcu_read_unlock_fgp(void)
> +{
> +	barrier();

Same here.

> +	__get_cpu_var(rcu_fgp_active_readers)--;
> +	preempt_enable();
> +}
> +
>  #endif
> diff --git a/kernel/srcu.c b/kernel/srcu.c
> index b0aeeaf..a5dc464 100644
> --- a/kernel/srcu.c
> +++ b/kernel/srcu.c
> @@ -255,3 +255,66 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
>  EXPORT_SYMBOL_GPL(srcu_read_unlock);
>  EXPORT_SYMBOL_GPL(synchronize_srcu);
>  EXPORT_SYMBOL_GPL(srcu_batches_completed);
> +
> +DEFINE_MUTEX(rcu_fgp_mutex);
> +long rcu_fgp_ctr = RCU_FGP_COUNT;

Saying why we populate the value 1 (RCU_FGP_COUNT) here as a
read-side optimization might help explain this choice.
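
Something along these lines, perhaps (my reading of why the initial value is
RCU_FGP_COUNT rather than 0):

	long rcu_fgp_ctr = RCU_FGP_COUNT;	/* nest count of 1 baked in, so an
						   outermost reader copying this value
						   never has zero nest bits, i.e. never
						   looks quiescent to the updater */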

> +DEFINE_PER_CPU(long, rcu_fgp_active_readers);
> +
> +/*
> + * Determine if the specified counter value indicates that we need to
> + * wait on the corresponding CPU to exit an rcu_fgp read-side critical
> + * section.  Return non-zero if so.
> + *
> + * Assumes that rcu_fgp_mutex is held, and thus that rcu_fgp_ctr is
> + * unchanging.
> + */
> +static inline int rcu_old_fgp_ongoing(long *value)
> +{
> +	long v = ACCESS_ONCE(*value);
> +
> +	return (v & RCU_FGP_NEST_MASK) &&
> +	       ((v ^ rcu_fgp_ctr) & RCU_FGP_PARITY);
> +}
> +
> +static void rcu_fgp_wait_for_quiescent_state(void)
> +{
> +	int cpu;
> +	long *uarp;
> +
> +	for_each_online_cpu(cpu) {
> +		uarp = &per_cpu(rcu_fgp_active_readers, cpu);
> +		while (rcu_old_fgp_ongoing(uarp))
> +			cpu_relax();

I would be tempted to add a comment here noting that cpu hotunplug
cannot make us wait forever, given that all read-side critical sections
we can be busy-waiting on are required to have preemption disabled, and
are therefore cpu-hotplug safe.

> +	}
> +}
> +
> +static void rcu_fgp_do_mb(void *unused)
> +{
> +	smp_mb();  /* Each CPU does a memory barrier. */
> +}

Ah, here it is. Commenting that it matches the two barrier()s I identified
above would be good.

> +
> +void synchronize_rcu_fgp(void)
> +{
> +	mutex_lock(&rcu_fgp_mutex);
> +	
> +	/* CPUs must see earlier change before parity flip. */
> +	smp_call_function(rcu_fgp_do_mb, NULL, 1);

/*
 * Call a function on all other processors
 */
int smp_call_function(void(*func)(void *info), void *info, int wait);

I guess you meant on_each_cpu ? That should include "self", given we
also need the smp_mb().
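
I.e., roughly:

	on_each_cpu(rcu_fgp_do_mb, NULL, 1);	/* runs the barrier on every CPU,
						   including the calling one */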

> +
> +	/*
> +	 * We must flip twice to correctly handle tasks that stall
> +	 * in rcu_read_lock_fgp() between the time that they fetch
> +	 * rcu_fgp_ctr and the time that the store to their CPU's
> +	 * rcu_fgp_active_readers.  No matter when they resume
> +	 * execution, we will wait for them to get to the corresponding
> +	 * rcu_read_unlock_fgp().
> +	 */
> +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 0 -> 1 */
> +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 1 -> 0 */
> +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> +
> +	/* Prevent CPUs from reordering out of prior RCU critical sections. */
> +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> +

Same as above.

Mathieu, who can still recognise his original userspace implementation
:-)

> +	mutex_unlock(&rcu_fgp_mutex);
> +}

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
  2009-04-17  2:19                                                             ` Mathieu Desnoyers
@ 2009-04-17  4:50                                                             ` Stephen Hemminger
  2009-04-17  5:08                                                               ` Paul E. McKenney
  2009-04-17  5:16                                                                 ` Eric Dumazet
  2009-04-17  6:12                                                             ` Peter Zijlstra
  2009-04-18  9:40                                                             ` Evgeniy Polyakov
  3 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-17  4:50 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, kaber, torvalds, dada1, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh, mathieu.desnoyers

On Thu, 16 Apr 2009 18:28:12 -0700
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > From: Patrick McHardy <kaber@trash.net>
> > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > 
> > > > Linus Torvalds wrote:
> > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > >>> The counters are the bigger problem, otherwise we could just free
> > > >>> table
> > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > >>> values coming out to user space are always exactly accurate, or is it
> > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > >>> case
> > > >>> NCPU-1).
> > > >> Why not just read the counters fromt he old one at RCU free time (they
> > > >> are guaranteed to be stable at that point, since we're all done with
> > > >> those entries), and apply them at that point to the current setup?
> > > > 
> > > > We need the counters immediately to copy them to userspace, so waiting
> > > > for an asynchronous RCU free is not going to work.
> > > 
> > > It just occurred to me that since all netfilter packet handling
> > > goes through one place, we could have a sort-of "netfilter RCU"
> > > of sorts to solve this problem.
> > 
> > OK, I am putting one together...
> > 
> > It will be needed sooner or later, though I suspect per-CPU locking
> > would work fine in this case.
> 
> And here is a crude first cut.  Untested, probably does not even compile.
> 
> Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> and srcu.c out of sheer laziness.  User-space testing gives deep
> sub-microsecond grace-period latencies, so should be fast enough, at
> least if you don't mind two smp_call_function() invocations per grace
> period and spinning on each instance of a per-CPU variable.
> 
> Again, I believe per-CPU locking should work fine for the netfilter
> counters, but I guess "friends don't let friends use hashed locks".
> (I would not know for sure, never having used them myself, except of
> course to protect hash tables.)
> 
> Most definitely -not- for inclusion at this point.  Next step is to hack
> up the relevant rcutorture code and watch it explode on contact.  ;-)
> 
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

I am glad to see this worked on, but would rather not use RCU in this
iptables case. It would be good for some of the other long grace period stuff.

The code for per-cpu entry consolidation by alloc/flip in 2.6.30-rc2 was
hard to debug and more convoluted, so it probably would be a long term
maintenance nightmare.  The issue was the variable size skip structure,
which made for lots of iterators, etc. If the non-RCU per-cpu spinlock
version is just as fast, it is easier to understand.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  2:19                                                             ` Mathieu Desnoyers
@ 2009-04-17  5:05                                                               ` Paul E. McKenney
  2009-04-17  5:44                                                                 ` Mathieu Desnoyers
  0 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17  5:05 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Thu, Apr 16, 2009 at 10:19:02PM -0400, Mathieu Desnoyers wrote:
> * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> > On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > > From: Patrick McHardy <kaber@trash.net>
> > > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > > 
> > > > > Linus Torvalds wrote:
> > > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > > >>> The counters are the bigger problem, otherwise we could just free
> > > > >>> table
> > > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > > >>> values coming out to user space are always exactly accurate, or is it
> > > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > > >>> case
> > > > >>> NCPU-1).
> > > > >> Why not just read the counters fromt he old one at RCU free time (they
> > > > >> are guaranteed to be stable at that point, since we're all done with
> > > > >> those entries), and apply them at that point to the current setup?
> > > > > 
> > > > > We need the counters immediately to copy them to userspace, so waiting
> > > > > for an asynchronous RCU free is not going to work.
> > > > 
> > > > It just occurred to me that since all netfilter packet handling
> > > > goes through one place, we could have a sort-of "netfilter RCU"
> > > > of sorts to solve this problem.
> > > 
> > > OK, I am putting one together...
> > > 
> > > It will be needed sooner or later, though I suspect per-CPU locking
> > > would work fine in this case.
> > 
> > And here is a crude first cut.  Untested, probably does not even compile.
> > 
> > Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> > at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> > a little, but he must bear the bulk of the guilt).
> 
> I'm innocent, I swear ;-)

That is what they -all- say!!!  ;-)

> That should give very impressive performance results.

I wouldn't expect more than about three or four orders of magnitude
improvement on the update side compared to Classic RCU, but who knows?

> Please see comments below,
> 
> >   Pick on srcu.h
> > and srcu.c out of sheer laziness.  User-space testing gives deep
> > sub-microsecond grace-period latencies, so should be fast enough, at
> > least if you don't mind two smp_call_function() invocations per grace
> > period and spinning on each instance of a per-CPU variable.
> > 
> > Again, I believe per-CPU locking should work fine for the netfilter
> > counters, but I guess "friends don't let friends use hashed locks".
> > (I would not know for sure, never having used them myself, except of
> > course to protect hash tables.)
> > 
> > Most definitely -not- for inclusion at this point.  Next step is to hack
> > up the relevant rcutorture code and watch it explode on contact.  ;-)
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> > 
> >  include/linux/srcu.h |   30 ++++++++++++++++++++++++
> >  kernel/srcu.c        |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 93 insertions(+)
> > 
> > diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> > index aca0eee..4577cd0 100644
> > --- a/include/linux/srcu.h
> > +++ b/include/linux/srcu.h
> > @@ -50,4 +50,34 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
> >  void synchronize_srcu(struct srcu_struct *sp);
> >  long srcu_batches_completed(struct srcu_struct *sp);
> >  
> > +/* Single bit for grace-period index, low-order bits are nesting counter. */
> > +#define RCU_FGP_COUNT		1UL
> > +#define RCU_FGP_PARITY		(1UL << (sizeof(long) << 2))
> > +#define RCU_FGP_NEST_MASK	(RCU_FGP_PARITY - 1)
> > +
> > +extern long rcu_fgp_ctr;
> > +DECLARE_PER_CPU(long, rcu_fgp_active_readers);
> > +
> > +static inline void rcu_read_lock_fgp(void)
> > +{
> > +	long tmp;
> > +	long *uarp;
> > +
> > +	preempt_disable();
> > +	uarp = &__get_cpu_var(rcu_fgp_active_readers);
> 
> OK, so we are translating the original implementation from per-thread to
> per-cpu, with preemption disabled. Fine with me if we can't afford the
> per-thread unsigned long, or can't afford to iterate over each thread when
> waiting for RCU quiescent state.

The iterating on each thread was what stopped me.

> > +	tmp = *uarp;
> > +	if (likely(!(tmp & RCU_FGP_NEST_MASK)))
> > +		*uarp = rcu_fgp_ctr;  /* Outermost rcu_read_lock(). */
> 
> ACCESS_ONCE(rcu_fgp_ctr) could not hurt here I think. Given the
> surrounding code, that does not seem like a necessity, but that would
> express that it is really an atomic read.

I believe that it is safe.  Only one bit of rcu_fgp_ctr ever changes,
so we should be immune from load tearing.  We only load it once and
only do one thing with it, and we have a barrier() before (as part
of preempt_disable()) and after, so I don't think that the compiler
has much latitude here.  In theory, we could get store tearing through
*uarp, but if gcc did that, much of the kernel would go down in flames.

In contrast, in the user-mode version, there was no barrier() on entry,
permitting the compiler much more mischief.

> > +	else
> > +		*uarp = tmp + RCU_FGP_COUNT;  /* Nested rcu_read_lock(). */
> > +	barrier();
> 
> I kind of expect an IPI with a smp_mb() to promote this barrier() to a
> smp_mb() when the update side needs to wait for a quiescent state. I
> guess a comment telling this here would not hurt.

If you insist.  ;-)

> > +}
> > +
> > +static inline void rcu_read_unlock_fgp(void)
> > +{
> > +	barrier();
> 
> Same here.

Likewise!

> > +	__get_cpu_var(rcu_fgp_active_readers)--;
> > +	preempt_enable();
> > +}
> > +
> >  #endif
> > diff --git a/kernel/srcu.c b/kernel/srcu.c
> > index b0aeeaf..a5dc464 100644
> > --- a/kernel/srcu.c
> > +++ b/kernel/srcu.c
> > @@ -255,3 +255,66 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
> >  EXPORT_SYMBOL_GPL(srcu_read_unlock);
> >  EXPORT_SYMBOL_GPL(synchronize_srcu);
> >  EXPORT_SYMBOL_GPL(srcu_batches_completed);
> > +
> > +DEFINE_MUTEX(rcu_fgp_mutex);
> > +long rcu_fgp_ctr = RCU_FGP_COUNT;
> 
> Saying why we populate the value 1 here (RCU_FGP_COUNT) as an
> optimization for the read-side might help understanding this choice.

Good point, done.

> > +DEFINE_PER_CPU(long, rcu_fgp_active_readers);
> > +
> > +/*
> > + * Determine if the specified counter value indicates that we need to
> > + * wait on the corresponding CPU to exit an rcu_fgp read-side critical
> > + * section.  Return non-zero if so.
> > + *
> > + * Assumes that rcu_fgp_mutex is held, and thus that rcu_fgp_ctr is
> > + * unchanging.
> > + */
> > +static inline int rcu_old_fgp_ongoing(long *value)
> > +{
> > +	long v = ACCESS_ONCE(*value);
> > +
> > +	return (v & RCU_FGP_NEST_MASK) &&
> > +	       ((v ^ rcu_fgp_ctr) & RCU_FGP_PARITY);
> > +}
> > +
> > +static void rcu_fgp_wait_for_quiescent_state(void)
> > +{
> > +	int cpu;
> > +	long *uarp;
> > +
> > +	for_each_online_cpu(cpu) {
> > +		uarp = &per_cpu(rcu_fgp_active_readers, cpu);
> > +		while (rcu_old_fgp_ongoing(uarp))
> > +			cpu_relax();
> 
> I would be tempted to add a comment here noting that cpu hot-unplug
> cannot make us wait forever, given that all read-side critical sections we can
> be busy-waiting on are required to have preemption disabled, and are
> therefore cpu-hotplug safe.

Good point -- I hadn't even considered CPU hotplug, so got very lucky.

> > +	}
> > +}
> > +
> > +static void rcu_fgp_do_mb(void *unused)
> > +{
> > +	smp_mb();  /* Each CPU does a memory barrier. */
> > +}
> 
> Ah, here it is. Commenting that it matches the two barrier()s I identified
> above would be good.

Good point, reworded.

> > +
> > +void synchronize_rcu_fgp(void)
> > +{
> > +	mutex_lock(&rcu_fgp_mutex);
> > +	
> > +	/* CPUs must see earlier change before parity flip. */
> > +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> 
> /*
>  * Call a function on all other processors
>  */
> int smp_call_function(void(*func)(void *info), void *info, int wait);
> 
> I guess you meant on_each_cpu ? That should include "self", given we
> also need the smp_mb().

Hmmm...  Why do we need "self"?  Because synchronize_rcu_fgp() blocks,
it had jolly well better not be within a read-side critical section.

So, what am I missing here?
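
As a purely illustrative aside, the API difference in question, sketched with a
hypothetical handler: smp_call_function() runs the handler on every other online
CPU, while on_each_cpu() also runs it locally with preemption disabled.

static void do_mb_example(void *unused)
{
	smp_mb();	/* one full memory barrier per CPU */
}

static void barrier_everyone_example(void)
{
	smp_call_function(do_mb_example, NULL, 1);	/* every CPU but the caller */
	on_each_cpu(do_mb_example, NULL, 1);		/* every CPU, caller included */
}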

> > +
> > +	/*
> > +	 * We must flip twice to correctly handle tasks that stall
> > +	 * in rcu_read_lock_fgp() between the time that they fetch
> > +	 * rcu_fgp_ctr and the time that the store to their CPU's
> > +	 * rcu_fgp_active_readers.  No matter when they resume
> > +	 * execution, we will wait for them to get to the corresponding
> > +	 * rcu_read_unlock_fgp().
> > +	 */
> > +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 0 -> 1 */
> > +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> > +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 1 -> 0 */
> > +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> > +
> > +	/* Prevent CPUs from reordering out of prior RCU critical sections. */
> > +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> > +
> 
> Same as above.

Same as above.  ;-)

> Mathieu, who can still recognise his original userspace implementation
> :-)

Yeah, I never was all that good at disguising code anyway.  But I did
keep a couple of changes.  ;-)

Updated patch below.

							Thanx, Paul

------------------------------------------------------------------------

And here is a crude second cut.  Untested, probably does not even compile.

Straight conversion of Mathieu Desnoyers's user-space RCU implementation
at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
a little, but he must bear the bulk of the guilt).  Pick on srcu.h
and srcu.c out of sheer laziness.  User-space testing gives deep
sub-microsecond grace-period latencies, so should be fast enough, at
least if you don't mind two smp_call_function() invocations per grace
period and spinning on each instance of a per-CPU variable.

Again, I believe per-CPU locking should work fine for the netfilter
counters, but I guess "friends don't let friends use hashed locks".
(I would not know for sure, never having used them myself, except of
course to protect hash tables.)

Most definitely -not- for inclusion at this point.  Next step is to hack
up the relevant rcutorture code and watch it explode on contact.  ;-)

Changes since v1:

o	Applied Mathieu's feedback.

o	Added docbook headers and other comments.

o	Added the rcu_fgp_batches_completed API required by rcutorture.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 include/linux/srcu.h |   42 ++++++++++++++++++++++++
 kernel/srcu.c        |   89 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
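
For orientation, a sketch of how a caller might use this primitive (hypothetical
types and data, not part of the patch):

struct foo {
	int val;
};
static struct foo *global_foo;		/* hypothetical fgp-protected pointer */

static int reader_example(void)
{
	struct foo *f;
	int ret = -1;

	rcu_read_lock_fgp();
	f = rcu_dereference(global_foo);
	if (f)
		ret = f->val;
	rcu_read_unlock_fgp();
	return ret;
}

static void updater_example(struct foo *newp)
{
	struct foo *old = global_foo;

	rcu_assign_pointer(global_foo, newp);
	synchronize_rcu_fgp();		/* all pre-existing fgp readers are done */
	kfree(old);
}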


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  4:50                                                             ` Stephen Hemminger
@ 2009-04-17  5:08                                                               ` Paul E. McKenney
  2009-04-17  5:16                                                                 ` Eric Dumazet
  1 sibling, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17  5:08 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, kaber, torvalds, dada1, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh, mathieu.desnoyers

On Thu, Apr 16, 2009 at 09:50:33PM -0700, Stephen Hemminger wrote:
> On Thu, 16 Apr 2009 18:28:12 -0700
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > > From: Patrick McHardy <kaber@trash.net>
> > > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > > 
> > > > > Linus Torvalds wrote:
> > > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > > >>> The counters are the bigger problem, otherwise we could just free
> > > > >>> table
> > > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > > >>> values coming out to user space are always exactly accurate, or is it
> > > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > > >>> case
> > > > >>> NCPU-1).
> > > > >> Why not just read the counters from the old one at RCU free time (they
> > > > >> are guaranteed to be stable at that point, since we're all done with
> > > > >> those entries), and apply them at that point to the current setup?
> > > > > 
> > > > > We need the counters immediately to copy them to userspace, so waiting
> > > > > for an asynchronous RCU free is not going to work.
> > > > 
> > > > It just occurred to me that since all netfilter packet handling
> > > > goes through one place, we could have a sort-of "netfilter RCU"
> > > > of sorts to solve this problem.
> > > 
> > > OK, I am putting one together...
> > > 
> > > It will be needed sooner or later, though I suspect per-CPU locking
> > > would work fine in this case.
> > 
> > And here is a crude first cut.  Untested, probably does not even compile.
> > 
> > Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> > at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> > a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> > and srcu.c out of sheer laziness.  User-space testing gives deep
> > sub-microsecond grace-period latencies, so should be fast enough, at
> > least if you don't mind two smp_call_function() invocations per grace
> > period and spinning on each instance of a per-CPU variable.
> > 
> > Again, I believe per-CPU locking should work fine for the netfilter
> > counters, but I guess "friends don't let friends use hashed locks".
> > (I would not know for sure, never having used them myself, except of
> > course to protect hash tables.)
> > 
> > Most definitely -not- for inclusion at this point.  Next step is to hack
> > up the relevant rcutorture code and watch it explode on contact.  ;-)
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> I am glad to see this worked on, but would rather not use RCU in this case
> of iptables. It would be good for some of the other long grace period stuff.

Agreed, as noted above.  Mostly just getting tired of people complaining
about long grace periods.  Again, this patch cannot replace standard RCU
for reasons noted earlier in this thread.

> The code to per-cpu entry consolidation by alloc/flip in 2.6.30-rc2 was
> hard to debug and more convoluted so it probably would be a long term maintenance
> nightmare.  The issue was the variable size skip structure so it made
> for lots of iterators, etc. If the non-RCU per-cpu spinlock version is just
> as fast, it is easier to understand.

Your per-CPU-lock patch looked more straightforward to me than did the
RCU patch.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  4:50                                                             ` Stephen Hemminger
@ 2009-04-17  5:16                                                                 ` Eric Dumazet
  2009-04-17  5:16                                                                 ` Eric Dumazet
  1 sibling, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-17  5:16 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, kaber, torvalds, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh, mathieu.desnoyers

Stephen Hemminger a écrit :
> On Thu, 16 Apr 2009 18:28:12 -0700
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
>> On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
>>> On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
>>>> From: Patrick McHardy <kaber@trash.net>
>>>> Date: Thu, 16 Apr 2009 15:11:31 +0200
>>>>
>>>>> Linus Torvalds wrote:
>>>>>> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
>>>>>>> The counters are the bigger problem, otherwise we could just free
>>>>>>> table
>>>>>>> info via rcu.  Do we really have to support: replace where the counter
>>>>>>> values coming out to user space are always exactly accurate, or is it
>>>>>>> allowed to replace a rule and maybe lose some counter ticks (worst
>>>>>>> case
>>>>>>> NCPU-1).
>>>>>> Why not just read the counters from the old one at RCU free time (they
>>>>>> are guaranteed to be stable at that point, since we're all done with
>>>>>> those entries), and apply them at that point to the current setup?
>>>>> We need the counters immediately to copy them to userspace, so waiting
>>>>> for an asynchronous RCU free is not going to work.
>>>> It just occurred to me that since all netfilter packet handling
>>>> goes through one place, we could have a sort-of "netfilter RCU"
>>>> of sorts to solve this problem.
>>> OK, I am putting one together...
>>>
>>> It will be needed sooner or later, though I suspect per-CPU locking
>>> would work fine in this case.
>> And here is a crude first cut.  Untested, probably does not even compile.
>>
>> Straight conversion of Mathieu Desnoyers's user-space RCU implementation
>> at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
>> a little, but he must bear the bulk of the guilt).  Pick on srcu.h
>> and srcu.c out of sheer laziness.  User-space testing gives deep
>> sub-microsecond grace-period latencies, so should be fast enough, at
>> least if you don't mind two smp_call_function() invocations per grace
>> period and spinning on each instance of a per-CPU variable.
>>
>> Again, I believe per-CPU locking should work fine for the netfilter
>> counters, but I guess "friends don't let friends use hashed locks".
>> (I would not know for sure, never having used them myself, except of
>> course to protect hash tables.)
>>
>> Most definitely -not- for inclusion at this point.  Next step is to hack
>> up the relevant rcutorture code and watch it explode on contact.  ;-)
>>
>> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> I am glad to see this worked on, but would rather not use RCU in this case
> of iptables. It would be good for some of the other long grace period stuff.
> 
> The code to per-cpu entry consolidation by alloc/flip in 2.6.30-rc2 was
> hard to debug and more convoluted so it probably would be a long term maintenance
> nightmare.  The issue was the variable size skip structure so it made
> for lots of iterators, etc. If the non-RCU per-cpu spinlock version is just
> as fast, it is easier to understand.

I agree that for 2.6.30 we could use a per-cpu spinlock as your last patch did;
it would be very risky to push this new RCU right now.

But this new stuff looks very promising (no more locked ops on the fast path)
and, considering the new percpu_{add|sub...} infra, very fast:

static inline void rcu_read_unlock_fgp(void)
{
	barrier();
	percpu_sub(rcu_fgp_active_readers, 1); /* one instruction on x86 */
	preempt_enable();
}

I wonder if IPIs are really necessary on x86 if we use percpu_sub(), since
it already contains a barrier, and rcu_read_lock_fgp(void) also ends with
a barrier() call...


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  5:16                                                                 ` Eric Dumazet
@ 2009-04-17  5:40                                                                   ` Paul E. McKenney
  -1 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17  5:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, kaber, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Fri, Apr 17, 2009 at 07:16:32AM +0200, Eric Dumazet wrote:
> Stephen Hemminger a écrit :
> > On Thu, 16 Apr 2009 18:28:12 -0700
> > "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> > 
> >> On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> >>> On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> >>>> From: Patrick McHardy <kaber@trash.net>
> >>>> Date: Thu, 16 Apr 2009 15:11:31 +0200
> >>>>
> >>>>> Linus Torvalds wrote:
> >>>>>> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> >>>>>>> The counters are the bigger problem, otherwise we could just free
> >>>>>>> table
> >>>>>>> info via rcu.  Do we really have to support: replace where the counter
> >>>>>>> values coming out to user space are always exactly accurate, or is it
> >>>>>>> allowed to replace a rule and maybe lose some counter ticks (worst
> >>>>>>> case
> >>>>>>> NCPU-1).
> >>>>>> Why not just read the counters from the old one at RCU free time (they
> >>>>>> are guaranteed to be stable at that point, since we're all done with
> >>>>>> those entries), and apply them at that point to the current setup?
> >>>>> We need the counters immediately to copy them to userspace, so waiting
> >>>>> for an asynchronous RCU free is not going to work.
> >>>> It just occurred to me that since all netfilter packet handling
> >>>> goes through one place, we could have a sort-of "netfilter RCU"
> >>>> of sorts to solve this problem.
> >>> OK, I am putting one together...
> >>>
> >>> It will be needed sooner or later, though I suspect per-CPU locking
> >>> would work fine in this case.
> >> And here is a crude first cut.  Untested, probably does not even compile.
> >>
> >> Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> >> at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> >> a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> >> and srcu.c out of sheer laziness.  User-space testing gives deep
> >> sub-microsecond grace-period latencies, so should be fast enough, at
> >> least if you don't mind two smp_call_function() invocations per grace
> >> period and spinning on each instance of a per-CPU variable.
> >>
> >> Again, I believe per-CPU locking should work fine for the netfilter
> >> counters, but I guess "friends don't let friends use hashed locks".
> >> (I would not know for sure, never having used them myself, except of
> >> course to protect hash tables.)
> >>
> >> Most definitely -not- for inclusion at this point.  Next step is to hack
> >> up the relevant rcutorture code and watch it explode on contact.  ;-)
> >>
> >> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > 
> > I am glad to see this worked on, but would rather not use RCU in this case
> > of iptables. It would be good for some of the other long grace period stuff.
> > 
> > The code to per-cpu entry consolidation by alloc/flip in 2.6.30-rc2 was
> > hard to debug and more convoluted so it probably would be a long term maintenance
> > nightmare.  The issue was the variable size skip structure so it made
> > for lots of iterators, etc. If the non-RCU per-cpu spinlock version is just
> > as fast, it is easier to understand.
> 
> I agree that for 2.6.30, we could use a per-cpu spinlock as your last patch did,
> this would be very risky to push this new RCU right now. 

I completely agree that this RCU is absolutely -not- 2.6.30 material.  ;-)

> But this new stuff looks very promising, (no more locked ops on fast path),
> and considering new percpu_{add|sub...} infra, very fast :
> 
> static inline void rcu_read_unlock_fgp(void)
> {
> 	barrier();
> 	percpu_sub(rcu_fgp_active_readers, 1); /* one instruction on x86 */
> 	preempt_enable();
> }

Very cool!!!  If I had seen this, I had forgotten about it.  I will
give it a try, but only after getting it working the old way.  (What,
me paranoid?)

> I wonder if IPI are really necessary on x86 if we use percpu_sub() since
> it already contains a barrier, and rcu_read_lock_fgp(void) also ends with
> a barrier() call...

Hmmmm...  But x86 can still execute a later load before an earlier
store, so it seems to me that there would be the potential for even
an x86 CPU to pull loads from the critical section up before the final
store of the percpu_sub(), right?  If so, we really do still need the
IPIs on x86.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  5:05                                                               ` Paul E. McKenney
@ 2009-04-17  5:44                                                                 ` Mathieu Desnoyers
  2009-04-17 14:51                                                                   ` Paul E. McKenney
  0 siblings, 1 reply; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-17  5:44 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

* Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> On Thu, Apr 16, 2009 at 10:19:02PM -0400, Mathieu Desnoyers wrote:
> > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> > > On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > > > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > > > From: Patrick McHardy <kaber@trash.net>
> > > > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > > > 
> > > > > > Linus Torvalds wrote:
> > > > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > > > >>> The counters are the bigger problem, otherwise we could just free
> > > > > >>> table
> > > > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > > > >>> values coming out to user space are always exactly accurate, or is it
> > > > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > > > >>> case
> > > > > >>> NCPU-1).
> > > > > >> Why not just read the counters from the old one at RCU free time (they
> > > > > >> are guaranteed to be stable at that point, since we're all done with
> > > > > >> those entries), and apply them at that point to the current setup?
> > > > > > 
> > > > > > We need the counters immediately to copy them to userspace, so waiting
> > > > > > for an asynchronous RCU free is not going to work.
> > > > > 
> > > > > It just occurred to me that since all netfilter packet handling
> > > > > goes through one place, we could have a sort-of "netfilter RCU"
> > > > > of sorts to solve this problem.
> > > > 
> > > > OK, I am putting one together...
> > > > 
> > > > It will be needed sooner or later, though I suspect per-CPU locking
> > > > would work fine in this case.
> > > 
> > > And here is a crude first cut.  Untested, probably does not even compile.
> > > 
> > > Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> > > at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> > > a little, but he must bear the bulk of the guilt).
> > 
> > I'm innocent, I swear ;-)
> 
> That is what they -all- say!!!  ;-)
> 
> > That should give very impressive performance results.
> 
> I wouldn't expect more than about three or four orders of magnitude
> improvement on the update side compared to Classic RCU, but who knows?
> 
> > Please see comments below,
> > 
> > >   Pick on srcu.h
> > > and srcu.c out of sheer laziness.  User-space testing gives deep
> > > sub-microsecond grace-period latencies, so should be fast enough, at
> > > least if you don't mind two smp_call_function() invocations per grace
> > > period and spinning on each instance of a per-CPU variable.
> > > 
> > > Again, I believe per-CPU locking should work fine for the netfilter
> > > counters, but I guess "friends don't let friends use hashed locks".
> > > (I would not know for sure, never having used them myself, except of
> > > course to protect hash tables.)
> > > 
> > > Most definitely -not- for inclusion at this point.  Next step is to hack
> > > up the relevant rcutorture code and watch it explode on contact.  ;-)
> > > 
> > > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > > ---
> > > 
> > >  include/linux/srcu.h |   30 ++++++++++++++++++++++++
> > >  kernel/srcu.c        |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 93 insertions(+)
> > > 
> > > diff --git a/include/linux/srcu.h b/include/linux/srcu.h
> > > index aca0eee..4577cd0 100644
> > > --- a/include/linux/srcu.h
> > > +++ b/include/linux/srcu.h
> > > @@ -50,4 +50,34 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
> > >  void synchronize_srcu(struct srcu_struct *sp);
> > >  long srcu_batches_completed(struct srcu_struct *sp);
> > >  
> > > +/* Single bit for grace-period index, low-order bits are nesting counter. */
> > > +#define RCU_FGP_COUNT		1UL
> > > +#define RCU_FGP_PARITY		(1UL << (sizeof(long) << 2))
> > > +#define RCU_FGP_NEST_MASK	(RCU_FGP_PARITY - 1)
> > > +
> > > +extern long rcu_fgp_ctr;
> > > +DECLARE_PER_CPU(long, rcu_fgp_active_readers);
> > > +
> > > +static inline void rcu_read_lock_fgp(void)
> > > +{
> > > +	long tmp;
> > > +	long *uarp;
> > > +
> > > +	preempt_disable();
> > > +	uarp = &__get_cpu_var(rcu_fgp_active_readers);
> > 
> > OK, so we are translating the original implementation from per-thread to
> > per-cpu, with preemption disabled. Fine with me if we can't afford the
> > per-thread unsigned long nor can't afford to iterate on each thread when
> > waiting for RCU quiescent state.
> 
> The iterating on each thread was what stopped me.
> 
> > > +	tmp = *uarp;
> > > +	if (likely(!(tmp & RCU_FGP_NEST_MASK)))
> > > +		*uarp = rcu_fgp_ctr;  /* Outermost rcu_read_lock(). */
> > 
> > ACCESS_ONCE(rcu_fgp_ctr) could not hurt here I think. Given the
> > surrounding code, that does not seem like a necessity, but that would
> > express that it is really an atomic read.
> 
> I believe that it is safe.  Only one bit of rcu_fgp_ctr ever changes,
> so we should be immune from load tearing.  We only load it once and
> only do one thing with it, and we have a barrier() before (as part
> of preempt_disable()) and after, so I don't think that the compiler
> has much latitude here.  In theory, we could get store tearing through
> *uarp, but if gcc did that, much of the kernel would go down in flames.
> 
> In contrast, in the user-mode version, there was no barrier() on entry,
> permitting the compiler much more mischief.
> 

True.

> > > +	else
> > > +		*uarp = tmp + RCU_FGP_COUNT;  /* Nested rcu_read_lock(). */
> > > +	barrier();
> > 
> > I kind of expect an IPI with a smp_mb() to promote this barrier() to a
> > smp_mb() when the update side needs to wait for a quiescent state. I
> > guess a comment telling this here would not hurt.
> 
> If you insist.  ;-)
> 
> > > +}
> > > +
> > > +static inline void rcu_read_unlock_fgp(void)
> > > +{
> > > +	barrier();
> > 
> > Same here.
> 
> Likewise!
> 
> > > +	__get_cpu_var(rcu_fgp_active_readers)--;
> > > +	preempt_enable();
> > > +}
> > > +
> > >  #endif
> > > diff --git a/kernel/srcu.c b/kernel/srcu.c
> > > index b0aeeaf..a5dc464 100644
> > > --- a/kernel/srcu.c
> > > +++ b/kernel/srcu.c
> > > @@ -255,3 +255,66 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
> > >  EXPORT_SYMBOL_GPL(srcu_read_unlock);
> > >  EXPORT_SYMBOL_GPL(synchronize_srcu);
> > >  EXPORT_SYMBOL_GPL(srcu_batches_completed);
> > > +
> > > +DEFINE_MUTEX(rcu_fgp_mutex);
> > > +long rcu_fgp_ctr = RCU_FGP_COUNT;
> > 
> > Saying why we populate the value 1 here (RCU_FGP_COUNT) as an
> > optimization for the read-side might help understanding this choice.
> 
> Good point, done.
> 
> > > +DEFINE_PER_CPU(long, rcu_fgp_active_readers);
> > > +
> > > +/*
> > > + * Determine if the specified counter value indicates that we need to
> > > + * wait on the corresponding CPU to exit an rcu_fgp read-side critical
> > > + * section.  Return non-zero if so.
> > > + *
> > > + * Assumes that rcu_fgp_mutex is held, and thus that rcu_fgp_ctr is
> > > + * unchanging.
> > > + */
> > > +static inline int rcu_old_fgp_ongoing(long *value)
> > > +{
> > > +	long v = ACCESS_ONCE(*value);
> > > +
> > > +	return (v & RCU_FGP_NEST_MASK) &&
> > > +	       ((v ^ rcu_fgp_ctr) & RCU_FGP_PARITY);
> > > +}
> > > +
> > > +static void rcu_fgp_wait_for_quiescent_state(void)
> > > +{
> > > +	int cpu;
> > > +	long *uarp;
> > > +
> > > +	for_each_online_cpu(cpu) {
> > > +		uarp = &per_cpu(rcu_fgp_active_readers, cpu);
> > > +		while (rcu_old_fgp_ongoing(uarp))
> > > +			cpu_relax();
> > 
> > I would be tempted to add a comment here telling hot cpu hotunplug
> > cannot let us wait forever, given all read-side critical sections we can
> > be busy-waiting for are required to have preemption disabled, and are
> > therefore cpu-hotplug safe.
> 
> Good point -- I hadn't even considered CPU hotplug, so got very lucky.
> 
> > > +	}
> > > +}
> > > +
> > > +static void rcu_fgp_do_mb(void *unused)
> > > +{
> > > +	smp_mb();  /* Each CPU does a memory barrier. */
> > > +}
> > 
> > Ah, here it is. Commenting that it matches the two barrier()s I identified
> > above would be good.
> 
> Good point, reworded.
> 
> > > +
> > > +void synchronize_rcu_fgp(void)
> > > +{
> > > +	mutex_lock(&rcu_fgp_mutex);
> > > +	
> > > +	/* CPUs must see earlier change before parity flip. */
> > > +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> > 
> > /*
> >  * Call a function on all other processors
> >  */
> > int smp_call_function(void(*func)(void *info), void *info, int wait);
> > 
> > I guess you meant on_each_cpu ? That should include "self", given we
> > also need the smp_mb().
> 
> Hmmm...  Why do we need "self"?  Because synchronize_rcu_fgp() blocks,
> it had jolly well better not be within a read-side critical section.
> 
> So, what am I missing here?
> 

I mean that I think we also need some smp_mb()s on the writer side,
don't we? If we want the changes done by the writer (the pointer assignment) to
be seen by the readers before the writer starts flipping the parity, a
smp_mb() is needed at the beginning of synchronize_rcu_fgp() (actually
at the same location where you call the rcu_fgp_do_mb IPIs), and the same at the
end (so we order the parity flipping with the next pointer assignment).

Or maybe it's getting late and I am missing the obvious.

Mathieu
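
A sketch of the placement being suggested here (illustrative only; the rcu_fgp_*
names follow the patch quoted above, and the added smp_mb() calls are the
suggestion, not something from the posted patch):

void synchronize_rcu_fgp(void)
{
	mutex_lock(&rcu_fgp_mutex);

	smp_mb();	/* order the caller's prior stores (e.g. the pointer
			 * assignment) before the parity manipulation */
	smp_call_function(rcu_fgp_do_mb, NULL, 1);	/* readers' barrier */

	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;	/* flip parity 0 -> 1 */
	rcu_fgp_wait_for_quiescent_state();		/* wait for old readers */
	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;	/* flip parity 1 -> 0 */
	rcu_fgp_wait_for_quiescent_state();		/* wait for old readers */

	smp_call_function(rcu_fgp_do_mb, NULL, 1);	/* readers' barrier */
	smp_mb();	/* order the flips before whatever the caller frees next */

	mutex_unlock(&rcu_fgp_mutex);
}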

> > > +
> > > +	/*
> > > +	 * We must flip twice to correctly handle tasks that stall
> > > +	 * in rcu_read_lock_fgp() between the time that they fetch
> > > +	 * rcu_fgp_ctr and the time that the store to their CPU's
> > > +	 * rcu_fgp_active_readers.  No matter when they resume
> > > +	 * execution, we will wait for them to get to the corresponding
> > > +	 * rcu_read_unlock_fgp().
> > > +	 */
> > > +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 0 -> 1 */
> > > +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> > > +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 1 -> 0 */
> > > +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> > > +
> > > +	/* Prevent CPUs from reordering out of prior RCU critical sections. */
> > > +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> > > +
> > 
> > Same as above.
> 
> Same as above.  ;-)
> 
> > Mathieu, who can still recognise his original userspace implementation
> > :-)
> 
> Yeah, I never was all that good at disguising code anyway.  But I did
> keep a couple of changes.  ;-)
> 
> Updated patch below.
> 
> 							Thanx, Paul
> 
> ------------------------------------------------------------------------
> 
> And here is a crude second cut.  Untested, probably does not even compile.
> 
> Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> and srcu.c out of sheer laziness.  User-space testing gives deep
> sub-microsecond grace-period latencies, so should be fast enough, at
> least if you don't mind two smp_call_function() invocations per grace
> period and spinning on each instance of a per-CPU variable.
> 
> Again, I believe per-CPU locking should work fine for the netfilter
> counters, but I guess "friends don't let friends use hashed locks".
> (I would not know for sure, never having used them myself, except of
> course to protect hash tables.)
> 
> Most definitely -not- for inclusion at this point.  Next step is to hack
> up the relevant rcutorture code and watch it explode on contact.  ;-)
> 
> Changes since v1:
> 
> o	Applied Mathieu's feedback.
> 
> o	Added docbook headers and other comments.
> 
> o	Added the rcu_fgp_batches_completed API required by rcutorture.
> 
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---
> 
>  include/linux/srcu.h |   42 ++++++++++++++++++++++++
>  kernel/srcu.c        |   89 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 131 insertions(+)
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-16 23:52                                                           ` [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8) Stephen Hemminger
  2009-04-17  0:15                                                             ` Jeff Chua
@ 2009-04-17  5:55                                                             ` Peter Zijlstra
  2009-04-17  6:03                                                             ` Eric Dumazet
  2 siblings, 0 replies; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-17  5:55 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, kaber, torvalds, dada1, jeff.chua.linux,
	paulus, mingo, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Thu, 2009-04-16 at 16:52 -0700, Stephen Hemminger wrote:

>   - Lockdep doesn't really handle this well

> +/**
> + * xt_table_info_lock_all - lock xt table info for update
> + *
> + * Locks out all readers, and blocks bottom half
> + */
> +void xt_table_info_lock_all(void)
> +{
> +	int i;
> +
> +	local_bh_disable();
> +	for_each_possible_cpu(i) {
> +		struct xt_lock *lock = &per_cpu(xt_info_locks, i);
> +		spin_lock(&lock->lock);
> +		BUG_ON(lock->depth != -1);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(xt_table_info_lock_all);

Quite so, this is the old MAX_LOCK_DEPTH < NR_CPUS issue for large
systems.

Last time this came up David found another way of solving the problem.
Not having fully read this thread, I cannot suggest one myself -- except
that RCU domains as suggested by David sound good.




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-16 23:52                                                           ` [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8) Stephen Hemminger
  2009-04-17  0:15                                                             ` Jeff Chua
  2009-04-17  5:55                                                             ` Peter Zijlstra
@ 2009-04-17  6:03                                                             ` Eric Dumazet
  2009-04-17  6:14                                                                 ` Eric Dumazet
  2009-04-17 11:17                                                                 ` Patrick McHardy
  2 siblings, 2 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-17  6:03 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, kaber, torvalds, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Stephen Hemminger a écrit :
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested. It is sort of like existing kernel_lock,
> rwlock_t and even old 2.4 brlock.
> 
> "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> It needs to ensure that the rules are not being changed while packet
> is being processed.
> 
> "Writer" is used in two cases: first is replacing rules in which case
> all packets in flight have to be processed before rules are swapped,
> then counters are read from the old (stale) info. Second case is where
> counters need to be read on the fly, in this case all CPU's are blocked
> from further rule processing until values are aggregated.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Future optimizations possible:
>   - Lockdep doesn't really handle this well
>   - hot plug CPU case, if kernel is built with large # of CPU's, skip
>     the inactive ones; migrate values when CPU is removed.
>   - reading counters could be incremental by CPU.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
> 

I like this version 8 of the patch, as it mixes all the ideas we had,
but I have two questions.

Previous netfilter code (and the 2.6.30-rc2 code too) disables BH, not only preemption.

I see xt_table_info_lock_all(void) does block BH, so this one is safe.

I'll let Patrick or others tell us whether it's safe to run ipt_do_table()
with preemption disabled but BH enabled; I really don't know.

Also, please don't call this a 'recursive lock', since it is not a general
recursive lock, as pointed out by Linus and Paul.

The second question is about MAX_LOCK_DEPTH.

Why not use this kind of construct to get rid of this limit?

+void xt_table_info_lock_all(void)
> +{
> +	int i;
> +
> +	local_bh_disable();
> +	for_each_possible_cpu(i) {
> +		struct xt_lock *lock = &per_cpu(xt_info_locks, i);
> +		spin_lock(&lock->lock);
> +		preempt_enable_no_resched();
> +	}
> +}
> +EXPORT_SYMBOL_GPL(xt_table_info_lock_all);


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
  2009-04-17  2:19                                                             ` Mathieu Desnoyers
  2009-04-17  4:50                                                             ` Stephen Hemminger
@ 2009-04-17  6:12                                                             ` Peter Zijlstra
  2009-04-17 16:33                                                               ` Paul E. McKenney
  2009-04-18  9:40                                                             ` Evgeniy Polyakov
  3 siblings, 1 reply; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-17  6:12 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Thu, 2009-04-16 at 18:28 -0700, Paul E. McKenney wrote:
> On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > From: Patrick McHardy <kaber@trash.net>
> > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > 
> > > > Linus Torvalds wrote:
> > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > >>> The counters are the bigger problem, otherwise we could just free
> > > >>> table
> > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > >>> values coming out to user space are always exactly accurate, or is it
> > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > >>> case
> > > >>> NCPU-1).
> > > >> Why not just read the counters from the old one at RCU free time (they
> > > >> are guaranteed to be stable at that point, since we're all done with
> > > >> those entries), and apply them at that point to the current setup?
> > > > 
> > > > We need the counters immediately to copy them to userspace, so waiting
> > > > for an asynchronous RCU free is not going to work.
> > > 
> > > It just occurred to me that since all netfilter packet handling
> > > goes through one place, we could have a sort-of "netfilter RCU"
> > > of sorts to solve this problem.
> > 
> > OK, I am putting one together...
> > 
> > It will be needed sooner or later, though I suspect per-CPU locking
> > would work fine in this case.
> 
> And here is a crude first cut.  Untested, probably does not even compile.
> 
> Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> and srcu.c out of sheer laziness.  User-space testing gives deep
> sub-microsecond grace-period latencies, so should be fast enough, at
> least if you don't mind two smp_call_function() invocations per grace
> period and spinning on each instance of a per-CPU variable.
> 
> Again, I believe per-CPU locking should work fine for the netfilter
> counters, but I guess "friends don't let friends use hashed locks".
> (I would not know for sure, never having used them myself, except of
> course to protect hash tables.)
> 
> Most definitely -not- for inclusion at this point.  Next step is to hack
> up the relevant rcutorture code and watch it explode on contact.  ;-)

One comment: it's again a global thing...

I've been playing with the idea for a while now to make all RCU
implementations into proper objects so that you can do things like:

  struct atomic_rcu_domain my_rcu_domain = create_atomic_rcu();

  atomic_rcu_read_lock(&my_rcu_domain);
  ...

  atomic_rcu_read_unlock(&my_rcu_domain);

and

  call_atomic_rcu(&my_rcu_domain, &my_obj->rcu_head, do_something);

etc..

We would have:

  atomic_rcu  --  'classic' non preemptible RCU (treercu these days)
  sleep_rcu   --  'preemptible' RCU

Then have 3 default domains:

sched_rcu     -- always atomic_rcu
rcu           -- depends on PREEMPT_RCU
preempt_rcu   -- always sleep_rcu

This would allow generic code to:
  1) use preemptible RCU for those cases where needed
  2) create smaller RCU domains where needed, such as in this case
  3) mostly do away with SRCU

Now I realize that the presented RCU implementation has a different
grace period method than the existing ones that use the timer tick to
drive the state machine, so 2) might not be too relevant here. But maybe
we can do something with different grace periods too.

Anyway, just an idea because I always get a little offended at the hard
coded global variables in all these RCU implementations :-)
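
To make that concrete, a purely hypothetical sketch of such a domain object
wrapped around the fgp read side discussed earlier in this thread (none of these
names exist in the kernel):

struct atomic_rcu_domain {
	struct mutex	gp_mutex;	/* serializes grace periods for this domain */
	long		gp_ctr;		/* per-domain parity word, seeded with RCU_FGP_COUNT */
	long		*active_readers;	/* alloc_percpu(long) */
};

static inline void atomic_rcu_read_lock(struct atomic_rcu_domain *d)
{
	long *uarp;

	preempt_disable();
	uarp = per_cpu_ptr(d->active_readers, smp_processor_id());
	if (likely(!(*uarp & RCU_FGP_NEST_MASK)))
		*uarp = d->gp_ctr;		/* outermost read lock */
	else
		*uarp += RCU_FGP_COUNT;		/* nested read lock */
	barrier();
}

static inline void atomic_rcu_read_unlock(struct atomic_rcu_domain *d)
{
	barrier();
	(*per_cpu_ptr(d->active_readers, smp_processor_id()))--;
	preempt_enable();
}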




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-17  6:03                                                             ` Eric Dumazet
@ 2009-04-17  6:14                                                                 ` Eric Dumazet
  2009-04-17 11:17                                                                 ` Patrick McHardy
  1 sibling, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-17  6:14 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, David Miller, kaber, torvalds, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh

Eric Dumazet a écrit :
> Stephen Hemminger a écrit :
>> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
>> recursive lock that can be nested. It is sort of like existing kernel_lock,
>> rwlock_t and even old 2.4 brlock.
>>
>> "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
>> It needs to ensure that the rules are not being changed while packet
>> is being processed.
>>
>> "Writer" is used in two cases: first is replacing rules in which case
>> all packets in flight have to be processed before rules are swapped,
>> then counters are read from the old (stale) info. Second case is where
>> counters need to be read on the fly, in this case all CPU's are blocked
>> from further rule processing until values are aggregated.
>>
>> The idea for this came from an earlier version done by Eric Dumazet.
>> Locking is done per-cpu, the fast path locks on the current cpu
>> and updates counters.  This reduces the contention of a
>> single reader lock (in 2.6.29) without the delay of synchronize_net()
>> (in 2.6.30-rc2). 
>>
>>
>> The mutex that was added for 2.6.30 in xt_table is unnecessary since
>> there already is a mutex for xt[af].mutex that is held.
>>
>> Future optimizations possible:
>>   - Lockdep doesn't really handle this well
>>   - hot plug CPU case, if kernel is built with large # of CPU's, skip
>>     the inactive ones; migrate values when CPU is removed.
>>   - reading counters could be incremental by CPU.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
>>
> 
> I like this version 8 of the patch, as it mixes all ideas we had,
> but have two questions.
> 
> Previous netfilter code (and the 2.6.30-rc2 code too) disables BH, not only preemption.
> 
> I see xt_table_info_lock_all(void) does block BH, so this one is safe.
> 
> I'll let Patrick or others tell us whether it's safe to run ipt_do_table()
> with preemption disabled but BH enabled; I really don't know.
> 
> Also, please don't call this a 'recursive lock', since it is not a general
> recursive lock, as pointed out by Linus and Paul.
> 
> Second question is about MAX_LOCK_DEPTH

I meant here the ~256 limit we have on preempt_count, not related to LOCKDEP

> 
> Why not use this kind of construct to get rid of this limit?
> 
> +void xt_table_info_lock_all(void)
>> +{
>> +	int i;
>> +
>> +	local_bh_disable();
>> +	for_each_possible_cpu(i) {
>> +		struct xt_lock *lock = &per_cpu(xt_info_locks, i);
>> +		spin_lock(&lock->lock);
>> +		preempt_enable_no_resched();
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(xt_table_info_lock_all);
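
The trick above is that every spin_lock() bumps preempt_count and the
immediately following preempt_enable_no_resched() drops it again, so the
count no longer grows with the number of CPUs; preemption stays disabled
because local_bh_disable() already holds it off.  A minimal sketch of the
matching unlock, assuming the same xt_lock/xt_info_locks names as in the
snippet above:

void xt_table_info_unlock_all(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct xt_lock *lock = &per_cpu(xt_info_locks, i);

		/* re-take the preempt count dropped after spin_lock()
		 * so that spin_unlock()'s preempt_enable() balances */
		preempt_disable();
		spin_unlock(&lock->lock);
	}
	local_bh_enable();
}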



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  5:40                                                                   ` Paul E. McKenney
  (?)
@ 2009-04-17  8:07                                                                   ` David Miller
  2009-04-17 15:00                                                                     ` Paul E. McKenney
  2009-04-17 17:22                                                                     ` Peter Zijlstra
  -1 siblings, 2 replies; 254+ messages in thread
From: David Miller @ 2009-04-17  8:07 UTC (permalink / raw)
  To: paulmck
  Cc: dada1, shemminger, kaber, torvalds, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh, mathieu.desnoyers

From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 16 Apr 2009 22:40:32 -0700

> I completely agree that this RCU is absolutely -not- 2.6.30 material.  ;-)

I don't understand why we're writing such complicated code.

Oh I see why, it's because not every arch uses the generic SMP helpers
yet :-)

Because if they did universally, we could solve this problem so
simply, by merely sending a remote softirq to every online cpu.  Once
those all complete we have enough of a quiesce period, every cpu must
have exited any netfilter packet processing code path they were in.

And we could know they complete using an atomic counter or something.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-17  6:03                                                             ` Eric Dumazet
@ 2009-04-17 11:17                                                                 ` Patrick McHardy
  2009-04-17 11:17                                                                 ` Patrick McHardy
  1 sibling, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-17 11:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, paulmck, David Miller, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Eric Dumazet wrote:
> Stephen Hemminger a écrit :
>> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
>> recursive lock that can be nested. It is sort of like existing kernel_lock,
>> rwlock_t and even old 2.4 brlock.
>>
>> ...
> I like this version 8 of the patch, as it mixes all ideas we had,
> but have two questions.
> 
> Previous netfilter code (and the 2.6.30-rc2 code too) disables BH, not only preemption.
> 
> I see xt_table_info_lock_all(void) does block BH, so this one is safe.
> 
> I'll let Patrick or others tell us whether it's safe to run ipt_do_table()
> with preemption disabled but BH enabled; I really don't know.

No; on jumps, the return position is stored in the per-cpu copy
of the ruleset, so we must prevent BH context from corrupting the
value used by something running in process context.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  5:44                                                                 ` Mathieu Desnoyers
@ 2009-04-17 14:51                                                                   ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17 14:51 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Fri, Apr 17, 2009 at 01:44:51AM -0400, Mathieu Desnoyers wrote:
> * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> > On Thu, Apr 16, 2009 at 10:19:02PM -0400, Mathieu Desnoyers wrote:
> > > * Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> > > > On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:

[ . . . ]

> > > > +
> > > > +void synchronize_rcu_fgp(void)
> > > > +{
> > > > +	mutex_lock(&rcu_fgp_mutex);
> > > > +	
> > > > +	/* CPUs must see earlier change before parity flip. */
> > > > +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> > > 
> > > /*
> > >  * Call a function on all other processors
> > >  */
> > > int smp_call_function(void(*func)(void *info), void *info, int wait);
> > > 
> > > I guess you meant on_each_cpu ? That should include "self", given we
> > > also need the smp_mb().
> > 
> > Hmmm...  Why do we need "self"?  Because synchronize_rcu_fgp() blocks,
> > it had jolly well better not be within a read-side critical section.
> > 
> > So, what am I missing here?
> 
> I mean that I think we also need some smp_mb()s on the writer side,
> don't we?  If we want the changes done by the writer (assign pointer) to
> be visible to the readers before the writer starts flipping the parity, a
> smp_mb() is needed at the beginning of synchronize_rcu_fgp() (actually
> at the same location where you call the rcu_fgp_do_mb IPIs), and the same at the
> end (so we order the parity flip with the next assign pointer).
> 
> Or maybe it's getting late and I am missing the obvious.

The smp_call_function() itself must have barriers in order to ensure
that the other CPUs see the updates to its parameter block.

But see my upcoming response to Dave and Peter.

							Thanx, Paul

> Mathieu
> 
> > > > +
> > > > +	/*
> > > > +	 * We must flip twice to correctly handle tasks that stall
> > > > +	 * in rcu_read_lock_fgp() between the time that they fetch
> > > > +	 * rcu_fgp_ctr and the time that the store to their CPU's
> > > > +	 * rcu_fgp_active_readers.  No matter when they resume
> > > > +	 * execution, we will wait for them to get to the corresponding
> > > > +	 * rcu_read_unlock_fgp().
> > > > +	 */
> > > > +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 0 -> 1 */
> > > > +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> > > > +	ACCESS_ONCE(rcu_fgp_ctr) ^= RCU_FGP_PARITY;  /* flip parity 1 -> 0 */
> > > > +	rcu_fgp_wait_for_quiescent_state();	     /* wait for old readers */
> > > > +
> > > > +	/* Prevent CPUs from reordering out of prior RCU critical sections. */
> > > > +	smp_call_function(rcu_fgp_do_mb, NULL, 1);
> > > > +
> > > 
> > > Same as above.
> > 
> > Same as above.  ;-)
> > 
> > > Mathieu, who can still recognise his original userspace implementation
> > > :-)
> > 
> > Yeah, I never was all that good at disguising code anyway.  But I did
> > keep a couple of changes.  ;-)
> > 
> > Updated patch below.
> > 
> > 							Thanx, Paul
> > 
> > ------------------------------------------------------------------------
> > 
> > And here is a crude second cut.  Untested, probably does not even compile.
> > 
> > Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> > at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> > a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> > and srcu.c out of sheer laziness.  User-space testing gives deep
> > sub-microsecond grace-period latencies, so should be fast enough, at
> > least if you don't mind two smp_call_function() invocations per grace
> > period and spinning on each instance of a per-CPU variable.
> > 
> > Again, I believe per-CPU locking should work fine for the netfilter
> > counters, but I guess "friends don't let friends use hashed locks".
> > (I would not know for sure, never having used them myself, except of
> > course to protect hash tables.)
> > 
> > Most definitely -not- for inclusion at this point.  Next step is to hack
> > up the relevant rcutorture code and watch it explode on contact.  ;-)
> > 
> > Changes since v1:
> > 
> > o	Applied Mathieu's feedback.
> > 
> > o	Added docbook headers and other comments.
> > 
> > o	Added the rcu_fgp_batches_completed API required by rcutorture.
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> > 
> >  include/linux/srcu.h |   42 ++++++++++++++++++++++++
> >  kernel/srcu.c        |   89 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 131 insertions(+)
> > 
> 
> -- 
> Mathieu Desnoyers
> OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  8:07                                                                   ` David Miller
@ 2009-04-17 15:00                                                                     ` Paul E. McKenney
  2009-04-17 17:22                                                                     ` Peter Zijlstra
  1 sibling, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17 15:00 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, shemminger, kaber, torvalds, jeff.chua.linux, paulus,
	mingo, laijs, jengelh, r000n, linux-kernel, netfilter-devel,
	netdev, benh, mathieu.desnoyers

On Fri, Apr 17, 2009 at 01:07:10AM -0700, David Miller wrote:
> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Date: Thu, 16 Apr 2009 22:40:32 -0700
> 
> > I completely agree that this RCU is absolutely -not- 2.6.30 material.  ;-)
> 
> I don't understand why we're writing such complicated code.
> 
> Oh I see why, it's because not every arch uses the generic SMP helpers
> yet :-)

;-)

> Because if they did universally, we could solve this problem so
> simply, by merely sending a remote softirq to every online cpu.  Once
> those all complete we have enough of a quiesce period, every cpu must
> have exited any netfilter packet processing code path they were in.
> 
> And we could know they complete using an atomic counter or something.

I was with you until you got to the atomic counter, which would require
dealing with CPU hotplug.

But your point is a very good one.  We already do have a flavor of
RCU that waits for softirq code, namely rcu-bh.  And it is used
exclusively by the networking code:

     14 net/ipv4/
     13 net/decnet/
     10 net/core/
      6 net/ipv6/
      4 kernel/		[rcutorture, so these four uses don't count.]
      3 net/netfilter/
      2 net/mac80211/
      2 net/packet/

So both your and Peter's implicit points are quite correct -- the kernel
really does not need yet another flavor of RCU.  So maybe I should instead
be thinking in terms of making the existing rcu-bh be better adapted to
the networking code, like maybe a fast synchronize_rcu_bh().

Or do the networking uses of rcu-bh need it to work exactly the way that
it does now?

							Thanx, Paul

kernel/rcutorture.c __acquires 392 rcu_read_lock_bh();
kernel/rcutorture.c __releases 398 rcu_read_unlock_bh();
kernel/rcutorture.c rcu_bh_torture_deferred_free 408 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
kernel/rcutorture.c rcu_bh_torture_synchronize 429 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
net/core/dev.c dev_queue_xmit 1844 rcu_read_lock_bh();
net/core/dev.c dev_queue_xmit 1909 rcu_read_unlock_bh();
net/core/dev.c dev_queue_xmit 1915 rcu_read_unlock_bh();
net/core/filter.c sk_filter 88 rcu_read_lock_bh();
net/core/filter.c sk_filter 95 rcu_read_unlock_bh();
net/core/filter.c sk_filter_delayed_uncharge 477 call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
net/core/filter.c sk_attach_filter 517 rcu_read_lock_bh();
net/core/filter.c sk_attach_filter 520 rcu_read_unlock_bh();
net/core/filter.c sk_detach_filter 532 rcu_read_lock_bh();
net/core/filter.c sk_detach_filter 539 rcu_read_unlock_bh();
net/decnet/dn_route.c dnrt_free 148 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
net/decnet/dn_route.c dnrt_drop 154 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
net/decnet/dn_route.c __dn_route_output_key 1161 rcu_read_lock_bh();
net/decnet/dn_route.c __dn_route_output_key 1170 rcu_read_unlock_bh();
net/decnet/dn_route.c __dn_route_output_key 1175 rcu_read_unlock_bh();
net/decnet/dn_route.c dn_cache_dump 1623 rcu_read_lock_bh();
net/decnet/dn_route.c dn_cache_dump 1634 rcu_read_unlock_bh();
net/decnet/dn_route.c dn_cache_dump 1639 rcu_read_unlock_bh();
net/decnet/dn_route.c dn_rt_cache_get_first 1659 rcu_read_lock_bh();
net/decnet/dn_route.c dn_rt_cache_get_first 1663 rcu_read_unlock_bh();
net/decnet/dn_route.c dn_rt_cache_get_next 1674 rcu_read_unlock_bh();
net/decnet/dn_route.c dn_rt_cache_get_next 1677 rcu_read_lock_bh();
net/decnet/dn_route.c dn_rt_cache_seq_stop 1704 rcu_read_unlock_bh();
net/ipv4/fib_trie.c free_leaf 339 call_rcu_bh(&l->rcu, __leaf_free_rcu);
net/ipv4/route.c rt_cache_get_first 289 rcu_read_lock_bh();
net/ipv4/route.c rt_cache_get_first 297 rcu_read_unlock_bh();
net/ipv4/route.c __rt_cache_get_next 309 rcu_read_unlock_bh();
net/ipv4/route.c __rt_cache_get_next 314 rcu_read_lock_bh();
net/ipv4/route.c rt_cache_seq_stop 367 rcu_read_unlock_bh();
net/ipv4/route.c rt_free 613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
net/ipv4/route.c rt_drop 619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
net/ipv4/route.c __ip_route_output_key 2665 rcu_read_lock_bh();
net/ipv4/route.c __ip_route_output_key 2679 rcu_read_unlock_bh();
net/ipv4/route.c __ip_route_output_key 2685 rcu_read_unlock_bh();
net/ipv4/route.c ip_rt_dump 2983 rcu_read_lock_bh();
net/ipv4/route.c ip_rt_dump 2995 rcu_read_unlock_bh();
net/ipv4/route.c ip_rt_dump 3000 rcu_read_unlock_bh();
net/ipv6/addrconf.c ipv6_add_addr 603 rcu_read_lock_bh();
net/ipv6/addrconf.c ipv6_add_addr 682 rcu_read_unlock_bh();
net/ipv6/addrconf.c ipv6_regen_rndid 1641 rcu_read_lock_bh();
net/ipv6/addrconf.c ipv6_regen_rndid 1665 rcu_read_unlock_bh();
net/ipv6/addrconf.c ipv6_ifa_notify 3967 rcu_read_lock_bh();
net/ipv6/addrconf.c ipv6_ifa_notify 3970 rcu_read_unlock_bh();
net/mac80211/wme.c ieee80211_requeue 256 rcu_read_lock_bh();
net/mac80211/wme.c ieee80211_requeue 294 rcu_read_unlock_bh();
net/netfilter/nf_conntrack_core.c nf_conntrack_tuple_taken 408 rcu_read_lock_bh();
net/netfilter/nf_conntrack_core.c nf_conntrack_tuple_taken 413 rcu_read_unlock_bh();
net/netfilter/nf_conntrack_core.c nf_conntrack_tuple_taken 418 rcu_read_unlock_bh();
net/packet/af_packet.c run_filter 459 rcu_read_lock_bh();
net/packet/af_packet.c run_filter 463 rcu_read_unlock_bh();
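
(For reference, a plain -- not "fast" -- synchronize_rcu_bh() can already be
open-coded from call_rcu_bh() plus a completion, much as
rcu_bh_torture_synchronize() in the listing above does; a minimal sketch,
with made-up names:

struct rcu_bh_sync {
	struct rcu_head		head;
	struct completion	completion;
};

static void rcu_bh_sync_cb(struct rcu_head *head)
{
	struct rcu_bh_sync *s = container_of(head, struct rcu_bh_sync, head);

	complete(&s->completion);
}

static void synchronize_rcu_bh_sketch(void)
{
	struct rcu_bh_sync s;

	init_completion(&s.completion);
	/* the callback runs only after all pre-existing
	 * rcu_read_lock_bh() sections have completed */
	call_rcu_bh(&s.head, rcu_bh_sync_cb);
	wait_for_completion(&s.completion);
}

The open question is only about making such a wait fast.)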

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  6:12                                                             ` Peter Zijlstra
@ 2009-04-17 16:33                                                               ` Paul E. McKenney
  2009-04-17 16:51                                                                 ` Peter Zijlstra
  0 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17 16:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Fri, Apr 17, 2009 at 08:12:14AM +0200, Peter Zijlstra wrote:
> On Thu, 2009-04-16 at 18:28 -0700, Paul E. McKenney wrote:
> > On Thu, Apr 16, 2009 at 04:49:55PM -0700, Paul E. McKenney wrote:
> > > On Thu, Apr 16, 2009 at 03:33:54PM -0700, David Miller wrote:
> > > > From: Patrick McHardy <kaber@trash.net>
> > > > Date: Thu, 16 Apr 2009 15:11:31 +0200
> > > > 
> > > > > Linus Torvalds wrote:
> > > > >> On Wed, 15 Apr 2009, Stephen Hemminger wrote:
> > > > >>> The counters are the bigger problem, otherwise we could just free
> > > > >>> table
> > > > >>> info via rcu.  Do we really have to support: replace where the counter
> > > > >>> values coming out to user space are always exactly accurate, or is it
> > > > >>> allowed to replace a rule and maybe lose some counter ticks (worst
> > > > >>> case
> > > > >>> NCPU-1).
> > > > >> Why not just read the counters fromt he old one at RCU free time (they
> > > > >> are guaranteed to be stable at that point, since we're all done with
> > > > >> those entries), and apply them at that point to the current setup?
> > > > > 
> > > > > We need the counters immediately to copy them to userspace, so waiting
> > > > > for an asynchronous RCU free is not going to work.
> > > > 
> > > > It just occurred to me that since all netfilter packet handling
> > > > goes through one place, we could have a sort-of "netfilter RCU"
> > > > of sorts to solve this problem.
> > > 
> > > OK, I am putting one together...
> > > 
> > > It will be needed sooner or later, though I suspect per-CPU locking
> > > would work fine in this case.
> > 
> > And here is a crude first cut.  Untested, probably does not even compile.
> > 
> > Straight conversion of Mathieu Desnoyers's user-space RCU implementation
> > at git://lttng.org/userspace-rcu.git to the kernel (and yes, I did help
> > a little, but he must bear the bulk of the guilt).  Pick on srcu.h
> > and srcu.c out of sheer laziness.  User-space testing gives deep
> > sub-microsecond grace-period latencies, so should be fast enough, at
> > least if you don't mind two smp_call_function() invocations per grace
> > period and spinning on each instance of a per-CPU variable.
> > 
> > Again, I believe per-CPU locking should work fine for the netfilter
> > counters, but I guess "friends don't let friends use hashed locks".
> > (I would not know for sure, never having used them myself, except of
> > course to protect hash tables.)
> > 
> > Most definitely -not- for inclusion at this point.  Next step is to hack
> > up the relevant rcutorture code and watch it explode on contact.  ;-)
> 
> One comment: it's again a global thing...
> 
> I've been playing with the idea for a while now to make all RCU
> implementations into proper objects so that you can do things like:
> 
>   struct atomic_rcu_domain my_rcu_domain = create_atomic_rcu();
> 
>   atomic_rcu_read_lock(&my_rcu_domain());
>   ...
> 
>   atomic_rcu_read_unlock(&my_rcu_domain());
> 
> and
> 
>   call_atomic_rcu(&my_rcu_domain, &my_obj->rcu_head, do_something);
> 
> etc..
> 
> We would have:
> 
>   atomic_rcu  --  'classic' non preemptible RCU (treercu these days)
>   sleep_rcu   --  'preemptible' RCU
> 
> Then have 3 default domains:
> 
> sched_rcu     -- always atomic_rcu

This is the call_rcu_sched() variant.

> rcu           -- depends on PREEMPT_RCU

This is the call_rcu() variant.

> preempt_rcu   -- always sleep_rcu

I guess that this one could allow sleeping on mutexes...  Does anyone
need to do that?

> This would allow generic code to:
>   1) use preemptible RCU for those cases where needed
>   2) create smaller RCU domains where needed, such as in this case
>   3) mostly do away with SRCU

#3 would be good!  But...

At an API level, there are two differences between SRCU and the other
RCU implementations:

a.	The return value from srcu_read_lock() is passed to
	srcu_read_unlock().

b.	There is a control block passed in to each SRCU primitive.

Difference (a) could potentially be taken care of with a few tricks I
am trying in the process of getting preemptrcu merged into treercu.

Your approach to (b) certainly makes it uniform, there are >500
occurrences of rcu_read_lock() and rcu_read_unlock() each, but only
a very few occurrences of srcu_read_lock() and srcu_read_unlock()
(like exactly one each!).  So adding an argument to rcu_read_lock()
does not sound at all reasonable.
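
For readers following along, the SRCU read-side shape that differences (a)
and (b) refer to looks roughly like this (my_srcu is just a placeholder):

static struct srcu_struct my_srcu;	/* (b) per-user control block,
					 * set up via init_srcu_struct() */

static void srcu_reader(void)
{
	/* (a) the index returned by srcu_read_lock() must be handed
	 * back to srcu_read_unlock(), unlike with rcu_read_lock() */
	int idx = srcu_read_lock(&my_srcu);

	/* ... access SRCU-protected data ... */

	srcu_read_unlock(&my_srcu, idx);
}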

> Now I realize that the presented RCU implementation has a different
> grace period method than the existing ones that use the timer tick to
> drive the state machine, so 2) might not be too relevant here. But maybe
> we can do something with different grace periods too.
> 
> Anyway, just an idea because I always get a little offended at the hard
> coded global variables in all these RCU implementations :-)

I am thinking in terms of adding a synchronize_rcu_bh() with the desired
properties.  That way we avoid yet another RCU flavor.  (What can I say?
I got carried away!)  Also, since the rcu-bh flavor is used only by
networking, we have a fair amount of freedom to tweak it.  It will take
longer than introducing a new flavor, but Steve Hemminger has a good
solution already, and RCU really isn't the thing to do quick hacks on.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17 16:33                                                               ` Paul E. McKenney
@ 2009-04-17 16:51                                                                 ` Peter Zijlstra
  2009-04-17 21:29                                                                   ` Paul E. McKenney
  0 siblings, 1 reply; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-17 16:51 UTC (permalink / raw)
  To: paulmck
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Fri, 2009-04-17 at 09:33 -0700, Paul E. McKenney wrote:

> > One comment: it's again a global thing...
> > 
> > I've been playing with the idea for a while now to make all RCU
> > implementations into proper objects so that you can do things like:
> > 
> >   struct atomic_rcu_domain my_rcu_domain = create_atomic_rcu();
> > 
> >   atomic_rcu_read_lock(&my_rcu_domain());
> >   ...
> > 
> >   atomic_rcu_read_unlock(&my_rcu_domain());
> > 
> > and
> > 
> >   call_atomic_rcu(&my_rcu_domain, &my_obj->rcu_head, do_something);
> > 
> > etc..
> > 
> > We would have:
> > 
> >   atomic_rcu  --  'classic' non preemptible RCU (treercu these days)
> >   sleep_rcu   --  'preemptible' RCU
> > 
> > Then have 3 default domains:
> > 
> > sched_rcu     -- always atomic_rcu
> 
> This is the call_rcu_sched() variant.
> 
> > rcu           -- depends on PREEMPT_RCU
> 
> This is the call_rcu() variant.
> 
> > preempt_rcu   -- always sleep_rcu
> 
> I guess that this one could allow sleeping on mutexes...  Does anyone
> need to do that?

I almost did a few times, but never quite got the code that needed it
working well enough for it to go anywhere.

> > This would allow generic code to:
> >   1) use preemptible RCU for those cases where needed
> >   2) create smaller RCU domains where needed, such as in this case
> >   3) mostly do away with SRCU
> 
> #3 would be good!  But...
> 
> At an API level, there are two differences between SRCU and the other
> RCU implementations:
> 
> a.	The return value from srcu_read_lock() is passed to
> 	srcu_read_unlock().
> 
> b.	There is a control block passed in to each SRCU primitive.
> 
> Difference (a) could potentially be taken care of with a few tricks I
> am trying in the process of getting preemptrcu merged into treercu.

Right, incrementing one cpu and decrementing another doesn't change the
sum over all cpus :-)

> Your approach to (b) certainly makes it uniform, there are >500
> occurrences of rcu_read_lock() and rcu_read_unlock() each, but only
> a very few occurrences of srcu_read_lock() and srcu_read_unlock()
> (like exactly one each!).  So adding an argument to rcu_read_lock()
> does not sound at all reasonable.

static inline void rcu_read_lock(void)
{
	atomic_rcu_read_lock(&global_atomic_rcu_context);
}

static inline void rcu_read_unlock(void)
{
	atomic_rcu_read_unlock(&global_atomic_rcu_context);
}

static inline void call_rcu(struct rcu_head *rcuhead, void (*func)(struct rcu_head *))
{
	call_atomic_rcu(&global_atomic_rcu_context, rcuhead, func);
}

etc.. Should take care of that, no?

> > Now I realize that the presented RCU implementation has a different
> > grace period method than the existing ones that use the timer tick to
> > drive the state machine, so 2) might not be too relevant here. But maybe
> > we can do something with different grace periods too.
> > 
> > Anyway, just an idea because I always get a little offended at the hard
> > coded global variables in all these RCU implementations :-)
> 
> I am thinking in terms of adding a synchronize_rcu_bh() with the desired
> properties.  That way we avoid yet another RCU flavor.  (What can I say?
> I got carried away!)  Also, since the rcu-bh flavor is used only by
> networking, we have a fair amount of freedom to tweak it. 

Right. I was thinking along the lines of providing a watermark (either
nr_queued-based or time-based) and, once it is exceeded, trying to drive it from
read_unlock. Or similar. Unlock-driven RCU implementations have the best
grace-period times ever, except for all the downsides ;-)

>  It will take
> longer than introducing a new flavor, but Steve Hemminger has a good
> solution already, and RCU really isn't the thing to do quick hacks on.

Ok, back on topic :-)

I wouldn't exactly call it a good solution, it does a
for_each_possible_cpu() spin_lock();

 1) that should probably be for_each_online_cpu()

 2) that doesn't scale at all, I'm sure dave's 256-way hurts like mad
    when inserting tons of rules and we do that for every iptable
    modification.

 3) there is no way lockdep can track all that :(

Do we _really_ _really_ __HAVE__ to serialize this? So far I've heard
Patrick say there might be a use case. That doesn't sound like we
should make it dead slow for everybody else.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8)
  2009-04-17  6:14                                                                 ` Eric Dumazet
  (?)
@ 2009-04-17 17:08                                                                 ` Peter Zijlstra
  -1 siblings, 0 replies; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-17 17:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, paulmck, David Miller, kaber, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Fri, 2009-04-17 at 08:14 +0200, Eric Dumazet wrote:
> > Also, please don't call this a 'recursive lock', since it is not a general
> > recursive lock, as pointed out by Linus and Paul.
> > 
> > Second question is about MAX_LOCK_DEPTH
> 
> I meant here the ~256 limit we have on preempt_count, not related to LOCKDEP

Very good point, so 256 nested spin_lock() instances will make the
kernel unhappy -- since we now (almost?) support up to 4096 cpus, this
seems like a no-no.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  8:07                                                                   ` David Miller
  2009-04-17 15:00                                                                     ` Paul E. McKenney
@ 2009-04-17 17:22                                                                     ` Peter Zijlstra
  2009-04-17 17:32                                                                       ` Linus Torvalds
  1 sibling, 1 reply; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-17 17:22 UTC (permalink / raw)
  To: David Miller
  Cc: paulmck, dada1, shemminger, kaber, torvalds, jeff.chua.linux,
	paulus, mingo, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

On Fri, 2009-04-17 at 01:07 -0700, David Miller wrote:
> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Date: Thu, 16 Apr 2009 22:40:32 -0700
> 
> > I completely agree that this RCU is absolutely -not- 2.6.30 material.  ;-)
> 
> I don't understand why we're writing such complicated code.
> 
> Oh I see why, it's because not every arch uses the generic SMP helpers
> yet :-)
> 
> Because if they did universally, we could solve this problem so
> simply, by merely sending a remote softirq to every online cpu.  Once
> those all complete we have enough of a quiesce period, every cpu must
> have exited any netfilter packet processing code path they were in.
> 
> And we could know they complete using an atomic counter or something.

Since it's a full broadcast, we can do that _today_ using on_each_cpu().
But whichever way we turn this, it will be a very expensive operation.
Imagine doing that on your 256-way for every iptables rule change.




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17 17:22                                                                     ` Peter Zijlstra
@ 2009-04-17 17:32                                                                       ` Linus Torvalds
  0 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-17 17:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Miller, paulmck, dada1, shemminger, kaber, jeff.chua.linux,
	paulus, mingo, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers



On Fri, 17 Apr 2009, Peter Zijlstra wrote:
> 
> Since it's a full broadcast, we can do that _today_ using on_each_cpu().
> But whichever way we turn this, it will be a very expensive operation.
> Imagine doing that on your 256-way for every iptables rule change.

Well, you _could_ just have a per-CPU bit of "have I used nf rules since 
the last update", and skip those CPU's. Use memory ordering to check the 
bits (set the bit _before_ looking up a NF rule, and check them _after_ 
doing the update, and have a barrier in between if you really think it 
matters).

Remember: the cost was never about a single filter rule update. The cost 
of a single one is almost immaterial, as long as it's not in hundreds of 
milliseconds. It's the cost of people building up things incrementally 
that caused this thing.

So if you have 200 "iptables" commands in a sequence, and especially 
during bootup, a trivial "has the old rule been ever even looked at on 
this CPU" would already fix the issue. Because it would always be zero in 
the only case where it matters.

This is, of course, what we do for the TLB flushing issue. We don't want 
to send IPI's to all CPU's, and in 99.999% of all cases we don't need to, 
because the other CPU's never even loaded the MM.

			Linus
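
A minimal sketch of the scheme described above, with made-up names (this is
not actual netfilter code):

/* one flag per CPU: has this CPU looked at the nf rules since the
 * last update? */
static DEFINE_PER_CPU(int, nf_rules_seen);

/* packet path, before walking the ruleset */
static inline void nf_note_rules_used(void)
{
	__get_cpu_var(nf_rules_seen) = 1;
	smp_mb();	/* set the bit _before_ looking up a rule */
}

/* update path, after swapping in the new ruleset: only CPUs for which
 * this returns true need the expensive quiescence step, and during
 * boot-time incremental rule loading it is typically false everywhere */
static bool nf_cpu_saw_old_rules(int cpu)
{
	smp_mb();	/* swap the rules _before_ checking the bits */
	return per_cpu(nf_rules_seen, cpu) != 0;
}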

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17 16:51                                                                 ` Peter Zijlstra
@ 2009-04-17 21:29                                                                   ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-17 21:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Fri, Apr 17, 2009 at 06:51:37PM +0200, Peter Zijlstra wrote:
> On Fri, 2009-04-17 at 09:33 -0700, Paul E. McKenney wrote:
> 
> > > One comment: it's again a global thing...
> > > 
> > > I've been playing with the idea for a while now to make all RCU
> > > implementations into proper objects so that you can do things like:
> > > 
> > >   struct atomic_rcu_domain my_rcu_domain = create_atomic_rcu();
> > > 
> > >   atomic_rcu_read_lock(&my_rcu_domain());
> > >   ...
> > > 
> > >   atomic_rcu_read_unlock(&my_rcu_domain());
> > > 
> > > and
> > > 
> > >   call_atomic_rcu(&my_rcu_domain, &my_obj->rcu_head, do_something);
> > > 
> > > etc..
> > > 
> > > We would have:
> > > 
> > >   atomic_rcu  --  'classic' non preemptible RCU (treercu these days)
> > >   sleep_rcu   --  'preemptible' RCU
> > > 
> > > Then have 3 default domains:
> > > 
> > > sched_rcu     -- always atomic_rcu
> > 
> > This is the call_rcu_sched() variant.
> > 
> > > rcu           -- depends on PREEMPT_RCU
> > 
> > This is the call_rcu() variant.
> > 
> > > preempt_rcu   -- always sleep_rcu
> > 
> > I guess that this one could allow sleeping on mutexes...  Does anyone
> > need to do that?
> 
> I almost did a few times, but never quite got the code that needed it
> working well enough for it to go anywhere.

It probably would not be hard to enable preemptable RCU in a
!CONFIG_PREEMPT configuration, which would allow mutexes to be acquired
in these read-side critical sections.  After I fix any relevant bugs,
of course...

> > > This would allow generic code to:
> > >   1) use preemptible RCU for those cases where needed
> > >   2) create smaller RCU domains where needed, such as in this case
> > >   3) mostly do away with SRCU
> > 
> > #3 would be good!  But...
> > 
> > At an API level, there are two differences between SRCU and the other
> > RCU implementations:
> > 
> > a.	The return value from srcu_read_lock() is passed to
> > 	srcu_read_unlock().
> > 
> > b.	There is a control block passed in to each SRCU primitive.
> > 
> > Difference (a) could potentially be taken care of with a few tricks I
> > am trying in the process of getting preemptrcu merged into treercu.
> 
> Right, incrementing one cpu and decrementing another doesn't change the
> sum over all cpus :-)

Well, I am trying to get rid of the summing over all CPUs -- really hard
to make a reasonable hierarchy that way.  But yes.  ;-)

> > Your approach to (b) certainly makes it uniform, there are >500
> > occurrences of rcu_read_lock() and rcu_read_unlock() each, but only
> > a very few occurrences of srcu_read_lock() and srcu_read_unlock()
> > (like exactly one each!).  So adding an argument to rcu_read_lock()
> > does not sound at all reasonable.
> 
> static inline void rcu_read_lock(void)
> {
> 	atomic_rcu_read_lock(&global_atomic_rcu_context);
> }
> 
> static inline void rcu_read_unlock(void)
> {
> 	atomic_rcu_read_unlock(&global_atomic_rcu_context);
> }
> 
> static inline void call_rcu(struct rcu_head *rcuhead, void (*func)(struct rcu_head *))
> {
> 	call_atomic_rcu(&global_atomic_rcu_context, rcuhead, func);
> }
> 
> etc.. Should take care of that, no?

Given that for classic and hierarchical RCU, rcu_read_lock() and
rcu_read_unlock() just map to preempt_disable() and preempt_enable(),
how is this helping?
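
(For reference, with classic or tree RCU the read-side primitives really do
boil down to roughly

	#define __rcu_read_lock()	preempt_disable()
	#define __rcu_read_unlock()	preempt_enable()

so routing them through a global domain object would add indirection without
changing what they do.)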

> > > Now I realize that the presented RCU implementation has a different
> > > grace period method than the existing ones that use the timer tick to
> > > drive the state machine, so 2) might not be too relevant here. But maybe
> > > we can do something with different grace periods too.
> > > 
> > > Anyway, just an idea because I always get a little offended at the hard
> > > coded global variables in all these RCU implementations :-)
> > 
> > I am thinking in terms of adding a synchronize_rcu_bh() with the desired
> > properties.  That way we avoid yet another RCU flavor.  (What can I say?
> > I got carried away!)  Also, since the rcu-bh flavor is used only by
> > networking, we have a fair amount of freedom to tweak it. 
> 
> Right. I was thinking along the lines of providing a watermark (either
> nr_queued-based or time-based) and, once it is exceeded, trying to drive it from
> read_unlock. Or similar. Unlock-driven RCU implementations have the best
> grace-period times ever, except for all the downsides ;-)

Jim Houston did an unlock-driven implementation some years back:

http://marc.theaimsgroup.com/?l=linux-kernel&m=109387402400673&w=2

The read-side overhead can be a problem.  And I have gotten grace-period
latencies under 100ns without driving the grace period from the update
side.  Of course, these implementations have their downsides as well.  ;-)

> >  It will take
> > longer than introducing a new flavor, but Steve Hemminger has a good
> > solution already, and RCU really isn't the thing to do quick hacks on.
> 
> Ok, back on topic :-)
> 
> I wouldn't exactly call it a good solution,

Compared to the global rwlock, it is a wonderful solution.  ;-)

>                                             it does a
> for_each_possible_cpu() spin_lock();
> 
>  1) that should probably be for_each_online_cpu()

In principle I agree.  In practice, this is an infrequently executed
slow path, right?

Or are you concerned about real-time latencies while loading new
iptables or some such?

>  2) that doesn't scale at all, I'm sure dave's 256-way hurts like mad
>     when inserting tons of rules and we do that for every iptable
>     modification.

There of course is a point beyond which this method is slower than
a full RCU grace period.  But I bet that Dave's 256-way machine
is not anywhere near big enough to reach that point.  Maybe he can
try it and tell us what happens.  ;-)

>  3) there is no way lockdep can track all that :(

This is a good point.  I understand the need to acquire the locks, but
am not fully clear on why we cannot acquire one CPU's lock, gather its
counters, release that lock, acquire the next CPU's lock, and so on.
Maybe a code-complexity issue?
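
Concretely, the one-CPU-at-a-time variant being asked about would look
something like this (sketch only; lock names as in the v0.8 snippet quoted
earlier in the thread, with add_entry_to_counter standing in for the
per-entry helper, and counters[] assumed zeroed by the caller):

static void get_counters_one_cpu_at_a_time(const struct xt_table_info *t,
					   struct xt_counters counters[])
{
	unsigned int cpu, i;

	for_each_possible_cpu(cpu) {
		struct xt_lock *lock = &per_cpu(xt_info_locks, cpu);

		i = 0;
		spin_lock_bh(&lock->lock);	/* only this CPU's lock */
		IPT_ENTRY_ITERATE(t->entries[cpu], t->size,
				  add_entry_to_counter, counters, &i);
		spin_unlock_bh(&lock->lock);
	}
	/* the result is no longer a single atomic snapshot across CPUs,
	 * which is exactly the semantic question discussed above */
}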

Please keep in mind that we are trying to hit 2.6.30 with this fix, so
simplicity is even more important than it usually is.  Yes, I have some
idea of the irony of me saying much of anything about simplicity.  ;-)

> Do we _really_ _really_ __HAVE__ to serialize this? So far I've heard
> Patrick say there might be a use case. That doesn't sound like we
> should make it dead slow for everybody else.

We are making it faster than it used to be by quite a bit by getting rid
of the global lock, so this does sound like a good approach.  Here is my
reasoning:

1.	The update-side performance is good, as verified by Jeff Chua.

2.	The per-packet read-side performance is slowed by roughly the
	overhead of an uncontended lock, which comes to about 60ns
	on my laptop.  At some point, this 60ns will become critical,
	but I do not believe that we are there yet.

	When it does become critical, a new patch can be produced.
	Such a patch can of course be backported as required -- this
	is a reasonably isolated piece of code, right?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
                                                                               ` (2 preceding siblings ...)
  2009-04-17  6:12                                                             ` Peter Zijlstra
@ 2009-04-18  9:40                                                             ` Evgeniy Polyakov
  2009-04-18 14:14                                                               ` Paul E. McKenney
  3 siblings, 1 reply; 254+ messages in thread
From: Evgeniy Polyakov @ 2009-04-18  9:40 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

Hi.

On Thu, Apr 16, 2009 at 06:28:12PM -0700, Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> +/* Single bit for grace-period index, low-order bits are nesting counter. */
> +#define RCU_FGP_COUNT		1UL
> +#define RCU_FGP_PARITY		(1UL << (sizeof(long) << 2))
> +#define RCU_FGP_NEST_MASK	(RCU_FGP_PARITY - 1)
> +
> +extern long rcu_fgp_ctr;
> +DECLARE_PER_CPU(long, rcu_fgp_active_readers);
> +
> +static inline void rcu_read_lock_fgp(void)
> +{
> +	long tmp;
> +	long *uarp;
> +
> +	preempt_disable();
> +	uarp = &__get_cpu_var(rcu_fgp_active_readers);
> +	tmp = *uarp;
> +	if (likely(!(tmp & RCU_FGP_NEST_MASK)))
> +		*uarp = rcu_fgp_ctr;  /* Outermost rcu_read_lock(). */
> +	else
> +		*uarp = tmp + RCU_FGP_COUNT;  /* Nested rcu_read_lock(). */
> +	barrier();
> +}
> +
> +static inline void rcu_read_unlock_fgp(void)
> +{
> +	barrier();
> +	__get_cpu_var(rcu_fgp_active_readers)--;

Shouldn't it be rcu_fgp_active_readers - RCU_FGP_COUNT?
Although it is 1 by definition, the explicit form makes it clearer
what's going on here.

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3)
  2009-04-18  9:40                                                             ` Evgeniy Polyakov
@ 2009-04-18 14:14                                                               ` Paul E. McKenney
  2009-04-20 17:34                                                                 ` [PATCH] netfilter: use per-cpu recursive lock (v10) Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-18 14:14 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: David Miller, kaber, torvalds, shemminger, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Sat, Apr 18, 2009 at 01:40:01PM +0400, Evgeniy Polyakov wrote:
> Hi.
> 
> On Thu, Apr 16, 2009 at 06:28:12PM -0700, Paul E. McKenney (paulmck@linux.vnet.ibm.com) wrote:
> > +/* Single bit for grace-period index, low-order bits are nesting counter. */
> > +#define RCU_FGP_COUNT		1UL
> > +#define RCU_FGP_PARITY		(1UL << (sizeof(long) << 2))
> > +#define RCU_FGP_NEST_MASK	(RCU_FGP_PARITY - 1)
> > +
> > +extern long rcu_fgp_ctr;
> > +DECLARE_PER_CPU(long, rcu_fgp_active_readers);
> > +
> > +static inline void rcu_read_lock_fgp(void)
> > +{
> > +	long tmp;
> > +	long *uarp;
> > +
> > +	preempt_disable();
> > +	uarp = &__get_cpu_var(rcu_fgp_active_readers);
> > +	tmp = *uarp;
> > +	if (likely(!(tmp & RCU_FGP_NEST_MASK)))
> > +		*uarp = rcu_fgp_ctr;  /* Outermost rcu_read_lock(). */
> > +	else
> > +		*uarp = tmp + RCU_FGP_COUNT;  /* Nested rcu_read_lock(). */
> > +	barrier();
> > +}
> > +
> > +static inline void rcu_read_unlock_fgp(void)
> > +{
> > +	barrier();
> > +	__get_cpu_var(rcu_fgp_active_readers)--;
> 
> Shouldn't it be rcu_fgp_active_readers - RCU_FGP_COUNT?
> Although it is 1 by definition, the explicit form makes it clearer
> what's going on here.

Excellent point, fixed!

						Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-18 14:14                                                               ` Paul E. McKenney
@ 2009-04-20 17:34                                                                 ` Stephen Hemminger
  2009-04-20 18:21                                                                   ` Paul E. McKenney
  2009-04-20 18:25                                                                     ` Eric Dumazet
  0 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-20 17:34 UTC (permalink / raw)
  To: paulmck
  Cc: Evgeniy Polyakov, David Miller, kaber, torvalds, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

This version of x_tables (ip/ip6/arp) locking uses a per-cpu
recursive lock that can be nested. It is sort of like existing kernel_lock,
rwlock_t and even old 2.4 brlock.

"Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
It needs to ensure that the rules are not being changed while packet
is being processed.

"Writer" is used in two cases: first is replacing rules in which case
all packets in flight have to be processed before rules are swapped,
then counters are read from the old (stale) info. Second case is where
counters need to be read on the fly, in this case all CPU's are blocked
from further rule processing until values are aggregated.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2). 

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
Changes from earlier patches.
  - function name changes
  - disable bottom half in info_rdlock

These should still be addressed, but are beyond the scope of this patch:
  - lockdep mapping; really a tradeoff between LOCKDEP special clutter
    and clarity
  - Figure out how to stop sparse warning
  - hot plug CPU case, if kernel is built with large # of CPU's, skip
    the inactive ones; migrate values when CPU is removed.

 include/linux/netfilter/x_tables.h |   10 +--
 net/ipv4/netfilter/arp_tables.c    |  110 +++++++---------------------------
 net/ipv4/netfilter/ip_tables.c     |  110 +++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    |  108 +++++++---------------------------
 net/netfilter/x_tables.c           |  117 ++++++++++++++++++++++++++++++-------
 5 files changed, 174 insertions(+), 281 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-20 07:58:17.609890831 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-20 09:39:34.163891182 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,11 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+extern void xt_info_rdlock_bh(void) __acquires(xt_info_lock);
+extern void xt_info_rdunlock_bh(void) __releases(xt_info_lock);
+extern void xt_info_wrlock_bh(void) __acquires(xt_info_lock);
+extern void xt_info_wrunlock_bh(void) __releases(xt_info_lock);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-20 07:58:17.590949808 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-20 09:25:16.452078280 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -918,60 +916,6 @@ get_counters(const struct xt_table_info 
 				  counters,
 				  &i);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-20 07:58:17.558895273 -0700
+++ b/net/netfilter/x_tables.c	2009-04-20 10:29:00.719320837 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,6 +662,94 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+/*
+ * The info table entries are per-cpu, and are usually updated
+ * only by the current CPU.
+ */
+
+struct xt_info_lock {
+	spinlock_t 	   lock;
+	int   	   	   depth;	/* # readers - 1 */
+};
+static DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
+
+static void xt_info_lock_init(struct xt_info_lock *lock)
+{
+	spin_lock_init(&lock->lock);
+	lock->depth = -1;
+}
+
+/**
+ * xt_info_rdlock_bh - recursive read lock for xt table info
+ *
+ * Filter processing calls xt_info_rdlock_bh which acts like a reader
+ * lock that can be acquired recursively. This only holds off
+ * xt_info_wrlock_bh, not other calls to xt_info_rdlock_bh.
+ */
+void xt_info_rdlock_bh(void)
+{
+	struct xt_info_lock *lock;
+
+	local_bh_disable();
+	lock = &__get_cpu_var(xt_info_locks);
+	if (likely(++lock->depth == 0))
+		spin_lock(&lock->lock);
+}
+EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
+
+/**
+ * xt_info_rdunlock_bh - release recursive table info lock
+ *
+ * Used after filter has updated
+ */
+void xt_info_rdunlock_bh(void)
+{
+	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+
+	BUG_ON(lock->depth < 0);
+	if (likely(--lock->depth < 0))
+		spin_unlock(&lock->lock);
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(xt_info_rdunlock_bh);
+
+/**
+ * xt_info_wrlock_bh - lock xt table info for update
+ *
+ * Locks out all readers, and blocks bottom half
+ */
+void xt_info_wrlock_bh(void)
+{
+	int i;
+
+	local_bh_disable();
+	for_each_possible_cpu(i) {
+		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
+		spin_lock(&lock->lock);
+		BUG_ON(lock->depth != -1);
+	}
+}
+EXPORT_SYMBOL_GPL(xt_info_wrlock_bh);
+
+/**
+ * xt_info_wrunlock_bh - unlock xt table info after update
+ *
+ * Unlocks all readers, and unblocks bottom half
+ */
+void xt_info_wrunlock_bh(void) __releases(&lock->lock)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
+		BUG_ON(lock->depth != -1);
+		spin_unlock(&lock->lock);
+	}
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(xt_info_wrunlock_bh);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
@@ -685,22 +759,21 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	xt_info_wrlock_bh();
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		xt_info_wrunlock_bh();
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	table->private =  newinfo;
+	newinfo->initial_entries = private->initial_entries;
+	xt_info_wrunlock_bh();
 
-	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -734,7 +807,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1149,6 +1221,9 @@ static int __init xt_init(void)
 {
 	int i, rv;
 
+	for_each_possible_cpu(i)
+		xt_info_lock_init(&per_cpu(xt_info_locks, i));
+
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
 		return -ENOMEM;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-20 07:58:17.569948056 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-20 09:29:03.593890771 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -949,64 +949,11 @@ get_counters(const struct xt_table_info 
 	}
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-20 07:58:17.578890388 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-20 09:27:02.254203584 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -734,65 +734,11 @@ static void get_counters(const struct xt
 	}
 }
 
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
- unlock_up_free:
-	mutex_unlock(&t->lock);
 
+ unlock_up_free:
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 17:34                                                                 ` [PATCH] netfilter: use per-cpu recursive lock (v10) Stephen Hemminger
@ 2009-04-20 18:21                                                                   ` Paul E. McKenney
  2009-04-20 18:25                                                                     ` Eric Dumazet
  1 sibling, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-20 18:21 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, David Miller, kaber, torvalds, dada1,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Mon, Apr 20, 2009 at 10:34:14AM -0700, Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested. It is sort of like existing kernel_lock,
> rwlock_t and even old 2.4 brlock.
> 
> "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> It needs to ensure that the rules are not being changed while packet
> is being processed.
> 
> "Writer" is used in two cases: first is replacing rules in which case
> all packets in flight have to be processed before rules are swapped,
> then counters are read from the old (stale) info. Second case is where
> counters need to be read on the fly, in this case all CPU's are blocked
> from further rule processing until values are aggregated.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.

This looks good to me!

But I have to ask the stupid question...  Would it be possible for
the "write" side to acquire the locks one at a time, accumulating and
resetting the counts for that CPU, then advancing to the next CPU?

If this is possible, then lockdep is much happier and the disruption
to readers is much less.
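
Something along these lines, say (untested sketch; xt_info_locks and
struct xt_table_info are taken from the v10 patch above, while
fold_and_reset_cpu_counters() is a made-up helper that would sum and
zero one cpu's entry counters):

static void get_counters_incremental(struct xt_table_info *t,
				     struct xt_counters counters[])
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct xt_info_lock *lock = &per_cpu(xt_info_locks, cpu);

		/* holds off only this cpu's readers, one cpu at a time */
		spin_lock_bh(&lock->lock);
		fold_and_reset_cpu_counters(t, counters, cpu);
		spin_unlock_bh(&lock->lock);
	}
}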

							Thanx, Paul

> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
> 
> ---
> Changes from earlier patches.
>   - function name changes
>   - disable bottom half in info_rdlock
> 
> These should still be addressed but beyond the scope of the problem
>   - lockdep mapping; really a tradeoff between LOCKDEP special clutter
>     and clarity
>   - Figure out how to stop sparse warning
>   - hot plug CPU case, if kernel is built with large # of CPU's, skip
>     the inactive ones; migrate values when CPU is removed.
> 
>  include/linux/netfilter/x_tables.h |   10 +--
>  net/ipv4/netfilter/arp_tables.c    |  110 +++++++---------------------------
>  net/ipv4/netfilter/ip_tables.c     |  110 +++++++---------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  108 +++++++---------------------------
>  net/netfilter/x_tables.c           |  117 ++++++++++++++++++++++++++++++-------
>  5 files changed, 174 insertions(+), 281 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-04-20 07:58:17.609890831 -0700
> +++ b/include/linux/netfilter/x_tables.h	2009-04-20 09:39:34.163891182 -0700
> @@ -354,9 +354,6 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
> 
> -	/* Lock for the curtain */
> -	struct mutex lock;
> -
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> 
> @@ -434,8 +431,11 @@ extern void xt_proto_fini(struct net *ne
> 
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> -				    struct xt_table_info *new);
> +
> +extern void xt_info_rdlock_bh(void) __acquires(xt_info_lock);
> +extern void xt_info_rdunlock_bh(void) __releases(xt_info_lock);
> +extern void xt_info_wrlock_bh(void) __acquires(xt_info_lock);
> +extern void xt_info_wrunlock_bh(void) __releases(xt_info_lock);
> 
>  /*
>   * This helper is performance critical and must be inlined
> --- a/net/ipv4/netfilter/ip_tables.c	2009-04-20 07:58:17.590949808 -0700
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-04-20 09:25:16.452078280 -0700
> @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
>  	tgpar.hooknum = hook;
> 
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
> 
>  	e = get_entry(table_base, private->hook_entry[hook]);
> 
> @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
> 
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -918,60 +916,6 @@ get_counters(const struct xt_table_info 
>  				  counters,
>  				  &i);
>  	}
> -
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -
> -static inline int
> -zero_entry_counter(struct ipt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> -	}
>  }
> 
>  static struct xt_counters * alloc_counters(struct xt_table *table)
> @@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
> 
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte
>  	counters = vmalloc_node(countersize, numa_node_id());
> 
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> +		return ERR_PTR(-ENOMEM);
> 
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	xt_info_wrlock_bh();
> +	get_counters(private, counters);
> +	xt_info_wrunlock_bh();
> 
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
> 
>  static int
> @@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
> 
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> 
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
> 
> -	mutex_lock(&t->lock);
> +	xt_info_wrlock_bh();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
> 
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	loc_cpu_entry = private->entries[smp_processor_id()];
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> +	xt_info_wrunlock_bh();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/netfilter/x_tables.c	2009-04-20 07:58:17.558895273 -0700
> +++ b/net/netfilter/x_tables.c	2009-04-20 10:29:00.719320837 -0700
> @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
>  }
>  EXPORT_SYMBOL(xt_free_table_info);
> 
> -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> -			     struct xt_table_info *newinfo)
> -{
> -	unsigned int cpu;
> -
> -	for_each_possible_cpu(cpu) {
> -		void *p = oldinfo->entries[cpu];
> -		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> -		newinfo->entries[cpu] = p;
> -	}
> -
> -}
> -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> -
>  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
>  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
>  				    const char *name)
> @@ -676,6 +662,94 @@ void xt_compat_unlock(u_int8_t af)
>  EXPORT_SYMBOL_GPL(xt_compat_unlock);
>  #endif
> 
> +/*
> + * The info table entries are per-cpu, and are usually updated
> + * only by the current CPU.
> + */
> +
> +struct xt_info_lock {
> +	spinlock_t 	   lock;
> +	int   	   	   depth;	/* # readers - 1 */
> +};
> +static DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
> +
> +static void xt_info_lock_init(struct xt_info_lock *lock)
> +{
> +	spin_lock_init(&lock->lock);
> +	lock->depth = -1;
> +}
> +
> +/**
> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> + *
> + * Filter processing calls xt_info_lock_bh which acts like a reader
> + * lock that can be locked recursively acquired. This only holds off
> + * xt_info_lock_all, not other calls to xt_info_lock_bh.
> + */
> +void xt_info_rdlock_bh(void)
> +{
> +	struct xt_info_lock *lock;
> +
> +	local_bh_disable();
> +	lock = &__get_cpu_var(xt_info_locks);
> +	if (likely(++lock->depth == 0))
> +		spin_lock(&lock->lock);
> +}
> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> +
> +/**
> + * xt_info_rdunlock_bh - release recursive table info lock
> + *
> + * Used after filter has updated
> + */
> +void xt_info_rdunlock_bh(void)
> +{
> +	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
> +
> +	BUG_ON(lock->depth < 0);
> +	if (likely(--lock->depth < 0))
> +		spin_unlock(&lock->lock);
> +	local_bh_enable();
> +}
> +EXPORT_SYMBOL_GPL(xt_info_rdunlock_bh);
> +
> +/**
> + * xt_info_wrlock_bh - lock xt table info for update
> + *
> + * Locks out all readers, and blocks bottom half
> + */
> +void xt_info_wrlock_bh(void)
> +{
> +	int i;
> +
> +	local_bh_disable();
> +	for_each_possible_cpu(i) {
> +		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
> +		spin_lock(&lock->lock);
> +		BUG_ON(lock->depth != -1);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(xt_info_wrlock_bh);
> +
> +/**
> + * xt_info_wrunlock_bh - lock xt table info for update
> + *
> + * Unlocks all readers, and unblocks bottom half
> + */
> +void xt_info_wrunlock_bh(void) __releases(&lock->lock)
> +{
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
> +		BUG_ON(lock->depth != -1);
> +		spin_unlock(&lock->lock);
> +	}
> +	local_bh_enable();
> +}
> +EXPORT_SYMBOL_GPL(xt_info_wrunlock_bh);
> +
> +
>  struct xt_table_info *
>  xt_replace_table(struct xt_table *table,
>  	      unsigned int num_counters,
> @@ -685,22 +759,21 @@ xt_replace_table(struct xt_table *table,
>  	struct xt_table_info *oldinfo, *private;
> 
>  	/* Do the substitution. */
> -	mutex_lock(&table->lock);
> +	xt_info_wrlock_bh();
>  	private = table->private;
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> -		mutex_unlock(&table->lock);
> +		xt_info_wrunlock_bh();
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
>  	oldinfo = private;
> -	rcu_assign_pointer(table->private, newinfo);
> -	newinfo->initial_entries = oldinfo->initial_entries;
> -	mutex_unlock(&table->lock);
> +	table->private =  newinfo;
> +	newinfo->initial_entries = private->initial_entries;
> +	xt_info_wrunlock_bh();
> 
> -	synchronize_net();
>  	return oldinfo;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
> @@ -734,7 +807,6 @@ struct xt_table *xt_register_table(struc
> 
>  	/* Simplifies replace_table code. */
>  	table->private = bootstrap;
> -	mutex_init(&table->lock);
> 
>  	if (!xt_replace_table(table, 0, newinfo, &ret))
>  		goto unlock;
> @@ -1149,6 +1221,9 @@ static int __init xt_init(void)
>  {
>  	int i, rv;
> 
> +	for_each_possible_cpu(i)
> +		xt_info_lock_init(&per_cpu(xt_info_locks, i));
> +
>  	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
>  	if (!xt)
>  		return -ENOMEM;
> --- a/net/ipv6/netfilter/ip6_tables.c	2009-04-20 07:58:17.569948056 -0700
> +++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-20 09:29:03.593890771 -0700
> @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
> 
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> 
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
> 
>  	e = get_entry(table_base, private->hook_entry[hook]);
> 
> @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
>  #ifdef CONFIG_NETFILTER_DEBUG
>  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
>  #endif
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
> 
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -949,64 +949,11 @@ get_counters(const struct xt_table_info 
>  	}
>  }
> 
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ip6t_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IP6T_ENTRY_ITERATE(t->entries[cpu],
> -			   t->size,
> -			   add_counter_to_entry,
> -			   counters,
> -			   &i);
> -	local_bh_enable();
> -}
> -
> -static inline int
> -zero_entry_counter(struct ip6t_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				   zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters *alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
> 
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter
>  	counters = vmalloc_node(countersize, numa_node_id());
> 
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> +		return ERR_PTR(-ENOMEM);
> 
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	xt_info_wrlock_bh();
> +	get_counters(private, counters);
> +	xt_info_wrunlock_bh();
> 
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
> 
>  static int
> @@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
> 
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ip6t_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		int compat)
> @@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
> 
> -	mutex_lock(&t->lock);
> +	xt_info_wrlock_bh();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
> 
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	loc_cpu_entry = private->entries[smp_processor_id()];
>  	IP6T_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> +	xt_info_wrunlock_bh();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/ipv4/netfilter/arp_tables.c	2009-04-20 07:58:17.578890388 -0700
> +++ b/net/ipv4/netfilter/arp_tables.c	2009-04-20 09:27:02.254203584 -0700
> @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
>  	indev = in ? in->name : nulldevname;
>  	outdev = out ? out->name : nulldevname;
> 
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
> 
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  	back = get_entry(table_base, private->underflow[hook]);
> @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
> 
>  			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
>  				(2 * skb->dev->addr_len);
> +
>  			ADD_COUNTER(e->counters, hdr_len, 1);
> 
>  			t = arpt_get_target(e);
> @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
> 
>  	if (hotdrop)
>  		return NF_DROP;
> @@ -734,65 +734,11 @@ static void get_counters(const struct xt
>  	}
>  }
> 
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct arpt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	ARPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
> -	local_bh_enable();
> -}
> -
> -static inline int
> -zero_entry_counter(struct arpt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters *alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
> 
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	 * (other than comefrom, which userspace doesn't care
> @@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter
>  	counters = vmalloc_node(countersize, numa_node_id());
> 
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> +		return ERR_PTR(-ENOMEM);
> 
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	xt_info_wrlock_bh();
> +	get_counters(private, counters);
> +	xt_info_wrunlock_bh();
> 
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
> 
>  static int copy_entries_to_user(unsigned int total_size,
> @@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v
>  	return ret;
>  }
> 
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct arpt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  			   int compat)
>  {
> @@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n
>  		goto free;
>  	}
> 
> -	mutex_lock(&t->lock);
> +	xt_info_wrlock_bh();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
> 
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
>  	loc_cpu_entry = private->entries[smp_processor_id()];
> @@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n
>  			   add_counter_to_entry,
>  			   paddc,
>  			   &i);
> -	preempt_enable();
> - unlock_up_free:
> -	mutex_unlock(&t->lock);
> 
> + unlock_up_free:
> +	xt_info_wrunlock_bh();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 17:34                                                                 ` [PATCH] netfilter: use per-cpu recursive lock (v10) Stephen Hemminger
@ 2009-04-20 18:25                                                                     ` Eric Dumazet
  2009-04-20 18:25                                                                     ` Eric Dumazet
  1 sibling, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-20 18:25 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: paulmck, Evgeniy Polyakov, David Miller, kaber, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested. It is sort of like existing kernel_lock,
> rwlock_t and even old 2.4 brlock.
> 
> "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> It needs to ensure that the rules are not being changed while packet
> is being processed.
> 
> "Writer" is used in two cases: first is replacing rules in which case
> all packets in flight have to be processed before rules are swapped,
> then counters are read from the old (stale) info. Second case is where
> counters need to be read on the fly, in this case all CPU's are blocked
> from further rule processing until values are aggregated.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
> 
> ---
> Changes from earlier patches.
>   - function name changes
>   - disable bottom half in info_rdlock

OK, but we still have a problem on machines with >= 250 cpus,
because calling spin_lock() 250 times is going to overflow preempt_count,
as each spin_lock() increases preempt_count by one.

PREEMPT_MASK: 0x000000ff

add_preempt_count() should warn us about this overflow if CONFIG_DEBUG_PREEMPT is set

#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
         */
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
#endif


My suggestion (in a previous mail) was to call preempt_enable() after each spin_lock()
to drop the count back down, and of course do the reverse (preempt_disable()
before spin_unlock()) on the unlock path.


> +/**
> + * xt_info_wrlock_bh - lock xt table info for update
> + *
> + * Locks out all readers, and blocks bottom half
> + */
> +void xt_info_wrlock_bh(void)
> +{
> +	int i;
> +
> +	local_bh_disable();
 
/* at this point , preemption is disabled... */


> +	for_each_possible_cpu(i) {
> +		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
> +		spin_lock(&lock->lock);
	
		preempt_enable(); /* avoid preempt count overflow */
		
> +		BUG_ON(lock->depth != -1);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(xt_info_wrlock_bh);
> +
> +/**
> + * xt_info_wrunlock_bh - lock xt table info for update
> + *
> + * Unlocks all readers, and unblocks bottom half
> + */
> +void xt_info_wrunlock_bh(void) __releases(&lock->lock)
> +{
> +	int i;
> +
> +	for_each_possible_cpu(i) {
> +		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
> +		BUG_ON(lock->depth != -1);

		preempt_disable() /* restore preempt count lowered in xt_info_wrlock_bh */

> +		spin_unlock(&lock->lock);
> +	}
> +	local_bh_enable();
> +}
> +EXPORT_SYMBOL_GPL(xt_info_wrunlock_bh);
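
Spelled out, the writer paths would then look something like this
(untested, just the v10 code with the two tweaks above folded in):

void xt_info_wrlock_bh(void)
{
	int i;

	local_bh_disable();
	for_each_possible_cpu(i) {
		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);

		spin_lock(&lock->lock);
		preempt_enable();	/* avoid preempt count overflow */
		BUG_ON(lock->depth != -1);
	}
}

void xt_info_wrunlock_bh(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);

		BUG_ON(lock->depth != -1);
		preempt_disable();	/* restore count lowered in wrlock */
		spin_unlock(&lock->lock);
	}
	local_bh_enable();
}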


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 18:25                                                                     ` Eric Dumazet
  (?)
@ 2009-04-20 20:32                                                                     ` Stephen Hemminger
  -1 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-20 20:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Evgeniy Polyakov, David Miller, kaber, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Mon, 20 Apr 2009 20:25:14 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger a écrit :
> > This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> > recursive lock that can be nested. It is sort of like existing kernel_lock,
> > rwlock_t and even old 2.4 brlock.
> > 
> > "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> > It needs to ensure that the rules are not being changed while packet
> > is being processed.
> > 
> > "Writer" is used in two cases: first is replacing rules in which case
> > all packets in flight have to be processed before rules are swapped,
> > then counters are read from the old (stale) info. Second case is where
> > counters need to be read on the fly, in this case all CPU's are blocked
> > from further rule processing until values are aggregated.
> > 
> > The idea for this came from an earlier version done by Eric Dumazet.
> > Locking is done per-cpu, the fast path locks on the current cpu
> > and updates counters.  This reduces the contention of a
> > single reader lock (in 2.6.29) without the delay of synchronize_net()
> > (in 2.6.30-rc2). 
> > 
> > The mutex that was added for 2.6.30 in xt_table is unnecessary since
> > there already is a mutex for xt[af].mutex that is held.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
> > 
> > ---
> > Changes from earlier patches.
> >   - function name changes
> >   - disable bottom half in info_rdlock
> 
> OK, but we still have a problem on machines with >= 250 cpus,
> because calling 250 times spin_lock() is going to overflow preempt_count,
> as each spin_lock() increases preempt_count by one.

Ok, not that I have one of those.

The problem lockdep has is that it seems to associate all the
locks with the same name.
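
Maybe giving each per-cpu lock its own lockdep class would quiet that;
untested sketch (and it trades away some lockdep coverage, so it may
well not be the right answer):

static struct lock_class_key xt_info_lock_keys[NR_CPUS];

static void xt_info_lock_init(struct xt_info_lock *lock, int cpu)
{
	spin_lock_init(&lock->lock);
	lockdep_set_class(&lock->lock, &xt_info_lock_keys[cpu]);
	lock->depth = -1;
}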

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 18:25                                                                     ` Eric Dumazet
  (?)
  (?)
@ 2009-04-20 20:42                                                                     ` Stephen Hemminger
  2009-04-20 21:05                                                                         ` Paul E. McKenney
  -1 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-20 20:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: paulmck, Evgeniy Polyakov, David Miller, kaber, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Mon, 20 Apr 2009 20:25:14 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger a écrit :
> > This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> > recursive lock that can be nested. It is sort of like existing kernel_lock,
> > rwlock_t and even old 2.4 brlock.
> > 
> > "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> > It needs to ensure that the rules are not being changed while packet
> > is being processed.
> > 
> > "Writer" is used in two cases: first is replacing rules in which case
> > all packets in flight have to be processed before rules are swapped,
> > then counters are read from the old (stale) info. Second case is where
> > counters need to be read on the fly, in this case all CPU's are blocked
> > from further rule processing until values are aggregated.
> > 
> > The idea for this came from an earlier version done by Eric Dumazet.
> > Locking is done per-cpu, the fast path locks on the current cpu
> > and updates counters.  This reduces the contention of a
> > single reader lock (in 2.6.29) without the delay of synchronize_net()
> > (in 2.6.30-rc2). 
> > 
> > The mutex that was added for 2.6.30 in xt_table is unnecessary since
> > there already is a mutex for xt[af].mutex that is held.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
> > 
> > ---
> > Changes from earlier patches.
> >   - function name changes
> >   - disable bottom half in info_rdlock
> 
> OK, but we still have a problem on machines with >= 250 cpus,
> because calling 250 times spin_lock() is going to overflow preempt_count,
> as each spin_lock() increases preempt_count by one.
> 
> PREEMPT_MASK: 0x000000ff
> 
> add_preempt_count() should warn us about this overflow if CONFIG_DEBUG_PREEMPT is set

Wouldn't a system with 256 or more CPUs be faster without preempt?  If there
are that many CPUs, it is faster to do the work on another cpu and avoid the
overhead of a hotly updated preempt count.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 20:42                                                                     ` Stephen Hemminger
@ 2009-04-20 21:05                                                                         ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-20 21:05 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Evgeniy Polyakov, David Miller, kaber, torvalds,
	jeff.chua.linux, paulus, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Mon, Apr 20, 2009 at 01:42:49PM -0700, Stephen Hemminger wrote:
> On Mon, 20 Apr 2009 20:25:14 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
> > Stephen Hemminger a écrit :
> > > This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> > > recursive lock that can be nested. It is sort of like existing kernel_lock,
> > > rwlock_t and even old 2.4 brlock.
> > > 
> > > "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> > > It needs to ensure that the rules are not being changed while packet
> > > is being processed.
> > > 
> > > "Writer" is used in two cases: first is replacing rules in which case
> > > all packets in flight have to be processed before rules are swapped,
> > > then counters are read from the old (stale) info. Second case is where
> > > counters need to be read on the fly, in this case all CPU's are blocked
> > > from further rule processing until values are aggregated.
> > > 
> > > The idea for this came from an earlier version done by Eric Dumazet.
> > > Locking is done per-cpu, the fast path locks on the current cpu
> > > and updates counters.  This reduces the contention of a
> > > single reader lock (in 2.6.29) without the delay of synchronize_net()
> > > (in 2.6.30-rc2). 
> > > 
> > > The mutex that was added for 2.6.30 in xt_table is unnecessary since
> > > there already is a mutex for xt[af].mutex that is held.
> > > 
> > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com
> > > 
> > > ---
> > > Changes from earlier patches.
> > >   - function name changes
> > >   - disable bottom half in info_rdlock
> > 
> > OK, but we still have a problem on machines with >= 250 cpus,
> > because calling 250 times spin_lock() is going to overflow preempt_count,
> > as each spin_lock() increases preempt_count by one.
> > 
> > PREEMPT_MASK: 0x000000ff
> > 
> > add_preempt_count() should warn us about this overflow if CONFIG_DEBUG_PREEMPT is set
> 
> Wouldn't 256 or higher CPU system be faster without preempt?  If there
> are that many CPU's, it is faster to do the work on other cpu and avoid
> the overhead of a hotly updated preempt count.

The preempt count is maintained per-CPU, so it has low overhead.  The
problem is that for CONFIG_PREEMPT builds, the preempt disabling is
built into spin_lock().
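
From memory, the generic path is roughly this (kernel/spinlock.c,
paraphrased rather than quoted exactly):

void __lockfunc _spin_lock(spinlock_t *lock)
{
	preempt_disable();	/* <- the increment in question */
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}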

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 18:25                                                                     ` Eric Dumazet
                                                                                       ` (2 preceding siblings ...)
  (?)
@ 2009-04-20 21:23                                                                     ` Paul Mackerras
  2009-04-20 21:58                                                                       ` Paul E. McKenney
  -1 siblings, 1 reply; 254+ messages in thread
From: Paul Mackerras @ 2009-04-20 21:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, paulmck, Evgeniy Polyakov, David Miller,
	kaber, torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

Eric Dumazet writes:

> OK, but we still have a problem on machines with >= 250 cpus,
> because calling 250 times spin_lock() is going to overflow preempt_count,
> as each spin_lock() increases preempt_count by one.

Huh?  Each cpu has its own separate preempt_count.

Paul.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 21:23                                                                     ` Paul Mackerras
@ 2009-04-20 21:58                                                                       ` Paul E. McKenney
  2009-04-20 22:41                                                                         ` Paul Mackerras
  0 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-20 21:58 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Eric Dumazet, Stephen Hemminger, Evgeniy Polyakov, David Miller,
	kaber, torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Tue, Apr 21, 2009 at 07:23:31AM +1000, Paul Mackerras wrote:
> Eric Dumazet writes:
> 
> > OK, but we still have a problem on machines with >= 250 cpus,
> > because calling 250 times spin_lock() is going to overflow preempt_count,
> > as each spin_lock() increases preempt_count by one.
> 
> Huh?  Each cpu has its own separate preempt_count.

But a single CPU is acquiring one lock per CPU, so all the increments
are to one CPU's preempt_count.  :-(

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 21:58                                                                       ` Paul E. McKenney
@ 2009-04-20 22:41                                                                         ` Paul Mackerras
  2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
  2009-04-20 23:44                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v10) Paul E. McKenney
  0 siblings, 2 replies; 254+ messages in thread
From: Paul Mackerras @ 2009-04-20 22:41 UTC (permalink / raw)
  To: paulmck
  Cc: Eric Dumazet, Stephen Hemminger, Evgeniy Polyakov, David Miller,
	kaber, torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

Paul E. McKenney writes:

> But a single CPU is acquiring one lock per CPU, so all the increments
> are to one CPU's preempt_count.  :-(

OK, I see, so a task can't take more than 255 spinlocks without
overflowing the preempt count, which seems a bit limiting.

There are 6 free bits in the preempt_count currently, so the preempt
count could be expanded to 14 bits, which would be enough for all
current systems.  Beyond that I guess we could make preempt_count be a
long and allow bigger counts on 64-bit architectures.
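
Spelling out the arithmetic (based only on the PREEMPT_MASK value quoted
earlier in the thread; an illustration, not new kernel code):

	#define PREEMPT_MASK	0x000000ff	/* 8 bits: nesting 0..255 */

	/* One spin_lock() per possible CPU from a single context means
	 * NR_CPUS increments of that context's preempt_count, so the low
	 * byte wraps once NR_CPUS reaches 256; 14 bits would raise the
	 * limit to 16383 nested locks. */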

Paul.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-20 22:41                                                                         ` Paul Mackerras
@ 2009-04-20 23:01                                                                           ` Stephen Hemminger
  2009-04-21  3:41                                                                             ` Lai Jiangshan
                                                                                               ` (3 more replies)
  2009-04-20 23:44                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v10) Paul E. McKenney
  1 sibling, 4 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-20 23:01 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: paulmck, Eric Dumazet, Evgeniy Polyakov, David Miller, kaber,
	torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers


This version of x_tables (ip/ip6/arp) locking uses a per-cpu
recursive lock that can be nested. It is sort of like existing kernel_lock,
rwlock_t and even old 2.4 brlock.

"Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
It needs to ensure that the rules are not being changed while packet
is being processed.

"Writer" is used in two cases: first is replacing rules in which case
all packets in flight have to be processed before rules are swapped,
then counters are read from the old (stale) info. Second case is where
counters need to be read on the fly, in this case all CPU's are blocked
from further rule processing until values are aggregated.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters.  This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2). 

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
CHANGES 
  - optimize for UP
  - disable bottom half in info_rdlock
  - prevent preempt count overflow
  - turn off lockdep in writer to avoid bogus warning
  - optimize unlock_bh

TODO
  - Figure out how to stop sparse warnings
  - hot plug CPU case, if kernel is built with large # of CPU's, skip
    the inactive ones; migrate values when CPU is removed.


 include/linux/netfilter/x_tables.h |   19 +++--
 net/ipv4/netfilter/arp_tables.c    |  110 ++++++-----------------------
 net/ipv4/netfilter/ip_tables.c     |  110 ++++++-----------------------
 net/ipv6/netfilter/ip6_tables.c    |  108 ++++++----------------------
 net/netfilter/x_tables.c           |  138 +++++++++++++++++++++++++++++++------
 5 files changed, 204 insertions(+), 281 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-20 13:57:28.281199339 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-20 15:33:55.664465517 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,20 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+#ifdef CONFIG_SMP
+extern void xt_info_rdlock_bh(void) __acquires(xt_table_info_lock);
+extern void xt_info_rdunlock_bh(void) __releases(xt_table_info_lock);
+extern void xt_info_wrlock_bh(void) __acquires(xt_table_info_lock);
+extern void xt_info_wrunlock_bh(void) __releases(xt_table_info_lock);
+#else
+extern rwlock_t xt_table_info_lock;
+
+#define xt_info_rdlock_bh()	read_lock_bh(&xt_table_info_lock)
+#define xt_info_rdunlock_bh()	read_unlock_bh(&xt_table_info_lock)
+#define xt_info_wrlock_bh()	write_lock_bh(&xt_table_info_lock)
+#define xt_info_wrunlock_bh()	write_unlock_bh(&xt_table_info_lock)
+#endif
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-20 13:57:28.270221233 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-20 13:57:30.437670996 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -918,60 +916,6 @@ get_counters(const struct xt_table_info 
 				  counters,
 				  &i);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-20 13:57:28.237225858 -0700
+++ b/net/netfilter/x_tables.c	2009-04-20 15:40:54.316348074 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,6 +662,113 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+#ifdef CONFIG_SMP
+/*
+ * The info table entries are per-cpu, and are usually updated
+ * only by the current CPU.
+ */
+
+struct xt_info_lock {
+	spinlock_t 	   lock;
+	int   	   	   depth;	/* # readers - 1 */
+};
+static DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
+
+
+static inline void xt_info_lock_init(struct xt_info_lock *lock)
+{
+	spin_lock_init(&lock->lock);
+	lock->depth = -1;
+}
+
+/**
+ * xt_table_info_rdlock_bh - recursive read lock for xt table info
+ *
+ * Table processing calls this to hold off any changes to table
+ * (on current CPU). Always leaves with bottom half disabled.
+ * If called recursively, then assumes bh/preempt already disabled.
+ */
+void xt_info_rdlock_bh(void)
+{
+	struct xt_info_lock *lock;
+
+	preempt_disable();
+	lock = &__get_cpu_var(xt_info_locks);
+	if (likely(++lock->depth == 0))
+		spin_lock_bh(&lock->lock);
+	preempt_enable_no_resched();
+}
+EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
+
+/**
+ * xt_info_rdunlock_bh - release recursive table info lock
+ *
+ * Used after filter has updated
+ */
+void xt_info_rdunlock_bh(void)
+{
+	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+
+	BUG_ON(lock->depth < 0);
+	if (likely(--lock->depth < 0))
+		spin_unlock_bh(&lock->lock);
+}
+EXPORT_SYMBOL_GPL(xt_info_rdunlock_bh);
+
+/**
+ * xt_info_wrlock_bh - lock xt table info for update
+ *
+ * Locks out all readers, and blocks bottom half
+ */
+void xt_info_wrlock_bh(void)
+{
+	unsigned int i;
+
+	local_bh_disable();
+	preempt_disable();
+
+	/* lockdep just can't handle per-cpu locks correctly;
+	 * it falsely reports conflict.
+	 */
+	lockdep_off();
+	for_each_possible_cpu(i) {
+		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
+		spin_lock(&lock->lock);
+		preempt_enable_no_resched(); /* avoid preempt count overflow */
+	}
+	lockdep_on();
+}
+EXPORT_SYMBOL_GPL(xt_info_wrlock_bh);
+
+/**
+ * xt_info_wrunlock_bh - unlock xt table info after update
+ *
+ * Unlocks all readers, and unblocks bottom half
+ */
+void xt_info_wrunlock_bh(void)
+{
+	unsigned int i;
+
+	lockdep_off();
+	for_each_possible_cpu(i) {
+		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
+		preempt_disable();	   /* other part of overflow prevent */
+		spin_unlock(&lock->lock);
+	}
+	lockdep_on();
+	preempt_enable_no_resched();
+	local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(xt_info_wrunlock_bh);
+
+#else /* !CONFIG_SMP */
+/* On UP we can just inline all of the above as a reader/writer lock
+ * and save space/time.
+ */
+DEFINE_RWLOCK(xt_table_info_lock);
+EXPORT_SYMBOL_GPL(xt_table_info_lock);
+#endif /* !CONFIG_SMP */
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
@@ -685,22 +778,21 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	xt_info_wrlock_bh();
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		xt_info_wrunlock_bh();
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	table->private =  newinfo;
+	newinfo->initial_entries = private->initial_entries;
+	xt_info_wrunlock_bh();
 
-	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -734,7 +826,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1149,6 +1240,11 @@ static int __init xt_init(void)
 {
 	int i, rv;
 
+#ifdef CONFIG_SMP
+	for_each_possible_cpu(i)
+		xt_info_lock_init(&per_cpu(xt_info_locks, i));
+#endif
+
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
 		return -ENOMEM;
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-20 13:57:28.249221885 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-20 13:57:30.441246470 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -949,64 +949,11 @@ get_counters(const struct xt_table_info 
 	}
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-20 13:57:28.258205255 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-20 13:57:30.444261842 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -734,65 +734,11 @@ static void get_counters(const struct xt
 	}
 }
 
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
- unlock_up_free:
-	mutex_unlock(&t->lock);
 
+ unlock_up_free:
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v10)
  2009-04-20 22:41                                                                         ` Paul Mackerras
  2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
@ 2009-04-20 23:44                                                                           ` Paul E. McKenney
  1 sibling, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-20 23:44 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: Eric Dumazet, Stephen Hemminger, Evgeniy Polyakov, David Miller,
	kaber, torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

On Tue, Apr 21, 2009 at 08:41:36AM +1000, Paul Mackerras wrote:
> Paul E. McKenney writes:
> 
> > But a single CPU is acquiring one lock per CPU, so all the increments
> > are to one CPU's preempt_count.  :-(
> 
> OK, I see, so a task can't take more than 255 spinlocks without
> overflowing the preempt count, which seems a bit limiting.
> 
> There are 6 free bits in the preempt_count currently, so the preempt
> count could be expanded to 14 bits, which would be enough for all
> current systems.  Beyond that I guess we could make preempt_count be a
> long and allow bigger counts on 64-bit architectures.

Or we use the trick Eric suggested and Steve employed in the most recent
patch.  ;-)

An alternative would be for the update code to acquire but one lock at a
time, but this would likely require another lock to exclude other
updaters and I believe would also require restructuring the count
accumulation.
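
Roughly like this, as a sketch of that alternative (the xt_counter_mutex
name and the accumulate step are made up for illustration):

	static DEFINE_MUTEX(xt_counter_mutex);	/* excludes other updaters */

	static void get_counters_one_at_a_time(const struct xt_table_info *t,
					       struct xt_counters *counters)
	{
		unsigned int cpu;

		mutex_lock(&xt_counter_mutex);
		for_each_possible_cpu(cpu) {
			struct xt_info_lock *lock = &per_cpu(xt_info_locks, cpu);

			spin_lock_bh(&lock->lock);
			/* ... fold this cpu's entry counters into counters[] ... */
			spin_unlock_bh(&lock->lock);
		}
		mutex_unlock(&xt_counter_mutex);
	}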

So Steve's current patch seems a bit less intrusive, overall.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
@ 2009-04-21  3:41                                                                             ` Lai Jiangshan
  2009-04-21  3:56                                                                               ` Eric Dumazet
  2009-04-21  4:59                                                                               ` Eric Dumazet
                                                                                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  3:41 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Stephen Hemminger wrote:
> +/**
> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> + *
> + * Table processing calls this to hold off any changes to table
> + * (on current CPU). Always leaves with bottom half disabled.
> + * If called recursively, then assumes bh/preempt already disabled.
> + */
> +void xt_info_rdlock_bh(void)
> +{
> +	struct xt_info_lock *lock;
> +
> +	preempt_disable();
> +	lock = &__get_cpu_var(xt_info_locks);
> +	if (likely(++lock->depth == 0))

Maybe I missed something. I think softirq may be still enabled here.
So what happen when xt_info_rdlock_bh() called recursively here?

> +		spin_lock_bh(&lock->lock);
> +	preempt_enable_no_resched();
> +}
> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> +

Is this OK for you:

void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	local_bh_disable();
	lock = &__get_cpu_var(xt_info_locks);
	if (likely(++lock->depth == 0))
		spin_lock(&lock->lock);
}

Lai.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  3:41                                                                             ` Lai Jiangshan
@ 2009-04-21  3:56                                                                               ` Eric Dumazet
  2009-04-21  4:15                                                                                 ` Stephen Hemminger
                                                                                                   ` (2 more replies)
  0 siblings, 3 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-21  3:56 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Stephen Hemminger, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Lai Jiangshan wrote:
> Stephen Hemminger wrote:
>> +/**
>> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
>> + *
>> + * Table processing calls this to hold off any changes to table
>> + * (on current CPU). Always leaves with bottom half disabled.
>> + * If called recursively, then assumes bh/preempt already disabled.
>> + */
>> +void xt_info_rdlock_bh(void)
>> +{
>> +	struct xt_info_lock *lock;
>> +
>> +	preempt_disable();
>> +	lock = &__get_cpu_var(xt_info_locks);
>> +	if (likely(++lock->depth == 0))
> 
> Maybe I missed something. I think softirq may be still enabled here.
> So what happen when xt_info_rdlock_bh() called recursively here?

well, first time its called, you are right softirqs are enabled until
the point we call spin_lock_bh(), right after this line :


> 
>> +		spin_lock_bh(&lock->lock);
>> +	preempt_enable_no_resched();

After this line, both softirqs and preempt are disabled.

Future calls to this function temporarily raise the preempt count and decrease it.
(Null effect)

>> +}
>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
>> +
> 
> Is this OK for you:
> 
> void xt_info_rdlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	local_bh_disable();

well, Stephen was trying to not change preempt count for the 2nd, 3rd, 4th?... invocation of this function.
This is how I understood the code.

> 	lock = &__get_cpu_var(xt_info_locks);
> 	if (likely(++lock->depth == 0))
> 		spin_lock(&lock->lock);
> }
> 
> Lai.
> 


Thanks for reviewing, Lai


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  3:56                                                                               ` Eric Dumazet
@ 2009-04-21  4:15                                                                                 ` Stephen Hemminger
  2009-04-21  5:22                                                                                 ` Lai Jiangshan
  2009-04-21  5:34                                                                                   ` Lai Jiangshan
  2 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21  4:15 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Lai Jiangshan, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, 21 Apr 2009 05:56:55 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Lai Jiangshan wrote:
> > Stephen Hemminger wrote:
> >> +/**
> >> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> >> + *
> >> + * Table processing calls this to hold off any changes to table
> >> + * (on current CPU). Always leaves with bottom half disabled.
> >> + * If called recursively, then assumes bh/preempt already disabled.
> >> + */
> >> +void xt_info_rdlock_bh(void)
> >> +{
> >> +	struct xt_info_lock *lock;
> >> +
> >> +	preempt_disable();
> >> +	lock = &__get_cpu_var(xt_info_locks);
> >> +	if (likely(++lock->depth == 0))
> > 
> > Maybe I missed something. I think softirq may be still enabled here.
> > So what happen when xt_info_rdlock_bh() called recursively here?
> 
> well, first time its called, you are right softirqs are enabled until
> the point we call spin_lock_bh(), right after this line :
> 
> 
> > 
> >> +		spin_lock_bh(&lock->lock);
> >> +	preempt_enable_no_resched();
> 
> After this line, both softirqs and preempt are disabled.
> 
> Future calls to this function temporarily raise the preempt count and decrease it.
> (Null effect)
> 
> >> +}
> >> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> >> +
> > 
> > Is this OK for you:
> > 
> > void xt_info_rdlock_bh(void)
> > {
> > 	struct xt_info_lock *lock;
> > 
> > 	local_bh_disable();
> 
> well, Stephen was trying to not change preempt count for the 2nd, 3rd, 4th?... invocation of this function.
> This is how I understood the code.
> 
> > 	lock = &__get_cpu_var(xt_info_locks);
> > 	if (likely(++lock->depth == 0))
> > 		spin_lock(&lock->lock);
> > }
> > 
> > Lai.
> > 

In this version, I was trying to use/preserve the optimizations that
are done in spin_unlock_bh().
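
For reference, spin_unlock_bh() in this kernel expands to roughly the
following (paraphrased from memory of kernel/spinlock.c of that era, so
details may differ), which is what xt_info_wrunlock_bh() open-codes once
across all the per-cpu locks:

	void __lockfunc _spin_unlock_bh(spinlock_t *lock)
	{
		spin_release(&lock->dep_map, 1, _RET_IP_);
		_raw_spin_unlock(lock);
		preempt_enable_no_resched();
		local_bh_enable_ip((unsigned long)__builtin_return_address(0));
	}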

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
@ 2009-04-21  4:59                                                                               ` Eric Dumazet
  2009-04-21  4:59                                                                               ` Eric Dumazet
                                                                                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-21  4:59 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested. It is sort of like existing kernel_lock,
> rwlock_t and even old 2.4 brlock.
> 
> "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> It needs to ensure that the rules are not being changed while packet
> is being processed.
> 
> "Writer" is used in two cases: first is replacing rules in which case
> all packets in flight have to be processed before rules are swapped,
> then counters are read from the old (stale) info. Second case is where
> counters need to be read on the fly, in this case all CPU's are blocked
> from further rule processing until values are aggregated.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

I reviewed this patch and believe it's in quite good shape, thanks Stephen.

Then I tested it on an x86_32 8-cpu machine and got no obvious problems.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Hopefully, the next rcu_bh (or whatever name is used) will permit us
to switch back to pure RCU in 2.6.31
 
oprofile snapshot of a tbench session, with light iptables rules.
(4 rules in INPUT chain, 3 rules on OUTPUT)

xt_info_rdlock_bh() uses 0.6786 % of cpu
xt_info_rdunlock_bh() uses 0.1743 % of cpu


CPU: Core 2, speed 3000.77 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples  cum. samples  %        cum. %     symbol name
1248350  1248350       11.3285  11.3285    copy_from_user
534049   1782399        4.8464  16.1749    copy_to_user
480898   2263297        4.3641  20.5390    __schedule
325581   2588878        2.9546  23.4936    ipt_do_table
312697   2901575        2.8377  26.3312    tcp_ack
309381   3210956        2.8076  29.1388    tcp_sendmsg
248238   3459194        2.2527  31.3915    tcp_v4_rcv
230405   3689599        2.0909  33.4824    tcp_transmit_skb
220638   3910237        2.0022  35.4847    ip_queue_xmit
217099   4127336        1.9701  37.4548    tcp_recvmsg
175885   4303221        1.5961  39.0509    tcp_rcv_established
173112   4476333        1.5710  40.6219    __switch_to
165138   4641471        1.4986  42.1205    sysenter_past_esp
149367   4790838        1.3555  43.4759    dst_release
138619   4929457        1.2579  44.7339    sched_clock_cpu
132724   5062181        1.2044  45.9383    lock_sock_nested
121353   5183534        1.1013  47.0396    nf_iterate
119205   5302739        1.0818  48.1214    netif_receive_skb
118859   5421598        1.0786  49.2000    release_sock
112597   5534195        1.0218  50.2218    __inet_lookup_established
112195   5646390        1.0181  51.2399    sys_socketcall
110018   5756408        0.9984  52.2383    tcp_write_xmit
106466   5862874        0.9662  53.2045    __alloc_skb
93386    5956260        0.8475  54.0519    dev_queue_xmit
89229    6045489        0.8097  54.8617    tcp_event_data_recv
85972    6131461        0.7802  55.6418    local_bh_enable
82882    6214343        0.7521  56.3940    skb_release_data
80898    6295241        0.7341  57.1281    ip_rcv
76380    6371621        0.6931  57.8213    skb_copy_datagram_iovec
74782    6446403        0.6786  58.4999    xt_info_rdlock_bh
73593    6519996        0.6678  59.1677    mod_timer
72884    6592880        0.6614  59.8291    sock_recvmsg
71789    6664669        0.6515  60.4806    __copy_skb_header
70560    6735229        0.6403  61.1209    fget_light
68756    6803985        0.6239  61.7449    get_page_from_freelist
68378    6872363        0.6205  62.3654    put_page
68042    6940405        0.6175  62.9829    ip_finish_output
67618    7008023        0.6136  63.5965    page_address
64894    7072917        0.5889  64.1854    tcp_cleanup_rbuf


> 
> ---
> CHANGES 
>   - optimize for UP
>   - disable bottom half in info_rdlock
>   - prevent preempt count overflow
>   - turn off lockdep in writer to avoid bogus warning
>   - optimize unlock_bh
> 
>


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
@ 2009-04-21  4:59                                                                               ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-21  4:59 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	torvalds, jeff.chua.linux, mingo, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested. It is sort of like existing kernel_lock,
> rwlock_t and even old 2.4 brlock.
> 
> "Reader" is ip/arp/ip6 tables rule processing which runs per-cpu.
> It needs to ensure that the rules are not being changed while packet
> is being processed.
> 
> "Writer" is used in two cases: first is replacing rules in which case
> all packets in flight have to be processed before rules are swapped,
> then counters are read from the old (stale) info. Second case is where
> counters need to be read on the fly, in this case all CPU's are blocked
> from further rule processing until values are aggregated.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

I reviewed this patch and believe it's in quite good shape, thanks Stephen.

Then I tested it on an x86_32 8-cpu machine and got no obvious problems.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Hopefully, the next rcu_bh (or whatever name is used) will permit us
to switch back to pure RCU in 2.6.31
 
oprofile snapshot of a tbench session, with light iptables rules.
(4 rules in INPUT chain, 3 rules on OUTPUT)

xt_info_rdlock_bh() uses 0.6786 % of cpu
xt_info_rdunlock_bh() uses 0.1743 % of cpu


CPU: Core 2, speed 3000.77 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples  cum. samples  %        cum. %     symbol name
1248350  1248350       11.3285  11.3285    copy_from_user
534049   1782399        4.8464  16.1749    copy_to_user
480898   2263297        4.3641  20.5390    __schedule
325581   2588878        2.9546  23.4936    ipt_do_table
312697   2901575        2.8377  26.3312    tcp_ack
309381   3210956        2.8076  29.1388    tcp_sendmsg
248238   3459194        2.2527  31.3915    tcp_v4_rcv
230405   3689599        2.0909  33.4824    tcp_transmit_skb
220638   3910237        2.0022  35.4847    ip_queue_xmit
217099   4127336        1.9701  37.4548    tcp_recvmsg
175885   4303221        1.5961  39.0509    tcp_rcv_established
173112   4476333        1.5710  40.6219    __switch_to
165138   4641471        1.4986  42.1205    sysenter_past_esp
149367   4790838        1.3555  43.4759    dst_release
138619   4929457        1.2579  44.7339    sched_clock_cpu
132724   5062181        1.2044  45.9383    lock_sock_nested
121353   5183534        1.1013  47.0396    nf_iterate
119205   5302739        1.0818  48.1214    netif_receive_skb
118859   5421598        1.0786  49.2000    release_sock
112597   5534195        1.0218  50.2218    __inet_lookup_established
112195   5646390        1.0181  51.2399    sys_socketcall
110018   5756408        0.9984  52.2383    tcp_write_xmit
106466   5862874        0.9662  53.2045    __alloc_skb
93386    5956260        0.8475  54.0519    dev_queue_xmit
89229    6045489        0.8097  54.8617    tcp_event_data_recv
85972    6131461        0.7802  55.6418    local_bh_enable
82882    6214343        0.7521  56.3940    skb_release_data
80898    6295241        0.7341  57.1281    ip_rcv
76380    6371621        0.6931  57.8213    skb_copy_datagram_iovec
74782    6446403        0.6786  58.4999    xt_info_rdlock_bh
73593    6519996        0.6678  59.1677    mod_timer
72884    6592880        0.6614  59.8291    sock_recvmsg
71789    6664669        0.6515  60.4806    __copy_skb_header
70560    6735229        0.6403  61.1209    fget_light
68756    6803985        0.6239  61.7449    get_page_from_freelist
68378    6872363        0.6205  62.3654    put_page
68042    6940405        0.6175  62.9829    ip_finish_output
67618    7008023        0.6136  63.5965    page_address
64894    7072917        0.5889  64.1854    tcp_cleanup_rbuf


> 
> ---
> CHANGES 
>   - optimize for UP
>   - disable bottom half in info_rdlock
>   - prevent preempt count overflow
>   - turn off lockdep in writer to avoid bogus warning
>   - optimize unlock_bh
> 
>

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  3:56                                                                               ` Eric Dumazet
  2009-04-21  4:15                                                                                 ` Stephen Hemminger
@ 2009-04-21  5:22                                                                                 ` Lai Jiangshan
  2009-04-21  5:45                                                                                     ` Stephen Hemminger
  2009-04-21  5:34                                                                                   ` Lai Jiangshan
  2 siblings, 1 reply; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  5:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Eric Dumazet wrote:
> Lai Jiangshan wrote:
>> Stephen Hemminger wrote:
>>> +/**
>>> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
>>> + *
>>> + * Table processing calls this to hold off any changes to table
>>> + * (on current CPU). Always leaves with bottom half disabled.
>>> + * If called recursively, then assumes bh/preempt already disabled.
>>> + */
>>> +void xt_info_rdlock_bh(void)
>>> +{
>>> +	struct xt_info_lock *lock;
>>> +
>>> +	preempt_disable();
>>> +	lock = &__get_cpu_var(xt_info_locks);
>>> +	if (likely(++lock->depth == 0))
>> Maybe I missed something. I think softirq may be still enabled here.
>> So what happen when xt_info_rdlock_bh() called recursively here?
> 
> well, first time its called, you are right softirqs are enabled until
> the point we call spin_lock_bh(), right after this line :

xt_info_rdlock_bh() called recursively here will enter the
critical region without &__get_cpu_var(xt_info_locks)->lock.

Because xt_info_rdlock_bh() called recursively here sees
lock->depth >= 0, and "++lock->depth == 0" is false.

> 
> 
>>> +		spin_lock_bh(&lock->lock);
>>> +	preempt_enable_no_resched();
> 
> After this line, both softirqs and preempt are disabled.
> 
> Future calls to this function temporarily raise the preempt count and decrease it.
> (Null effect)
> 
>>> +}
>>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
>>> +
>> Is this OK for you:
>>
>> void xt_info_rdlock_bh(void)
>> {
>> 	struct xt_info_lock *lock;
>>
>> 	local_bh_disable();
> 
> well, Stephen was trying to not change preempt count for the 2nd, 3rd, 4th?... invocation of this function.
> This is how I understood the code.
> 
>> 	lock = &__get_cpu_var(xt_info_locks);
>> 	if (likely(++lock->depth == 0))
>> 		spin_lock(&lock->lock);
>> }
>>

Sorry for it.
Is this OK:

void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	local_bh_disable();
	lock = &__get_cpu_var(xt_info_locks);
	if (likely(++lock->depth == 0))
		spin_lock(&lock->lock);
	else
		local_bh_enable();
}

I did not think things carefully enough, and I do know
nothing about ip/ip6/arp.

Lai




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  3:56                                                                               ` Eric Dumazet
@ 2009-04-21  5:34                                                                                   ` Lai Jiangshan
  2009-04-21  5:22                                                                                 ` Lai Jiangshan
  2009-04-21  5:34                                                                                   ` Lai Jiangshan
  2 siblings, 0 replies; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  5:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Eric Dumazet wrote:
> Lai Jiangshan wrote:
>> Stephen Hemminger wrote:
>>> +/**
>>> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
>>> + *
>>> + * Table processing calls this to hold off any changes to table
>>> + * (on current CPU). Always leaves with bottom half disabled.
>>> + * If called recursively, then assumes bh/preempt already disabled.
>>> + */
>>> +void xt_info_rdlock_bh(void)
>>> +{
>>> +	struct xt_info_lock *lock;
>>> +
>>> +	preempt_disable();
>>> +	lock = &__get_cpu_var(xt_info_locks);
>>> +	if (likely(++lock->depth == 0))
>> Maybe I missed something. I think softirq may be still enabled here.
>> So what happen when xt_info_rdlock_bh() called recursively here?
> 
> well, first time its called, you are right softirqs are enabled until
> the point we call spin_lock_bh(), right after this line :
> 
> 

Which contexts can enter the critical region?
Can irq and softirq, or softirq only?


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
@ 2009-04-21  5:34                                                                                   ` Lai Jiangshan
  0 siblings, 0 replies; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  5:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Eric Dumazet wrote:
> Lai Jiangshan wrote:
>> Stephen Hemminger wrote:
>>> +/**
>>> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
>>> + *
>>> + * Table processing calls this to hold off any changes to table
>>> + * (on current CPU). Always leaves with bottom half disabled.
>>> + * If called recursively, then assumes bh/preempt already disabled.
>>> + */
>>> +void xt_info_rdlock_bh(void)
>>> +{
>>> +	struct xt_info_lock *lock;
>>> +
>>> +	preempt_disable();
>>> +	lock = &__get_cpu_var(xt_info_locks);
>>> +	if (likely(++lock->depth == 0))
>> Maybe I missed something. I think softirq may be still enabled here.
>> So what happen when xt_info_rdlock_bh() called recursively here?
> 
> well, first time its called, you are right softirqs are enabled until
> the point we call spin_lock_bh(), right after this line :
> 
> 

Which contexts can enter the critical region?
Can irq and softirq, or softirq only?

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  5:22                                                                                 ` Lai Jiangshan
@ 2009-04-21  5:45                                                                                     ` Stephen Hemminger
  0 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21  5:45 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Eric Dumazet, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, 21 Apr 2009 13:22:27 +0800
Lai Jiangshan <laijs@cn.fujitsu.com> wrote:

> Eric Dumazet wrote:
> > Lai Jiangshan wrote:
> >> Stephen Hemminger wrote:
> >>> +/**
> >>> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> >>> + *
> >>> + * Table processing calls this to hold off any changes to table
> >>> + * (on current CPU). Always leaves with bottom half disabled.
> >>> + * If called recursively, then assumes bh/preempt already disabled.
> >>> + */
> >>> +void xt_info_rdlock_bh(void)
> >>> +{
> >>> +	struct xt_info_lock *lock;
> >>> +
> >>> +	preempt_disable();
> >>> +	lock = &__get_cpu_var(xt_info_locks);
> >>> +	if (likely(++lock->depth == 0))
> >> Maybe I missed something. I think softirq may be still enabled here.
> >> So what happen when xt_info_rdlock_bh() called recursively here?
> > 
> > well, first time its called, you are right softirqs are enabled until
> > the point we call spin_lock_bh(), right after this line :
> 
> xt_info_rdlock_bh() called recursively here will enter the
> critical region without &__get_cpu_var(xt_info_locks)->lock.

NO, spin_lock_bh() always does a preempt_disable.

  xt_info_rdlock_bh            (depth = -1)
+1         preempt_disable
           spin_lock_bh
+1             preempt_disable
-1         preempt_enable_no_resched
---
+1

Second call preempt_count=1   (depth = 0)
       xt_info_rdlock_bh
+1         preempt_disable
-1         preempt_enable_no_resched
---

Result is preempt_count=1 (depth = 1)


Now lets do unlocks
       xt_info_rdunlock_bh  preempt_count=1 depth=1
          does nothing
   xt_info_rdunlock_bh      preempt_count=1 depth = 0
-1      spin_unlock_bh

Resulting preempt_count=0  depth = -1

Same as starting point.
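
Written as balanced calls on one CPU (same accounting as the trace above;
illustration only):

	xt_info_rdlock_bh();	/* depth -1 -> 0: takes lock->lock, net preempt_count +1 */
	xt_info_rdlock_bh();	/* depth  0 -> 1: nested, preempt_count unchanged */
	xt_info_rdunlock_bh();	/* depth  1 -> 0: lock->lock still held */
	xt_info_rdunlock_bh();	/* depth  0 -> -1: spin_unlock_bh(), back to the start */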

> Because xt_info_rdlock_bh() called recursively here sees
> lock->depth >= 0, and "++lock->depth == 0" is false.
> 
> > 
> > 
> >>> +		spin_lock_bh(&lock->lock);
> >>> +	preempt_enable_no_resched();
> > 
> > After this line, both softirqs and preempt are disabled.

No. spin_lock_bh on first pass does this.

> > Future calls to this function temporarily raise the preempt count and decrease it.
> > (Null effect)
> > 
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> >>> +
> >> Is this OK for you:
> >>
> >> void xt_info_rdlock_bh(void)
> >> {
> >> 	struct xt_info_lock *lock;
> >>
> >> 	local_bh_disable();
> > 
> > well, Stephen was trying to not change preempt count for the 2nd, 3rd, 4th?... invocation of this function.
> > This is how I understood the code.
> > 
> >> 	lock = &__get_cpu_var(xt_info_locks);
> >> 	if (likely(++lock->depth == 0))
> >> 		spin_lock(&lock->lock);
> >> }
> >>
> 
> Sorry for it.
> Is this OK:
> 
> void xt_info_rdlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	local_bh_disable();
> 	lock = &__get_cpu_var(xt_info_locks);
> 	if (likely(++lock->depth == 0))
> 		spin_lock(&lock->lock);
> 	else
> 		local_bh_enable();
> }

Unnecessary.

> I did not think things carefully enough, and I do know
> nothing about ip/ip6/arp.
> 
> Lai

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
@ 2009-04-21  5:45                                                                                     ` Stephen Hemminger
  0 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21  5:45 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Eric Dumazet, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, 21 Apr 2009 13:22:27 +0800
Lai Jiangshan <laijs@cn.fujitsu.com> wrote:

> Eric Dumazet wrote:
> > Lai Jiangshan wrote:
> >> Stephen Hemminger wrote:
> >>> +/**
> >>> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> >>> + *
> >>> + * Table processing calls this to hold off any changes to table
> >>> + * (on current CPU). Always leaves with bottom half disabled.
> >>> + * If called recursively, then assumes bh/preempt already disabled.
> >>> + */
> >>> +void xt_info_rdlock_bh(void)
> >>> +{
> >>> +	struct xt_info_lock *lock;
> >>> +
> >>> +	preempt_disable();
> >>> +	lock = &__get_cpu_var(xt_info_locks);
> >>> +	if (likely(++lock->depth == 0))
> >> Maybe I missed something. I think softirq may be still enabled here.
> >> So what happen when xt_info_rdlock_bh() called recursively here?
> > 
> > well, first time its called, you are right softirqs are enabled until
> > the point we call spin_lock_bh(), right after this line :
> 
> xt_info_rdlock_bh() called recursively here will enter the
> critical region without &__get_cpu_var(xt_info_locks)->lock.

NO, spin_lock_bh() always does a preempt_disable.

  xt_info_rdlock_bh            (depth = -1)
+1         preempt_disable
           spin_lock_bh
+1             preempt_disable
-1         preempt_enable_no_resched
---
+1

Second call preempt_count=1   (depth = 0)
       xt_info_rdlock_bh
+1         preempt_disable
-1         preempt_enable_no_resched
---

Result is preempt_count=1 (depth = 1)


Now lets do unlocks
       xt_info_rdunlock_bh  preempt_count=1 depth=1
          does nothing
   xt_info_rdunlock_bh      preempt_count=1 depth = 0
-1      spin_unlock_bh

Resulting preempt_count=0  depth = -1

Same as starting point.
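
The matching xt_info_rdunlock_bh() is not quoted in this excerpt; a minimal
sketch of the shape the trace above assumes (decrement the per-cpu depth,
drop the spinlock and the bottom-half disable only on the outermost exit):

void xt_info_rdunlock_bh(void)
{
	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);

	if (likely(--lock->depth < 0))		/* outermost unlock on this cpu */
		spin_unlock_bh(&lock->lock);	/* re-enables bottom halves */
}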

> Because xt_info_rdlock_bh() called recursively here sees
> lock->depth >= 0, and "++lock->depth == 0" is false.
> 
> > 
> > 
> >>> +		spin_lock_bh(&lock->lock);
> >>> +	preempt_enable_no_resched();
> > 
> > After this line, both softirqs and preempt are disabled.

No. spin_lock_bh on first pass does this.

> > Future calls to this function temporarly raise preemptcount and decrease it.
> > (Null effect)
> > 
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> >>> +
> >> Is this OK for you:
> >>
> >> void xt_info_rdlock_bh(void)
> >> {
> >> 	struct xt_info_lock *lock;
> >>
> >> 	local_bh_disable();
> > 
> > well, Stephen was trying to not change preempt count for the 2nd, 3rd, 4th?... invocation of this function.
> > This is how I understood the code.
> > 
> >> 	lock = &__get_cpu_var(xt_info_locks);
> >> 	if (likely(++lock->depth == 0))
> >> 		spin_lock(&lock->lock);
> >> }
> >>
> 
> Sorry for it.
> Is this OK:
> 
> void xt_info_rdlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	local_bh_disable();
> 	lock = &__get_cpu_var(xt_info_locks);
> 	if (likely(++lock->depth == 0))
> 		spin_lock(&lock->lock);
> 	else
> 		local_bh_enable();
> }

Unnecessary.

> I did not think things carefully enough, and I do know
> nothing about ip/ip6/arp.
> 
> Lai
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
  2009-04-21  3:41                                                                             ` Lai Jiangshan
  2009-04-21  4:59                                                                               ` Eric Dumazet
@ 2009-04-21  5:46                                                                             ` Lai Jiangshan
  2009-04-21 16:13                                                                             ` Linus Torvalds
  3 siblings, 0 replies; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  5:46 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers


My silly opinion about lockdep in this patch:

> +
> +struct xt_info_lock {
> +	spinlock_t 	   lock;

+	struct lock_class_key key;

> +	int   	   	   depth;	/* # readers - 1 */
> +};
> +static DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
> +
> +
> +static inline void xt_info_lock_init(struct xt_info_lock *lock)
> +{
> +	spin_lock_init(&lock->lock);

+	lockdep_set_class(&lock->lock, &lock->key);

> +	lock->depth = -1;
> +}
> +

And remove lockdep_xxx()s.



^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  5:45                                                                                     ` Stephen Hemminger
  (?)
@ 2009-04-21  6:52                                                                                     ` Lai Jiangshan
  2009-04-21  8:16                                                                                       ` Evgeniy Polyakov
  -1 siblings, 1 reply; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  6:52 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Paul Mackerras, paulmck, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Stephen Hemminger wrote:
>> xt_info_rdlock_bh() called recursively here will enter the
>> critical region without &__get_cpu_var(xt_info_locks)->lock.
> 
> NO spin_lock_bh always does a preempt_disable
> 
>   xt_info_rdlock_bh            (depth = -1)
> +1         preempt_disable
>            spin_lock_bh
> +1             preempt_disable
> -1         preempt_enable_no_resched
> ---
> +1
> 
> Second call preempt_count=1   (depth = 0)
>        xt_info_rdlock_bh
> +1         preempt_disable
> -1         preempt_enable_no_resched
> ---
> 

I think my earlier email was unclear and misled you.

The preempt_count accounting is correct, but I was thinking about lock safety:
we must hold &__get_cpu_var(xt_info_locks)->lock
when we enter the read-side critical region.

--------------------
xt_info_rdlock_bh() (depth = -1)
  preempt_disable()
  depth++
  ==========>interrupt here
  ==========>
  ==========>xt_info_rdlock_bh() (depth = 0)
  ==========>  preempt_disable()
  ==========>  depth++
  ==========>  preempt_enable_no_resched()
  ==========>
  ==========>enter the read-side critical region *without* the lock.
  ==========>  it may read trashed data.
  ==========>
  ==========>xt_info_rdunlock_bh()
  ==========>
  ==========>interrupt return.
  spin_lock_bh()
  preempt_enable_no_resched()

enter the read-side critical region *with* lock.

xt_info_rdunlock_bh().
----------------------

So I asked:
> +void xt_info_rdlock_bh(void)
> > +{
> > +	struct xt_info_lock *lock;
> > +
> > +	preempt_disable();
> > +	lock = &__get_cpu_var(xt_info_locks);
> > +	if (likely(++lock->depth == 0))

So what happens when xt_info_rdlock_bh() is called recursively here?

> > +		spin_lock_bh(&lock->lock);
> > +	preempt_enable_no_resched();
> > +}
> > +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> > +

----------
Is this OK? (Now I suppose we can enter the read-side critical region
in irq context)

void xt_info_rdlock_bh(void)
{
	unsigned long flags;
	struct xt_info_lock *lock;

	local_irq_save(flags);
	lock = &__get_cpu_var(xt_info_locks);
	if (likely(++lock->depth == 0))
		spin_lock_bh(&lock->lock);
	local_irq_restore(flags);
}

Lai


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  6:52                                                                                     ` Lai Jiangshan
@ 2009-04-21  8:16                                                                                       ` Evgeniy Polyakov
  2009-04-21  8:42                                                                                         ` Lai Jiangshan
  2009-04-21  8:55                                                                                         ` Eric Dumazet
  0 siblings, 2 replies; 254+ messages in thread
From: Evgeniy Polyakov @ 2009-04-21  8:16 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Stephen Hemminger, Eric Dumazet, Paul Mackerras, paulmck,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Hi.

On Tue, Apr 21, 2009 at 02:52:30PM +0800, Lai Jiangshan (laijs@cn.fujitsu.com) wrote:
> > +void xt_info_rdlock_bh(void)
> > > +{
> > > +	struct xt_info_lock *lock;
> > > +
> > > +	preempt_disable();
> > > +	lock = &__get_cpu_var(xt_info_locks);
> > > +	if (likely(++lock->depth == 0))
> 
> So what happen when xt_info_rdlock_bh() called recursively here?
> 
> > > +		spin_lock_bh(&lock->lock);
> > > +	preempt_enable_no_resched();
> > > +}
> > > +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> > > +
> 
> ----------
> Is this OK? (Now I suppose we can enter the read-side critical region
> in irq context)
> 
> void xt_info_rdlock_bh(void)
> {
> 	unsigned long flags;
> 	struct xt_info_lock *lock;
> 
> 	local_irq_save(flags);
> 	lock = &__get_cpu_var(xt_info_locks);
> 	if (likely(++lock->depth == 0))
> 		spin_lock_bh(&lock->lock);
> 	local_irq_restore(flags);
> }

Netfilter, as well as the other generic network paths, is never accessed
from interrupt context, but your analysis looks right for the softirq
case.

Stephen, should preempt_disable() be replaced with local_bh_disable() to
prevent a softirq from racing on the same cpu for the lock's depth field? Or
can it be made atomic?

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  8:16                                                                                       ` Evgeniy Polyakov
@ 2009-04-21  8:42                                                                                         ` Lai Jiangshan
  2009-04-21  8:49                                                                                           ` David Miller
  2009-04-21  8:55                                                                                         ` Eric Dumazet
  1 sibling, 1 reply; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  8:42 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Stephen Hemminger, Eric Dumazet, Paul Mackerras, paulmck,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Evgeniy Polyakov wrote:
> 
> Netfilter as long as other generic network pathes are never accessed
> from interrupt context, but your analysis looks right for the softirq
> case.
> 

A question:

softirqs never nest, so why do we need a recursive lock?

Lai.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  8:42                                                                                         ` Lai Jiangshan
@ 2009-04-21  8:49                                                                                           ` David Miller
  0 siblings, 0 replies; 254+ messages in thread
From: David Miller @ 2009-04-21  8:49 UTC (permalink / raw)
  To: laijs
  Cc: zbr, shemminger, dada1, paulus, paulmck, kaber, torvalds,
	jeff.chua.linux, mingo, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 16:42:40 +0800

> softirq is always not nesting. Why we need recursive lock?

Netfilter itself is nesting.

When using bridging netfilter, iptables can be entered twice
in the same call chain.
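
To illustrate why that forces a depth count, a standalone userspace model
(illustrative only, not kernel code; the pthread mutex stands in for one
cpu's spinlock, and the recursive call for iptables being entered twice in
one call chain):

#include <assert.h>
#include <pthread.h>

struct xt_info_lock_model {
	pthread_mutex_t lock;
	int depth;			/* # readers - 1, as in the patch */
};

static struct xt_info_lock_model l = {
	.lock	= PTHREAD_MUTEX_INITIALIZER,
	.depth	= -1,
};

static void rdlock(void)
{
	/* Only the outermost entry takes the mutex; taking it again on a
	 * nested entry would deadlock, which is why a plain per-cpu
	 * spinlock is not enough. */
	if (++l.depth == 0)
		pthread_mutex_lock(&l.lock);
}

static void rdunlock(void)
{
	if (--l.depth < 0)		/* outermost exit releases the mutex */
		pthread_mutex_unlock(&l.lock);
}

static void hook(int nested)
{
	rdlock();
	if (nested)			/* e.g. bridge netfilter re-entering iptables */
		hook(0);
	rdunlock();
}

int main(void)
{
	hook(1);			/* nested call completes without deadlock */
	assert(l.depth == -1);
	return 0;
}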

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  8:16                                                                                       ` Evgeniy Polyakov
  2009-04-21  8:42                                                                                         ` Lai Jiangshan
@ 2009-04-21  8:55                                                                                         ` Eric Dumazet
  2009-04-21  9:22                                                                                           ` Evgeniy Polyakov
  2009-04-21  9:34                                                                                             ` Lai Jiangshan
  1 sibling, 2 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-21  8:55 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Lai Jiangshan, Stephen Hemminger, Paul Mackerras, paulmck,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Evgeniy Polyakov wrote:
> Hi.
> 
> On Tue, Apr 21, 2009 at 02:52:30PM +0800, Lai Jiangshan (laijs@cn.fujitsu.com) wrote:
>>> +void xt_info_rdlock_bh(void)
>>>> +{
>>>> +	struct xt_info_lock *lock;
>>>> +
>>>> +	preempt_disable();
>>>> +	lock = &__get_cpu_var(xt_info_locks);
>>>> +	if (likely(++lock->depth == 0))
>> So what happen when xt_info_rdlock_bh() called recursively here?
>>
>>>> +		spin_lock_bh(&lock->lock);
>>>> +	preempt_enable_no_resched();
>>>> +}
>>>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
>>>> +
>> ----------
>> Is this OK? (Now I suppose we can enter the read-side critical region
>> in irq context)
>>
>> void xt_info_rdlock_bh(void)
>> {
>> 	unsigned long flags;
>> 	struct xt_info_lock *lock;
>>
>> 	local_irq_save(flags);
>> 	lock = &__get_cpu_var(xt_info_locks);
>> 	if (likely(++lock->depth == 0))
>> 		spin_lock_bh(&lock->lock);
>> 	local_irq_restore(flags);
>> }
> 
> Netfilter as long as other generic network pathes are never accessed
> from interrupt context, but your analysis looks right for the softirq
> case.
> 
> Stephen, should preempt_disable() be replaced with local_bh_disable() to
> prevent softirq to race on the same cpu for the lock's depth field? Or
> can it be made atomic?
> 


Maybe just don't care about calling local_bh_disable() several times
(we were doing this in previous kernels anyway, since we used to call read_lock_bh()).

This shortens the fast path, is faster than local_irq_save()/local_irq_restore(),
and looks better.

void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	local_bh_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(++lock->depth == 0))
 		spin_lock(&lock->lock);
}

void xt_info_rdunlock_bh(void)
{
	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);

	BUG_ON(lock->depth < 0);
	if (likely(--lock->depth < 0))
		 spin_unlock(&lock->lock);
	local_bh_enable();
}
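
A hypothetical caller, just to show the intended fast-path shape
(example_do_table() is made up for illustration; it mirrors the
ipt_do_table() hunk in the v12 patch posted later in the thread):

static unsigned int example_do_table(struct xt_table *table)
{
	struct xt_table_info *private;
	void *table_base;

	xt_info_rdlock_bh();	/* bh off, this cpu's reader lock held */
	private = table->private;
	table_base = private->entries[smp_processor_id()];
	/* ... walk the rules starting at table_base, updating counters ... */
	xt_info_rdunlock_bh();	/* drops the lock on the outermost exit */

	return NF_ACCEPT;
}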




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  8:55                                                                                         ` Eric Dumazet
@ 2009-04-21  9:22                                                                                           ` Evgeniy Polyakov
  2009-04-21  9:34                                                                                             ` Lai Jiangshan
  1 sibling, 0 replies; 254+ messages in thread
From: Evgeniy Polyakov @ 2009-04-21  9:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Lai Jiangshan, Stephen Hemminger, Paul Mackerras, paulmck,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, Apr 21, 2009 at 10:55:59AM +0200, Eric Dumazet (dada1@cosmosbay.com) wrote:
> Maybe just dont care about calling several time local_bh_disable()
> (since we were doing this in previous kernels anyway, we used to call read_lock_bh())
> 
> This shortens fastpath, is faster than local_irq_save()/local_irq_restore(),
> and looks better.

Yeah, given that non-nested locking is the more likely condition, it will be
even faster than the preemption case.

> void xt_info_rdlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	local_bh_disable();
>  	lock = &__get_cpu_var(xt_info_locks);
>  	if (likely(++lock->depth == 0))
>  		spin_lock(&lock->lock);
> }
> 
> void xt_info_rdunlock_bh(void)
> {
> 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
> 
> 	BUG_ON(lock->depth < 0);
> 	if (likely(--lock->depth < 0))
> 		 spin_unlock(&lock->lock);
> 	local_bh_enable();
> }
> 
> 

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  8:55                                                                                         ` Eric Dumazet
@ 2009-04-21  9:34                                                                                             ` Lai Jiangshan
  2009-04-21  9:34                                                                                             ` Lai Jiangshan
  1 sibling, 0 replies; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  9:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Evgeniy Polyakov, Stephen Hemminger, Paul Mackerras, paulmck,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Eric Dumazet wrote:
> Evgeniy Polyakov a écrit :
>> Hi.
>>
>> On Tue, Apr 21, 2009 at 02:52:30PM +0800, Lai Jiangshan (laijs@cn.fujitsu.com) wrote:
>>>> +void xt_info_rdlock_bh(void)
>>>>> +{
>>>>> +	struct xt_info_lock *lock;
>>>>> +
>>>>> +	preempt_disable();
>>>>> +	lock = &__get_cpu_var(xt_info_locks);
>>>>> +	if (likely(++lock->depth == 0))
>>> So what happen when xt_info_rdlock_bh() called recursively here?
>>>
>>>>> +		spin_lock_bh(&lock->lock);
>>>>> +	preempt_enable_no_resched();
>>>>> +}
>>>>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
>>>>> +
>>> ----------
>>> Is this OK? (Now I suppose we can enter the read-side critical region
>>> in irq context)
>>>
>>> void xt_info_rdlock_bh(void)
>>> {
>>> 	unsigned long flags;
>>> 	struct xt_info_lock *lock;
>>>
>>> 	local_irq_save(flags);
>>> 	lock = &__get_cpu_var(xt_info_locks);
>>> 	if (likely(++lock->depth == 0))
>>> 		spin_lock_bh(&lock->lock);
>>> 	local_irq_restore(flags);
>>> }
>> Netfilter as long as other generic network pathes are never accessed
>> from interrupt context, but your analysis looks right for the softirq
>> case.
>>
>> Stephen, should preempt_disable() be replaced with local_bh_disable() to
>> prevent softirq to race on the same cpu for the lock's depth field? Or
>> can it be made atomic?
>>
> 
> 
> Maybe just dont care about calling several time local_bh_disable()
> (since we were doing this in previous kernels anyway, we used to call read_lock_bh())
> 
> This shortens fastpath, is faster than local_irq_save()/local_irq_restore(),
> and looks better.
> 
> void xt_info_rdlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	local_bh_disable();
>  	lock = &__get_cpu_var(xt_info_locks);
>  	if (likely(++lock->depth == 0))
>  		spin_lock(&lock->lock);
> }

These two functions are OK. But...

> 
> void xt_info_rdunlock_bh(void)
> {
> 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
> 
> 	BUG_ON(lock->depth < 0);
> 	if (likely(--lock->depth < 0))
> 		 spin_unlock(&lock->lock);
> 	local_bh_enable();
> }
> 
> 


David said:
Netfilter itself, is nesting.

When using bridging netfilter, iptables can be entered twice
in the same call chain.

And Stephen said:
In this version, I was trying to use/preserve the optimizations that
are done in spin_unlock_bh().

So:

void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	preempt_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(lock->depth < 0))
 		spin_lock_bh(&lock->lock);
	/* softirq is disabled now */
	++lock->depth;
	preempt_enable_no_resched();
}

xt_info_rdunlock_bh() is the same as v11.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
@ 2009-04-21  9:34                                                                                             ` Lai Jiangshan
  0 siblings, 0 replies; 254+ messages in thread
From: Lai Jiangshan @ 2009-04-21  9:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Evgeniy Polyakov, Stephen Hemminger, Paul Mackerras, paulmck,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Eric Dumazet wrote:
> Evgeniy Polyakov a écrit :
>> Hi.
>>
>> On Tue, Apr 21, 2009 at 02:52:30PM +0800, Lai Jiangshan (laijs@cn.fujitsu.com) wrote:
>>>> +void xt_info_rdlock_bh(void)
>>>>> +{
>>>>> +	struct xt_info_lock *lock;
>>>>> +
>>>>> +	preempt_disable();
>>>>> +	lock = &__get_cpu_var(xt_info_locks);
>>>>> +	if (likely(++lock->depth == 0))
>>> So what happen when xt_info_rdlock_bh() called recursively here?
>>>
>>>>> +		spin_lock_bh(&lock->lock);
>>>>> +	preempt_enable_no_resched();
>>>>> +}
>>>>> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
>>>>> +
>>> ----------
>>> Is this OK? (Now I suppose we can enter the read-side critical region
>>> in irq context)
>>>
>>> void xt_info_rdlock_bh(void)
>>> {
>>> 	unsigned long flags;
>>> 	struct xt_info_lock *lock;
>>>
>>> 	local_irq_save(flags);
>>> 	lock = &__get_cpu_var(xt_info_locks);
>>> 	if (likely(++lock->depth == 0))
>>> 		spin_lock_bh(&lock->lock);
>>> 	local_irq_restore(flags);
>>> }
>> Netfilter as long as other generic network pathes are never accessed
>> from interrupt context, but your analysis looks right for the softirq
>> case.
>>
>> Stephen, should preempt_disable() be replaced with local_bh_disable() to
>> prevent softirq to race on the same cpu for the lock's depth field? Or
>> can it be made atomic?
>>
> 
> 
> Maybe just dont care about calling several time local_bh_disable()
> (since we were doing this in previous kernels anyway, we used to call read_lock_bh())
> 
> This shortens fastpath, is faster than local_irq_save()/local_irq_restore(),
> and looks better.
> 
> void xt_info_rdlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	local_bh_disable();
>  	lock = &__get_cpu_var(xt_info_locks);
>  	if (likely(++lock->depth == 0))
>  		spin_lock(&lock->lock);
> }

These two functions are OK. But...

> 
> void xt_info_rdunlock_bh(void)
> {
> 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
> 
> 	BUG_ON(lock->depth < 0);
> 	if (likely(--lock->depth < 0))
> 		 spin_unlock(&lock->lock);
> 	local_bh_enable();
> }
> 
> 


David said:
Netfilter itself, is nesting.

When using bridging netfilter, iptables can be entered twice
in the same call chain.

And Stephen said:
In this version, I was trying to use/preserve the optimizations that
are done in spin_unlock_bh().

So:

void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	preempt_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(lock->depth < 0))
 		spin_lock_bh(&lock->lock);
	/* softirq is disabled now */
	++lock->depth;
	preempt_enable_no_resched();
}

xt_info_rdunlock_bh() is the same as v11.

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
                                                                                               ` (2 preceding siblings ...)
  2009-04-21  5:46                                                                             ` Lai Jiangshan
@ 2009-04-21 16:13                                                                             ` Linus Torvalds
  2009-04-21 16:43                                                                               ` Stephen Hemminger
                                                                                                 ` (3 more replies)
  3 siblings, 4 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-21 16:13 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, jeff.chua.linux, mingo, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers


Ok, so others already pointed out how dangerous/buggy this looks, but I'd 
like to strengthen that a bit more:

On Mon, 20 Apr 2009, Stephen Hemminger wrote:
> +
> +/**
> + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> + *
> + * Table processing calls this to hold off any changes to table
> + * (on current CPU). Always leaves with bottom half disabled.
> + * If called recursively, then assumes bh/preempt already disabled.
> + */
> +void xt_info_rdlock_bh(void)
> +{
> +	struct xt_info_lock *lock;
> +
> +	preempt_disable();
> +	lock = &__get_cpu_var(xt_info_locks);
> +	if (likely(++lock->depth == 0))
> +		spin_lock_bh(&lock->lock);
> +	preempt_enable_no_resched();
> +}
> +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);

This function is FUCKED UP.

It's total crap for several reasons:

 - the already-mentioned race with bottom half locking.

   If bottom halfs aren't already disabled, then if a bottom half comes in 
   after the "++lock->depth" and before the spin_lock_bh(), then you will 
   have no locking AT ALL for the processing of that bottom half - it will 
   just increment the lock depth again, and nobody will have locked 
   anything at all.

   And if for some reason, you can prove that bottom half processing is 
   already disabled, then ALL THAT OTHER CRAP is just that - CRAP. The 
   whole preemption disabling, the whole "_bh()" thing, everything.

   So either it's horribly buggy, or it's horribly broken and pointless. 
   Take your pick.

 - the total lack of comments. Why does that "count" protect anything? 
   It's not a recursive lock, since there is no ownership (two 
   independent accessors could call this and both "get" the lock), so you 
   had damn well better create some big-ass comments about why it's ok in 
   this case, and what the rules are that make it ok.

   So DON'T GO AROUND CALLING IT A RECURSIVE LOCK! Don't write comments 
   that are TOTAL AND UTTER SH*T! Just DON'T!

   It's a "reader lock". It's not "recursive".  It never was recursive, it 
   never will be, and calling it that is just a sign that whoever wrote 
   the function is a moron and doesn't know what he is doing. STOP DOING THIS!

 - that _idiotic_ "preempt_enable_no_resched()". F*ck me, but the comment 
   already says that preemption is disabled when exiting, so why does it 
   even bother to enable it? Why play those mindless games with preemption 
   counts, knowing that they are bogus?

   Do it readably. Disable preemption first, and just re-enable it at 
   UNLOCK time. No odd pseudo-reenables anywhere.

Oh, I know very well that the spin-locking will disable preemption, so 
it's "correct" to play those games, but the point is, it's just damn 
stupid and annoying. It just makes the source code actively _misleading_ 
to see crap like that - it looks like you enabled preemption, when in fact 
the code very much on purpose does _not_ enable preemption at all. 

In other words, I really REALLY hate that patch. I think it looks slightly 
better than Eric Dumazet's original patch (at least the locking subtlety 
is now in a function of its own and _could_ be commented upon sanely and 
if it wasn't broken it might be acceptable), but quite frankly, I'm still 
horribly disgusted with the crap.

Why are you doing this insane thing? If you want a read-lock, just use the 
damned read-write locks already! As far as I can tell, this lock is in 
_no_ way better than just using those counting reader-writer locks, except 
it is open-coded and looks buggy.

There is basically _never_ a good reason to re-implement locking 
primitives: you'll just introduce bugs. As proven very ably by the amount 
of crap above in what is supposed to be a very simple function.

If you want a counting read-lock (in _order_ to allow recursion, but not 
because the lock itself is somehow recursive!), then that function should 
look like this:

	void xt_info_rdlock_bh(void)
	{
		struct xt_info_lock *lock;

		local_bh_disable();
		lock = &__get_cpu_var(xt_info_locks);
		read_lock(&lock->lock);
	}

And then the "unlock" should be the reverse. No games, no crap, and 
hopefully then no bugs. And if you do it that way, you don't even need the 
comments, although quite frankly, it probably makes a lot of sense to talk 
about the interaction between "local_bh_disable()" and the preempt count, 
and why you're not using "read_lock_bh()".
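
(Spelled out, the "reverse" above would look roughly like this, as a sketch
under the same assumptions:)

	void xt_info_rdunlock_bh(void)
	{
		struct xt_info_lock *lock;

		lock = &__get_cpu_var(xt_info_locks);
		read_unlock(&lock->lock);
		local_bh_enable();
	}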

And if you don't want a read-lock, then fine - don't use a read-lock, do 
something else. But then don't just re-implement it (badly) either and 
call it something else!

			Linus

PS: Ingo, why do the *_bh() functions in kernel/spinlock.c do _both_ a 
"local_bh_disable()" and a "preempt_disable()"? BH disable should disable 
preemption too, no? Or am I confused? In which case we need that in 
the above rdlock_bh too.
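
(For reference, the 2.6.29/2.6.30 _bh lock path in kernel/spinlock.c does
both, roughly like this -- paraphrased, not a verbatim quote:)

void __lockfunc _spin_lock_bh(spinlock_t *lock)
{
	local_bh_disable();
	preempt_disable();
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}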

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21  4:59                                                                               ` Eric Dumazet
  (?)
@ 2009-04-21 16:37                                                                               ` Paul E. McKenney
  -1 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-21 16:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Paul Mackerras, Evgeniy Polyakov,
	David Miller, kaber, torvalds, jeff.chua.linux, mingo, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, Apr 21, 2009 at 06:59:29AM +0200, Eric Dumazet wrote:

[ . . . ]

> Hopefully, next rcu_bh (or whatever name is used) will permit us
> to switch back to pure RCU in 2.6.31

My excursion in to rcu_fgp can be thought of as a training exercise for
doing this, though perhaps others are also working on an upgraded rcu_bh
as well.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 16:13                                                                             ` Linus Torvalds
@ 2009-04-21 16:43                                                                               ` Stephen Hemminger
  2009-04-21 16:50                                                                                 ` Linus Torvalds
  2009-04-21 18:02                                                                               ` Ingo Molnar
                                                                                                 ` (2 subsequent siblings)
  3 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21 16:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, jeff.chua.linux, mingo, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, 21 Apr 2009 09:13:52 -0700 (PDT)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> Ok, so others already pointed out how dangerous/buggy this looks, but I'd 
> like to strengthen that a bit more:
> 
> On Mon, 20 Apr 2009, Stephen Hemminger wrote:
> > +
> > +/**
> > + * xt_table_info_rdlock_bh - recursive read lock for xt table info
> > + *
> > + * Table processing calls this to hold off any changes to table
> > + * (on current CPU). Always leaves with bottom half disabled.
> > + * If called recursively, then assumes bh/preempt already disabled.
> > + */
> > +void xt_info_rdlock_bh(void)
> > +{
> > +	struct xt_info_lock *lock;
> > +
> > +	preempt_disable();
> > +	lock = &__get_cpu_var(xt_info_locks);
> > +	if (likely(++lock->depth == 0))
> > +		spin_lock_bh(&lock->lock);
> > +	preempt_enable_no_resched();
> > +}
> > +EXPORT_SYMBOL_GPL(xt_info_rdlock_bh);
> 
> This function is FUCKED UP.
> 
> It's total crap for several reasons"
> 
>  - the already-mentioned race with bottom half locking.
> 
>    If bottom halfs aren't already disabled, then if a bottom half comes in 
>    after the "++lock->depth" and before the spin_lock_bh(), then you will 
>    have no locking AT ALL for the processing of that bottom half - it will 
>    just increment the lock depth again, and nobody will have locked 
>    anything at all.
> 
>    And if for some reason, you can prove that bottom half processing is 
>    already disabled, then ALL THAT OTHER CRAP is just that - CRAP. The 
>    whole preemption disabling, the whole "_bh()" thing, everything.
> 
>    So either it's horribly buggy, or it's horribly broken and pointless. 
>    Take your pick.
> 
>  - the total lack of comments. Why does that "count" protect anything? 
>    It's not a recursive lock, since there is no ownership (two 
>    independent accessors could call this and both "get" the lock), so you 
>    had damn well better create some big-ass comments about why it's ok in 
>    this case, and what the rules are that make it ok.
> 
>    So DON'T GO AROUND CALLING IT A RECURSIVE LOCK! Don't write comments 
>    that are TOTAL AND UTTER SH*T! Just DON'T!
> 
>    It's a "reader lock". It's not "recursive".  It never was recursive, it 
>    never will be, and calling it that is just a sign that whoever wrote 
>    the function is a moron and doesn't know what he is doing. STOP DOING THIS!
> 
>  - that _idiotic_ "preempt_enable_no_resched()". F*ck me, but the comment 
>    already says that preemption is disabled when exiting, so why does it 
>    even bother to enable it? Why play those mindless games with preemption 
>    counts, knowing that they are bogus?
> 
>    Do it readably. Disable preemption first, and just re-enable it at 
>    UNLOCK time. No odd pseudo-reenables anywhere.
> 
> Oh, I know very well that the spli-locking will disable preemption, so 
> it's "correct" to play those games, but the point is, it's just damn 
> stupid and annoying. It just makes the source code actively _misleading_ 
> to see crap like that - it looks like you enabled preemption, when in fact 
> the code very much on purpose does _not_ enable preemption at all. 
> 
> In other words, I really REALLY hate that patch. I think it looks slightly 
> better than Eric Dumazet's original patch (at least the locking subtlety 
> is now in a function of its own and _could_ be commented upon sanely and 
> if it wasn't broken it might be acceptable), but quite frankly, I'd still 
> horribly disgusted with the crap.
> 
> Why are you doing this insane thing? If you want a read-lock, just use the 
> damned read-write locks already! Ad far as I can tell, this lock is in 
> _no_ way better than just using those counting reader-writer locks, except 
> it is open-coded and looks buggy.
> 
> There is basically _never_ a good reason to re-implement locking 
> primitives: you'll just introduce bugs. As proven very ably by the amount 
> of crap above in what is supposed to be a very simple function.
> 
> If you want a counting read-lock (in _order_ to allow recursion, but not 
> because the lock itself is somehow recursive!), then that function should 
> look like this:
> 
> 	void xt_info_rdlock_bh(void)
> 	{
> 		struct xt_info_lock *lock
> 
> 		local_bh_disable();
> 		lock = &__get_cpu_var(xt_info_locks);
> 		read_lock(&lock->lock);
> 	}
> 
> And then the "unlock" should be the reverse. No games, no crap, and 
> hopefully then no bugs. And if you do it that way, you don't even need the 
> comments, although quite frankly, it probably makes a lot of sense to talk 
> about the interaction between "local_bh_disable()" and the preempt count, 
> and why you're not using "read_lock_bh()".
> 
> And if you don't want a read-lock, then fine - don't use a read-lock, do 
> something else. But then don't just re-implement it (badly) either and 
> call it something else!
> 
> 			Linus
> 
> PS: Ingo, why do the *_bh() functions in kernel/spinlock.c do _both_ a 
> "local_bh_disable()" and a "preempt_disable()"? BH disable should disable 
> preemption too, no? Or am I confused? In which case we need that in 
> the above rdlock_bh too.

Ah a nice day, with Linus giving constructive feedback. Too bad he has
to channel it out of the dark side.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 16:43                                                                               ` Stephen Hemminger
@ 2009-04-21 16:50                                                                                 ` Linus Torvalds
  0 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-21 16:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, jeff.chua.linux, mingo, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers



On Tue, 21 Apr 2009, Stephen Hemminger wrote:
> 
> Ah a nice day, with Linus giving constructive feedback. Too bad he has
> to channel it out of the dark side.

I already flamed that patch at least once before. People didn't react. 

What do I have to do to make people listen? I'm sorry, but I'm not going 
to send you flowers with a card saying "Hope you do better next time!".

I realize that the flowers might be friendlier, but I have absolutely no 
incentive to be friendly to just bad code. I have even _less_ incentive 
when my first "that sucks" is apparently totally ignored. So now I spelled 
it out why it sucks, but I sure as hell didn't have any reason to be 
polite about it.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 16:13                                                                             ` Linus Torvalds
  2009-04-21 16:43                                                                               ` Stephen Hemminger
@ 2009-04-21 18:02                                                                               ` Ingo Molnar
  2009-04-21 18:15                                                                               ` Stephen Hemminger
  2009-04-21 18:34                                                                               ` [PATCH] netfilter: use per-cpu recursive lock (v11) Paul E. McKenney
  3 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-21 18:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Paul Mackerras, paulmck, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> PS: Ingo, why do the *_bh() functions in kernel/spinlock.c do 
> _both_ a "local_bh_disable()" and a "preempt_disable()"? BH 
> disable should disable preemption too, no? Or am I confused? In 
> which case we need that in the above rdlock_bh too.

i think there might be (are?) uses of:

	spin_lock_bh(&some->lock);
	...
	spin_unlock(&some->lock);
	...
	local_bh_enable();

So we have to have two preemption control levels for that, as 
there's no knowledge at the spin_lock_bh() place whether it will be 
followed by a spin_unlock_bh() [in which case it would be safe to do 
SOFTIRQ_OFFSET only] - or by a spin_unlock() + local_bh_enable() 
pair..

[ That locking pattrn even makes a certain amount of sense: keep the 
  lock held for a short amount of time - then weaken locking to bh 
  context exclusion only. ]

What we could do is an optimization to do a compound increase the 
preempt count by SOFTIRQ_OFFSET+1 - instead of a local_bh_disable() 
+ preempt_disable()? Symmetrically we could do a compound decrease 
in the unlock case.

It might even be called: local_bh_preempt_disable() or so?
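
A possible shape for that helper (purely hypothetical -- no such function
exists; sketched against the 2.6.30-era add_preempt_count()/SOFTIRQ_OFFSET
definitions):

/* hypothetical: one compound bump instead of
 * local_bh_disable() + preempt_disable() */
static inline void local_bh_preempt_disable(void)
{
	add_preempt_count(SOFTIRQ_OFFSET + 1);
	barrier();
}

/* The enable side cannot just subtract: like local_bh_enable(), it would
 * also have to run any pending softirqs once softirq_count() drops, and
 * then do the preempt_check_resched() step. */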

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 16:13                                                                             ` Linus Torvalds
  2009-04-21 16:43                                                                               ` Stephen Hemminger
  2009-04-21 18:02                                                                               ` Ingo Molnar
@ 2009-04-21 18:15                                                                               ` Stephen Hemminger
  2009-04-21 19:10                                                                                 ` Ingo Molnar
  2009-04-21 19:39                                                                                 ` Ingo Molnar
  2009-04-21 18:34                                                                               ` [PATCH] netfilter: use per-cpu recursive lock (v11) Paul E. McKenney
  3 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21 18:15 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, jeff.chua.linux, mingo, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Subject: netfilter: use per-cpu recursive lock (v12)

This version of x_tables (ip/ip6/arp) locking uses a per-cpu
recursive lock that can be nested.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu; the fast path locks on the current cpu
and updates counters.  This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2). 

The mutex that was added for 2.6.30 in xt_table is unnecessary since
xt[af].mutex is already held.

I don't like the NR_CPUS preempt count kludge; it is working around an
issue that should be handled in a more generic way.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
CHANGES 
  - go back to read/write lock
  - reader case is now inline
  - more comments
  - always disables bottom half
  - only play preempt count mindgames if really necessary
  - add lockdep keys

 include/linux/netfilter/x_tables.h |   42 ++++++++++++--
 net/ipv4/netfilter/arp_tables.c    |  110 ++++++++-----------------------------
 net/ipv4/netfilter/ip_tables.c     |  110 +++++++------------------------------
 net/ipv6/netfilter/ip6_tables.c    |  108 ++++++++----------------------------
 net/netfilter/x_tables.c           |   77 ++++++++++++++++++-------
 5 files changed, 165 insertions(+), 282 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-21 07:57:06.668582345 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-21 11:06:10.381487605 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,43 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+
+/*
+ * Per-CPU read/write lock. This makes reader locking fast since
+ * there is no shared variable to cause cache ping-pong; but adds an
+ * additional write-side penalty since write must iterate over all
+ * possible CPU's. Readers read lock their per-cpu lock, and writers
+ * write lock on all CPU's.
+ *
+ * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while packet
+ * is being processed.
+ *
+ * Write lock is used in two cases:
+ *    1. reading counter values
+ *       all readers need to be stopped and the per-CPU values are summed.
+ *
+ *    2. replacing rules
+ *       all packets in flight have to be processed before rules are swapped,
+ *       then counters are read from the old (stale) info.
+ *
+ */
+DECLARE_PER_CPU(rwlock_t, xt_info_locks);
+
+static inline void xt_info_rdlock_bh(void)
+{
+	local_bh_disable();
+	read_lock(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+	read_unlock_bh(&__get_cpu_var(xt_info_locks));
+}
+
+extern void xt_info_wrlock_bh(void);
+extern void xt_info_wrunlock_bh(void);
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-21 07:57:06.649549203 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-21 08:13:36.133673244 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -918,60 +916,6 @@ get_counters(const struct xt_table_info 
 				  counters,
 				  &i);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
@@ -979,7 +923,6 @@ static struct xt_counters * alloc_counte
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +931,13 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,6 +1303,18 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1437,25 +1375,23 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-21 07:57:06.605264365 -0700
+++ b/net/netfilter/x_tables.c	2009-04-21 11:08:00.526402956 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,6 +662,46 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(rwlock_t, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+void xt_info_wrlock_bh(void)
+{
+	unsigned int i;
+
+	local_bh_disable();
+	for_each_possible_cpu(i) {
+		write_lock(&per_cpu(xt_info_locks, i));
+#if NR_CPUS > (PREEMPT_MASK - 1)
+		/*
+		 * Since spin_lock disables preempt, the following is
+		 * required to avoid overflowing the preempt counter
+		 */
+		preempt_enable_no_resched();
+#endif
+	}
+}
+EXPORT_SYMBOL_GPL(xt_info_wrlock_bh);
+
+void xt_info_wrunlock_bh(void)
+	__releases(xt_info_lock)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i) {
+#if NR_CPUS > (PREEMPT_MASK - 1)
+		/*
+		 * Spin_unlock calls preempt_enable, but since we had
+		 * to adjust the count in xt_info_wrlock_bh, do it again
+		 */
+		preempt_disable();
+#endif
+		write_unlock(&per_cpu(xt_info_locks, i));
+	}
+	local_bh_enable();
+}
+EXPORT_SYMBOL_GPL(xt_info_wrunlock_bh);
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
@@ -685,22 +711,21 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	xt_info_wrlock_bh();
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		xt_info_wrunlock_bh();
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
+	table->private =  newinfo;
+	newinfo->initial_entries = private->initial_entries;
+	xt_info_wrunlock_bh();
 
-	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -734,7 +759,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1147,7 +1171,16 @@ static struct pernet_operations xt_net_o
 
 static int __init xt_init(void)
 {
-	int i, rv;
+	unsigned int i;
+	int rv;
+	static struct lock_class_key xt_lock_key[NR_CPUS];
+
+	for_each_possible_cpu(i) {
+		rwlock_t *lock = &per_cpu(xt_info_locks, i);
+
+		rwlock_init(lock);
+		lockdep_set_class(lock, xt_lock_key+i);
+	}
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-21 07:57:06.621260619 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-21 08:13:36.160551552 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -949,64 +949,11 @@ get_counters(const struct xt_table_info 
 	}
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +962,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,6 +1335,19 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1465,25 +1408,24 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries[smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-21 07:57:06.633261308 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-21 08:13:36.163555922 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -734,65 +734,11 @@ static void get_counters(const struct xt
 	}
 }
 
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
-	local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +748,13 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
+		return ERR_PTR(-ENOMEM);
 
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	xt_info_wrlock_bh();
+	get_counters(private, counters);
+	xt_info_wrunlock_bh();
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,6 +1094,19 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1224,14 +1166,13 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	xt_info_wrlock_bh();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1240,10 +1181,9 @@ static int do_add_counters(struct net *n
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
- unlock_up_free:
-	mutex_unlock(&t->lock);
 
+ unlock_up_free:
+	xt_info_wrunlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 16:13                                                                             ` Linus Torvalds
                                                                                                 ` (2 preceding siblings ...)
  2009-04-21 18:15                                                                               ` Stephen Hemminger
@ 2009-04-21 18:34                                                                               ` Paul E. McKenney
  2009-04-21 20:14                                                                                 ` Linus Torvalds
  3 siblings, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-21 18:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Paul Mackerras, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, mingo,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers

On Tue, Apr 21, 2009 at 09:13:52AM -0700, Linus Torvalds wrote:
> 
> Ok, so others already pointed out how dangerous/buggy this looks, but I'd 
> like to strengthen that a bit more:

I believe that at least some of this is naming...

> On Mon, 20 Apr 2009, Stephen Hemminger wrote:

[ . . . ]

> If you want a counting read-lock (in _order_ to allow recursion, but not 
> because the lock itself is somehow recursive!), then that function should 
> look like this:
> 
> 	void xt_info_rdlock_bh(void)
> 	{
> 		struct xt_info_lock *lock
> 
> 		local_bh_disable();
> 		lock = &__get_cpu_var(xt_info_locks);
> 		read_lock(&lock->lock);
> 	}
> 
> And then the "unlock" should be the reverse. No games, no crap, and 
> hopefully then no bugs. And if you do it that way, you don't even need the 
> comments, although quite frankly, it probably makes a lot of sense to talk 
> about the interaction between "local_bh_disable()" and the preempt count, 
> and why you're not using "read_lock_bh()".
> 
> And if you don't want a read-lock, then fine - don't use a read-lock, do 
> something else. But then don't just re-implement it (badly) either and 
> call it something else!

Sigh!!!

Part of the problem is that back in 1971, Courtois, Heymans, and Parnas
foolishly named their article "Concurrent Control with 'Readers' and
'Writers'", which led to the name "reader-writer lock".  This started
really biting around 1991, when Hsieh and Weihl created a reader-optimized
lock similar to brlock, but with each of the per-CPU locks being
exclusive rather than each being an rwlock.

The problem was that Hsieh's and Weihl's lock was more than just
a reader-writer lock.  It could also be used (and -was- used) as a
local/global lock, where for example you acquire your own lock to make
local changes, and acquire all of the locks to obtain a consistent view
of global state.  In which case, you would read-acquire the lock in
order to write, and write-acquire the lock in order to read.  Blech.

So, would it help if the function names in this patch said something
about "local" and "global" rather than "read" and "write"?
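
Roughly, the same v11 primitives under local/global names would be a
thin renaming -- a sketch only, these exact identifiers are not in the
posted patch:

	static inline void xt_info_local_lock_bh(void)
	{
		local_bh_disable();
		read_lock(&__get_cpu_var(xt_info_locks));
	}

	static inline void xt_info_global_lock_bh(void)
	{
		unsigned int i;

		local_bh_disable();
		for_each_possible_cpu(i)
			write_lock(&per_cpu(xt_info_locks, i));
	}

with the unlock paths simply reversed.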

							Thanx, Paul


> 			Linus
> 
> PS: Ingo, why do the *_bh() functions in kernel/spinlock.c do _both_ a 
> "local_bh_disable()" and a "preempt_disable()"? BH disable should disable 
> preemption too, no? Or am I confused? In which case we need that in 
> the above rdlock_bh too.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 18:15                                                                               ` Stephen Hemminger
@ 2009-04-21 19:10                                                                                 ` Ingo Molnar
  2009-04-21 19:46                                                                                     ` Eric Dumazet
  2009-04-21 21:04                                                                                   ` Stephen Hemminger
  2009-04-21 19:39                                                                                 ` Ingo Molnar
  1 sibling, 2 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-21 19:10 UTC (permalink / raw)
  To: Stephen Hemminger, Peter Zijlstra
  Cc: Linus Torvalds, Paul Mackerras, paulmck, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers


* Stephen Hemminger <shemminger@vyatta.com> wrote:

> +void xt_info_wrlock_bh(void)
> +{
> +	unsigned int i;
> +
> +	local_bh_disable();
> +	for_each_possible_cpu(i) {
> +		write_lock(&per_cpu(xt_info_locks, i));
> +#if NR_CPUS > (PREEMPT_MASK - 1)
> +		/*
> +		 * Since spin_lock disables preempt, the following is
> +		 * required to avoid overflowing the preempt counter
> +		 */
> +		preempt_enable_no_resched();
> +#endif
> +	}
> +}

hm, this is rather ugly and it will make a lot of instrumentation 
code explode.

Why not use the obvious solution: a _single_ wrlock for global 
access and read_can_lock() plus per cpu locks in the fastpath?

That way there's no global cacheline bouncing (just the _reading_ of 
a global cacheline - which will be nicely localized - on NUMA too) - 
and we will hold at most 1-2 locks at once!

Something like:

	__cacheline_aligned DEFINE_RWLOCK(global_wrlock);

	DEFINE_PER_CPU(rwlock_t, local_lock);


	void local_read_lock(void)
	{
	again:
		read_lock(&per_cpu(local_lock, this_cpu));

		if (unlikely(!read_can_lock(&global_wrlock))) {
			read_unlock(&per_cpu(local_lock, this_cpu));
			/*
			 * Just wait for any global write activity:
			 */
			read_unlock_wait(&global_wrlock);
			goto again;
		}
	}

	void global_write_lock(void)
	{
		write_lock(&global_wrlock);

		for_each_possible_cpu(i)
			write_unlock_wait(&per_cpu(local_lock, i));
	}

Note how nesting friendly this construct is: we dont actually _hold_ 
NR_CPUS locks all at once, we simply cycle through all CPUs and make 
sure they have our attention.

No preempt overflow. No lockdep explosion. A very fast and scalable 
read path.

Okay - we need to implement read_unlock_wait() and 
write_unlock_wait() which is similar to spin_unlock_wait(). The 
trivial first-approximation is:

	read_unlock_wait(x)
	{
		read_lock(x);
		read_unlock(x);
	}

	write_unlock_wait(x)
	{
		write_lock(x);
		write_unlock(x);
	}

Hm?

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 18:15                                                                               ` Stephen Hemminger
  2009-04-21 19:10                                                                                 ` Ingo Molnar
@ 2009-04-21 19:39                                                                                 ` Ingo Molnar
  2009-04-21 21:39                                                                                   ` [PATCH] netfilter: use per-cpu recursive lock (v13) Stephen Hemminger
  1 sibling, 1 reply; 254+ messages in thread
From: Ingo Molnar @ 2009-04-21 19:39 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Linus Torvalds, Paul Mackerras, paulmck, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers


* Stephen Hemminger <shemminger@vyatta.com> wrote:

> +void xt_info_wrunlock_bh(void)
> +	__releases(xt_info_lock)
> +{
> +	unsigned int i;
> +
> +	for_each_possible_cpu(i) {
> +#if NR_CPUS > (PREEMPT_MASK - 1)
> +		/*
> +		 * Spin_unlock calls preempt_enable, but since we had
> +		 * to adjust the count in xt_info_wrlock_bh, do it again
> +		 */
> +		preempt_disable();
> +#endif
> +		write_unlock(&per_cpu(xt_info_locks, i));
> +	}
> +	local_bh_enable();
> +}

In the global/local lock scheme i proposed this would become:

	global_write_unlock(void)
	{
		write_unlock(&global_lock);
	}

As we dont hold the local locks during the write-locked critical 
section. No loop needed over CPUs, no preempt nesting complications, 
no lockdep complications, etc.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 19:10                                                                                 ` Ingo Molnar
@ 2009-04-21 19:46                                                                                     ` Eric Dumazet
  2009-04-21 21:04                                                                                   ` Stephen Hemminger
  1 sibling, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-21 19:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

Ingo Molnar wrote:
> 
> Why not use the obvious solution: a _single_ wrlock for global 
> access and read_can_lock() plus per cpu locks in the fastpath?

Obvious is not the qualifier I would use :)

Brilliant yes :)

> 
> That way there's no global cacheline bouncing (just the _reading_ of 
> a global cacheline - which will be nicely localized - on NUMA too) - 
> and we will hold at most 1-2 locks at once!
> 
> Something like:
> 
> 	__cacheline_aligned DEFINE_RWLOCK(global_wrlock);
> 
> 	DEFINE_PER_CPU(rwlock_t, local_lock);
> 
> 
> 	void local_read_lock(void)
> 	{
> 	again:
> 		read_lock(&per_cpu(local_lock, this_cpu));

Hmm... here we can see global_wrlock locked by one writer while
this cpu has already called local_read_lock() and then calls this
function again -> deadlock, because we still hold our local_lock.

> 
> 		if (unlikely(!read_can_lock(&global_wrlock))) {
> 			read_unlock(&per_cpu(local_lock, this_cpu));
> 			/*
> 			 * Just wait for any global write activity:
> 			 */
> 			read_unlock_wait(&global_wrlock);
> 			goto again;
> 		}
> 	}
> 
> 	void global_write_lock(void)
> 	{
> 		write_lock(&global_wrlock);
> 
> 		for_each_possible_cpu(i)
> 			write_unlock_wait(&per_cpu(local_lock, i));
> 	}
> 
> Note how nesting friendly this construct is: we dont actually _hold_ 
> NR_CPUS locks all at once, we simply cycle through all CPUs and make 
> sure they have our attention.
> 
> No preempt overflow. No lockdep explosion. A very fast and scalable 
> read path.
> 
> Okay - we need to implement read_unlock_wait() and 
> write_unlock_wait() which is similar to spin_unlock_wait(). The 
> trivial first-approximation is:
> 
> 	read_unlock_wait(x)
> 	{
> 		read_lock(x);
> 		read_unlock(x);
> 	}
> 
> 	write_unlock_wait(x)
> 	{
> 		write_lock(x);
> 		write_unlock(x);
> 	}
> 

Very interesting, and it could be changed to use a spinlock + depth counter per cpu.

-> we can detect recursion and avoid the deadlock, and we only use one
atomic operation per lock/unlock pair in the fast path (this was the reason we
tried so hard to use a per-cpu spinlock during this thread)


__cacheline_aligned DEFINE_RWLOCK(global_wrlock);

struct ingo_local_lock {
	spinlock_t lock;
	int depth;
};
DEFINE_PER_CPU(struct ingo_local_lock, local_lock);


void local_read_lock(void)
{
	struct ingo_local_lock *lck;

	local_bh_and_preempt_disable();
	lck = &get_cpu_var(local_lock);
	if (++lck->depth > 0) /* already locked */
		return;
again:
	spin_lock(&lck->lock);

	if (unlikely(!read_can_lock(&global_wrlock))) {
		spin_unlock(&lck->lock);
		/*
		 * Just wait for any global write activity:
		 */
		read_unlock_wait(&global_wrlock);
		goto again;
	}
}

void global_write_lock(void)
{
	write_lock(&global_wrlock);

	for_each_possible_cpu(i)
		spin_unlock_wait(&per_cpu(local_lock, i));
}
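
The matching unlock side is not spelled out above; assuming depth is
initialised to -1 (so the first local_read_lock() takes the spinlock
when ++depth == 0) and the same hypothetical local_bh_and_preempt_*
helpers, it could look like:

	void local_read_unlock(void)
	{
		struct ingo_local_lock *lck = &__get_cpu_var(local_lock);

		if (--lck->depth < 0)	/* outermost unlock drops the spinlock */
			spin_unlock(&lck->lock);
		local_bh_and_preempt_enable();
	}

	void global_write_unlock(void)
	{
		write_unlock(&global_wrlock);
	}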



Hmm ?


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 18:34                                                                               ` [PATCH] netfilter: use per-cpu recursive lock (v11) Paul E. McKenney
@ 2009-04-21 20:14                                                                                 ` Linus Torvalds
  0 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-21 20:14 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Stephen Hemminger, Paul Mackerras, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, mingo,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers



On Tue, 21 Apr 2009, Paul E. McKenney wrote:
> 
> I believe that at least some of this is naming...

Maybe.

I do agree that the way netfilter would use the lock is somewhat different 
from a normal 'reader-writer' lock, since this special case of readers is 
about a per-cpu thing.

> So, would it help if the function names in this patch said something
> about "local" and "global" rather than "read" and "write"?

Oh, I would have no problem at all with 'local' and 'global', in fact it 
would explain _why_ that read-write lock works.

The problem with naming I have is with the 'recursive' part. There is no 
ambiguity what-so-ever about what a "recursive lock" is (at least of the 
traditional kind), and the lock described here is not it.

So don't get me wrong - I could certainly live with a special lock in the 
networking. BUT:

 - it had damn well be documented as to what it does, and why it works

 - and it had better actually _work_, and not be buggy.

I suspect that using our regular reader-writer locks works well enough, 
and yes, it's probably worth making it really clear that the reader 
variety can only ever be used in the "local" form. That kind of is implied 
by the whole function, though.

And if somebody wants to open-code it as a per-cpu spinlock and a per-cpu 
'local counter', I can live with that too, but at that point I want people 
to just be a lot more careful, and also document it a lot more. 
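
Open-coded, that per-cpu spinlock plus local counter would look roughly
like this (sketch only, not a patch):

	struct xt_info_lock {
		spinlock_t lock;
		unsigned char readers;
	};
	DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);

	static inline void xt_info_rdlock_bh(void)
	{
		struct xt_info_lock *lock;

		local_bh_disable();
		lock = &__get_cpu_var(xt_info_locks);
		if (likely(!lock->readers++))
			spin_lock(&lock->lock);
	}

i.e. only the outermost "reader" on a cpu actually takes the spinlock,
the unlock only releases it when the count drops back to zero, and the
whole thing needs a big comment saying it is local-only.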

		Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 19:10                                                                                 ` Ingo Molnar
  2009-04-21 19:46                                                                                     ` Eric Dumazet
@ 2009-04-21 21:04                                                                                   ` Stephen Hemminger
  2009-04-22  8:00                                                                                     ` Ingo Molnar
  1 sibling, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21 21:04 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Linus Torvalds, Paul Mackerras, paulmck,
	Eric Dumazet, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

On Tue, 21 Apr 2009 21:10:07 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Stephen Hemminger <shemminger@vyatta.com> wrote:
> 
> > +void xt_info_wrlock_bh(void)
> > +{
> > +	unsigned int i;
> > +
> > +	local_bh_disable();
> > +	for_each_possible_cpu(i) {
> > +		write_lock(&per_cpu(xt_info_locks, i));
> > +#if NR_CPUS > (PREEMPT_MASK - 1)
> > +		/*
> > +		 * Since spin_lock disables preempt, the following is
> > +		 * required to avoid overflowing the preempt counter
> > +		 */
> > +		preempt_enable_no_resched();
> > +#endif
> > +	}
> > +}
> 
> hm, this is rather ugly and it will make a lot of instrumentation 
> code explode.

Better general solutions:
    * use raw_spin_lock
    * increase PREEMPT_BITS on 64-bit machines
      and limit to 128 CPUs or fewer on 32-bit
    * get rid of the default preempt_disable in spin_lock

You choose. It is a general problem.
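
The underlying limit, roughly (macros paraphrased from
include/linux/hardirq.h of this era): preempt_count() reserves only
PREEMPT_BITS bits for the preemption depth, and each write_lock() does
a preempt_disable(), so write-locking one lock per possible cpu
overflows into the softirq bits once NR_CPUS exceeds PREEMPT_MASK - 1:

	#define PREEMPT_BITS	8
	#define PREEMPT_SHIFT	0
	#define PREEMPT_MASK	(((1UL << PREEMPT_BITS) - 1) << PREEMPT_SHIFT)	/* 0xff */

	/* e.g. NR_CPUS = 4096 write_lock()s -> preempt depth 4096 > 255 */

hence the preempt_enable_no_resched() in the loop above.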

> Why not use the obvious solution: a _single_ wrlock for global 
> access and read_can_lock() plus per cpu locks in the fastpath?
> 
> That way there's no global cacheline bouncing (just the _reading_ of 
> a global cacheline - which will be nicely localized - on NUMA too) - 
> and we will hold at most 1-2 locks at once!
> 
> Something like:
> 
> 	__cacheline_aligned DEFINE_RWLOCK(global_wrlock);
> 
> 	DEFINE_PER_CPU(rwlock_t, local_lock);
> 
> 
> 	void local_read_lock(void)
> 	{
> 	again:
> 		read_lock(&per_cpu(local_lock, this_cpu));
> 
> 		if (unlikely(!read_can_lock(&global_wrlock))) {
> 			read_unlock(&per_cpu(local_lock, this_cpu));
> 			/*
> 			 * Just wait for any global write activity:
> 			 */
> 			read_unlock_wait(&global_wrlock);
> 			goto again;
> 		}
> 	}

Quit trying to be so damn f*cking cool. We don't build the software
for locking instrumentation. Locking instrumentation needs to serve
the kernel, not the other way around. 

Your version fails for the case of nested local rules, which was
the whole reason I switched to reader/writer locks.

        CPU 1                            CPU 2
        local_read_lock
        ...                              global_write_lock
            local_read_lock
Stuck...

CPU 2 is waiting for CPU 1 to get out of its nested table processing;
CPU 1 is waiting for the global write lock to be released.
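
The nesting comes from rule processing that re-enters the table code on
the same cpu -- for example (illustrative call chain, not an actual
trace):

	ipt_do_table()			/* read-locks this cpu's lock	    */
	  -> REJECT target
	    -> ip_local_out()		/* locally generated RST/ICMP	    */
	      -> ipt_do_table()		/* read-locks the same lock again   */

A plain rwlock lets the inner read_lock() through even with a writer
pending; with the read_can_lock() check above, the inner acquisition
waits on the pending writer while the outer read lock is still held.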


^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-cpu recursive lock (v13)
  2009-04-21 19:39                                                                                 ` Ingo Molnar
@ 2009-04-21 21:39                                                                                   ` Stephen Hemminger
  2009-04-22  4:17                                                                                     ` Paul E. McKenney
                                                                                                       ` (2 more replies)
  0 siblings, 3 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-21 21:39 UTC (permalink / raw)
  To: Ingo Molnar, Linus Torvalds
  Cc: Paul Mackerras, paulmck, Eric Dumazet, Evgeniy Polyakov,
	David Miller, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh, mathieu.desnoyers

This version of x_tables (ip/ip6/arp) locking uses a per-cpu
recursive lock that can be nested.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu; the fast path locks on the current cpu
and updates counters.  This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2). 

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
CHANGES 
  - reader and write now inline
  - only acquire one cpu write lock at a time
  - write lock pushed down into get_counters

 include/linux/netfilter/x_tables.h |   50 +++++++++++++--
 net/ipv4/netfilter/arp_tables.c    |  121 ++++++++++---------------------------
 net/ipv4/netfilter/ip_tables.c     |  120 +++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    |  120 ++++++++++--------------------------
 net/netfilter/x_tables.c           |   55 +++++++++-------
 5 files changed, 174 insertions(+), 292 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-21 07:57:06.668582345 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-21 14:24:03.295299154 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,51 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+
+/*
+ * Per-CPU read/write lock. This makes reader locking fast since
+ * there is no shared variable to cause cache ping-pong; but adds an
+ * additional write-side penalty since write must iterate over all
+ * possible CPU's. Readers read lock their per-cpu lock, and writers
+ * write lock on all CPU's.
+ *
+ * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while packet
+ * is being processed.
+ *
+ * Write lock is used in two cases:
+ *    1. reading counter values
+ *       all readers need to be stopped and the per-CPU values are summed.
+ *
+ *    2. replacing rules
+ *       all packets in flight have to be processed before rules are swapped,
+ *       then counters are read from the old (stale) info.
+ *
+ */
+DECLARE_PER_CPU(rwlock_t, xt_info_locks);
+
+static inline void xt_info_rdlock_bh(void)
+{
+	local_bh_disable();
+	read_lock(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+	read_unlock_bh(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_wrlock(unsigned int cpu)
+{
+	write_lock(&per_cpu(xt_info_locks, cpu));
+}
+
+static inline void xt_info_wrunlock(unsigned int cpu)
+{
+
+	write_unlock(&per_cpu(xt_info_locks, cpu));
+}
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-21 07:57:06.649549203 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-21 14:33:29.842423044 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -897,89 +895,39 @@ get_counters(const struct xt_table_info 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
+	xt_info_wrunlock(curcpu);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +936,11 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1377,11 +1306,23 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1437,25 +1378,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-21 07:57:06.605264365 -0700
+++ b/net/netfilter/x_tables.c	2009-04-21 14:33:50.177486967 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(rwlock_t, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	unsigned int i;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	local_bh_disable();
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		local_bh_enable();
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
 
-	synchronize_net();
-	return oldinfo;
+	table->private = newinfo;
+	newinfo->initial_entries = private->initial_entries;
+
+	/* wait for each other cpu to see new table */
+	for_each_possible_cpu(i)
+		if (i != smp_processor_id()) {
+			xt_info_wrlock(i);
+			xt_info_wrunlock(i);
+		}
+	local_bh_enable();
+
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1147,7 +1143,16 @@ static struct pernet_operations xt_net_o
 
 static int __init xt_init(void)
 {
-	int i, rv;
+	unsigned int i;
+	int rv;
+	static struct lock_class_key xt_lock_key[NR_CPUS];
+
+	for_each_possible_cpu(i) {
+		rwlock_t *lock = &per_cpu(xt_info_locks, i);
+
+		rwlock_init(lock);
+		lockdep_set_class(lock, xt_lock_key+i);
+	}
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-21 07:57:06.621260619 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-21 14:29:57.312236004 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -926,87 +926,40 @@ get_counters(const struct xt_table_info 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
+	xt_info_wrlock(curcpu);
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	xt_info_wrunlock(curcpu);
+
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1405,11 +1339,24 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1465,25 +1412,28 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	xt_info_wrlock(curcpu);
+	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-21 07:57:06.633261308 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-21 14:34:39.026255677 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -711,88 +711,39 @@ static void get_counters(const struct xt
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
+	xt_info_wrunlock(curcpu);
 
 	for_each_possible_cpu(cpu) {
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +753,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1165,10 +1097,23 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1224,26 +1169,26 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
-
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v13)
  2009-04-21 21:39                                                                                   ` [PATCH] netfilter: use per-cpu recursive lock (v13) Stephen Hemminger
@ 2009-04-22  4:17                                                                                     ` Paul E. McKenney
  2009-04-22 14:57                                                                                     ` Eric Dumazet
  2009-04-22 15:32                                                                                     ` Linus Torvalds
  2 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-22  4:17 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ingo Molnar, Linus Torvalds, Paul Mackerras, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Tue, Apr 21, 2009 at 02:39:27PM -0700, Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.

Looks good from a concurrency viewpoint!

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
> CHANGES 
>   - reader and write now inline
>   - only acquire one cpu write lock at a time

This is very good -- gets rid of problems with preemption nesting
depth limitations and lockdep limitations.  In addition, it gets rid of
the period of time during which all packet processing might otherwise
be blocked.  Not a big deal on a small machine, but could be a real
problem on a large one.

Very cool!!!

>   - write lock pushed down into get_counters
> 
>  include/linux/netfilter/x_tables.h |   50 +++++++++++++--
>  net/ipv4/netfilter/arp_tables.c    |  121 ++++++++++---------------------------
>  net/ipv4/netfilter/ip_tables.c     |  120 +++++++++---------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  120 ++++++++++--------------------------
>  net/netfilter/x_tables.c           |   55 +++++++++-------
>  5 files changed, 174 insertions(+), 292 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-04-21 07:57:06.668582345 -0700
> +++ b/include/linux/netfilter/x_tables.h	2009-04-21 14:24:03.295299154 -0700
> @@ -354,9 +354,6 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
> 
> -	/* Lock for the curtain */
> -	struct mutex lock;
> -
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> 
> @@ -434,8 +431,51 @@ extern void xt_proto_fini(struct net *ne
> 
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> -				    struct xt_table_info *new);
> +
> +
> +/*
> + * Per-CPU read/write lock. This makes reader locking fast since
> + * there is no shared variable to cause cache ping-pong; but adds an
> + * additional write-side penalty since write must iterate over all
> + * possible CPU's. Readers read lock their per-cpu lock, and writers
> + * write lock on all CPU's.
> + *
> + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
> + * It needs to ensure that the rules are not being changed while packet
> + * is being processed.
> + *
> + * Write lock is used in two cases:
> + *    1. reading counter values
> + *       all readers need to be stopped and the per-CPU values are summed.
> + *
> + *    2. replacing rules
> + *       all packets in flight have to be processed before rules are swapped,
> + *       then counters are read from the old (stale) info.
> + *
> + */
> +DECLARE_PER_CPU(rwlock_t, xt_info_locks);
> +
> +static inline void xt_info_rdlock_bh(void)
> +{
> +	local_bh_disable();
> +	read_lock(&__get_cpu_var(xt_info_locks));

Good, you do indeed need to prevent migration before you acquire this
CPU's lock.  Otherwise, you could have more than one CPU attempting to
update the same counter.
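
(If the order were reversed -- evaluating __get_cpu_var() before the
local_bh_disable() -- the task could migrate in between and end up
read-locking one cpu's lock while bumping another cpu's counters; an
illustrative interleaving, not code from the patch:

	lock = &__get_cpu_var(xt_info_locks);	/* running on CPU A */
						/* preempted, migrated to CPU B */
	local_bh_disable();
	read_lock(lock);			/* holds A's lock, updates B's entries */

so the bh-disable has to come first.)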

> +}
> +
> +static inline void xt_info_rdunlock_bh(void)
> +{
> +	read_unlock_bh(&__get_cpu_var(xt_info_locks));
> +}
> +
> +static inline void xt_info_wrlock(unsigned int cpu)
> +{
> +	write_lock(&per_cpu(xt_info_locks, cpu));
> +}
> +
> +static inline void xt_info_wrunlock(unsigned int cpu)
> +{
> +
> +	write_unlock(&per_cpu(xt_info_locks, cpu));
> +}
> 
>  /*
>   * This helper is performance critical and must be inlined
> --- a/net/ipv4/netfilter/ip_tables.c	2009-04-21 07:57:06.649549203 -0700
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-04-21 14:33:29.842423044 -0700
> @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
>  	tgpar.hooknum = hook;
> 
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
> 
>  	e = get_entry(table_base, private->hook_entry[hook]);
> 
> @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
> 
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -897,89 +895,39 @@ get_counters(const struct xt_table_info 
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.

Good.  Of course, the reason we care about preemption here is that
otherwise some other task could mess up this CPU's counters.

Bad for real-time response, but then again, what the heck are you
doing updating netfilter rules while a system is running a real-time
workload?

>  	 */
> -	curcpu = raw_smp_processor_id();
> +	local_bh_disable();
> +	curcpu = smp_processor_id();
> 
>  	i = 0;
> +	xt_info_wrlock(curcpu);
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
>  			  t->size,
>  			  set_entry_to_counter,
>  			  counters,
>  			  &i);
> +	xt_info_wrunlock(curcpu);
> 
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		xt_info_wrlock(cpu);
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> +		xt_info_wrunlock(cpu);
>  	}
> -
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
>  	local_bh_enable();
>  }
> 
> -
> -static inline int
> -zero_entry_counter(struct ipt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters * alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
> 
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -988,30 +936,11 @@ static struct xt_counters * alloc_counte
>  	counters = vmalloc_node(countersize, numa_node_id());
> 
>  	if (counters == NULL)
> -		goto nomem;
> +		return ERR_PTR(-ENOMEM);
> 
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
> 
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
> 
>  static int
> @@ -1377,11 +1306,23 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
> 
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> 
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
>  {
> -	unsigned int i;
> +	unsigned int i, curcpu;
>  	struct xt_counters_info tmp;
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
> @@ -1437,25 +1378,26 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
> 
> -	mutex_lock(&t->lock);
> +	local_bh_disable();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
> 
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	curcpu = smp_processor_id();
> +	loc_cpu_entry = private->entries[curcpu];
> +	xt_info_wrlock(curcpu);
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	xt_info_wrunlock(curcpu);
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> +	local_bh_enable();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/netfilter/x_tables.c	2009-04-21 07:57:06.605264365 -0700
> +++ b/net/netfilter/x_tables.c	2009-04-21 14:33:50.177486967 -0700
> @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
>  }
>  EXPORT_SYMBOL(xt_free_table_info);
> 
> -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> -			     struct xt_table_info *newinfo)
> -{
> -	unsigned int cpu;
> -
> -	for_each_possible_cpu(cpu) {
> -		void *p = oldinfo->entries[cpu];
> -		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> -		newinfo->entries[cpu] = p;
> -	}
> -
> -}
> -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> -
>  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
>  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
>  				    const char *name)
> @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
>  EXPORT_SYMBOL_GPL(xt_compat_unlock);
>  #endif
> 
> +DEFINE_PER_CPU(rwlock_t, xt_info_locks);
> +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);

Hmmm...  Not sure I want to know what module uses this.

Ah, that is right -- a bunch of the comms stack can be built as a module.

> +
> +
>  struct xt_table_info *
>  xt_replace_table(struct xt_table *table,
>  	      unsigned int num_counters,
>  	      struct xt_table_info *newinfo,
>  	      int *error)
>  {
> -	struct xt_table_info *oldinfo, *private;
> +	unsigned int i;
> +	struct xt_table_info *private;
> 
>  	/* Do the substitution. */
> -	mutex_lock(&table->lock);
> +	local_bh_disable();

Good, disabling bottom halves (which also disables preemption) keeps this
CPU's softirq readers from messing with its local state while we swap the
table pointer.

>  	private = table->private;
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> -		mutex_unlock(&table->lock);
> +		local_bh_enable();
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
> -	oldinfo = private;
> -	rcu_assign_pointer(table->private, newinfo);
> -	newinfo->initial_entries = oldinfo->initial_entries;
> -	mutex_unlock(&table->lock);
> 
> -	synchronize_net();
> -	return oldinfo;
> +	table->private = newinfo;
> +	newinfo->initial_entries = private->initial_entries;
> +
> +	/* wait for each other cpu to see new table */
> +	for_each_possible_cpu(i)
> +		if (i != smp_processor_id()) {
> +			xt_info_wrlock(i);
> +			xt_info_wrunlock(i);
> +		}

And the above loop acts sort of like a lock-based synchronize_rcu().
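
(Editorial aside: the x_tables.h half of this patch is not quoted in this
hunk, but the rd/wr helpers presumably look something like the sketch
below.  Readers take only their own CPU's lock, so once the writer has
taken and released every other CPU's write lock -- its own CPU's readers
being kept out by local_bh_disable() -- any reader that was still
traversing the old table pointer must have finished, hence the
"lock-based synchronize_rcu()" behaviour noted above.)

	DECLARE_PER_CPU(rwlock_t, xt_info_locks);

	static inline void xt_info_rdlock_bh(void)
	{
		local_bh_disable();
		read_lock(&__get_cpu_var(xt_info_locks));
	}

	static inline void xt_info_rdunlock_bh(void)
	{
		read_unlock_bh(&__get_cpu_var(xt_info_locks));
	}

	static inline void xt_info_wrlock(unsigned int cpu)
	{
		write_lock(&per_cpu(xt_info_locks, cpu));
	}

	static inline void xt_info_wrunlock(unsigned int cpu)
	{
		write_unlock(&per_cpu(xt_info_locks, cpu));
	}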

> +	local_bh_enable();
> +
> +	return private;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
> 
> @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struc
> 
>  	/* Simplifies replace_table code. */
>  	table->private = bootstrap;
> -	mutex_init(&table->lock);
> 
>  	if (!xt_replace_table(table, 0, newinfo, &ret))
>  		goto unlock;
> @@ -1147,7 +1143,16 @@ static struct pernet_operations xt_net_o
> 
>  static int __init xt_init(void)
>  {
> -	int i, rv;
> +	unsigned int i;
> +	int rv;
> +	static struct lock_class_key xt_lock_key[NR_CPUS];
> +
> +	for_each_possible_cpu(i) {
> +		rwlock_t *lock = &per_cpu(xt_info_locks, i);
> +
> +		rwlock_init(lock);
> +		lockdep_set_class(lock, xt_lock_key+i);
> +	}
> 
>  	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
>  	if (!xt)
> --- a/net/ipv6/netfilter/ip6_tables.c	2009-04-21 07:57:06.621260619 -0700
> +++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-21 14:29:57.312236004 -0700
> @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
> 
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> 
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
> 
>  	e = get_entry(table_base, private->hook_entry[hook]);
> 
> @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
>  #ifdef CONFIG_NETFILTER_DEBUG
>  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
>  #endif
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
> 
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -926,87 +926,40 @@ get_counters(const struct xt_table_info 
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
>  	 */
> -	curcpu = raw_smp_processor_id();
> +	local_bh_disable();
> +	curcpu = smp_processor_id();

Same story, but for IPv6.

>  	i = 0;
> +	xt_info_wrlock(curcpu);
>  	IP6T_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> +	xt_info_wrunlock(curcpu);
> +
> 
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		xt_info_wrlock(cpu);
>  		IP6T_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> +		xt_info_wrunlock(cpu);
>  	}
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ip6t_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IP6T_ENTRY_ITERATE(t->entries[cpu],
> -			   t->size,
> -			   add_counter_to_entry,
> -			   counters,
> -			   &i);
>  	local_bh_enable();
>  }
> 
> -static inline int
> -zero_entry_counter(struct ip6t_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				   zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters *alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
> 
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counter
>  	counters = vmalloc_node(countersize, numa_node_id());
> 
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> +		return ERR_PTR(-ENOMEM);
> 
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
> 
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
> 
>  static int
> @@ -1405,11 +1339,24 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
> 
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ip6t_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		int compat)
>  {
> -	unsigned int i;
> +	unsigned int i, curcpu;
>  	struct xt_counters_info tmp;
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
> @@ -1465,25 +1412,28 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
> 
> -	mutex_lock(&t->lock);
> +
> +	local_bh_disable();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
> 
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	curcpu = smp_processor_id();
> +	xt_info_wrlock(curcpu);
> +	loc_cpu_entry = private->entries[curcpu];
>  	IP6T_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	xt_info_wrunlock(curcpu);
> +
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> +	local_bh_enable();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/ipv4/netfilter/arp_tables.c	2009-04-21 07:57:06.633261308 -0700
> +++ b/net/ipv4/netfilter/arp_tables.c	2009-04-21 14:34:39.026255677 -0700
> @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
>  	indev = in ? in->name : nulldevname;
>  	outdev = out ? out->name : nulldevname;
> 
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
> 
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  	back = get_entry(table_base, private->underflow[hook]);
> @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
> 
>  			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
>  				(2 * skb->dev->addr_len);
> +
>  			ADD_COUNTER(e->counters, hdr_len, 1);
> 
>  			t = arpt_get_target(e);
> @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
> 
>  	if (hotdrop)
>  		return NF_DROP;
> @@ -711,88 +711,39 @@ static void get_counters(const struct xt
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
>  	 */
> -	curcpu = raw_smp_processor_id();
> +	local_bh_disable();

Same story, but for ARP.

> +	curcpu = smp_processor_id();
> 
>  	i = 0;
> +	xt_info_wrlock(curcpu);
>  	ARPT_ENTRY_ITERATE(t->entries[curcpu],
>  			   t->size,
>  			   set_entry_to_counter,
>  			   counters,
>  			   &i);
> +	xt_info_wrunlock(curcpu);
> 
>  	for_each_possible_cpu(cpu) {
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		xt_info_wrlock(cpu);
>  		ARPT_ENTRY_ITERATE(t->entries[cpu],
>  				   t->size,
>  				   add_entry_to_counter,
>  				   counters,
>  				   &i);
> +		xt_info_wrunlock(cpu);
>  	}
> -}
> -
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct arpt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	ARPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
>  	local_bh_enable();
>  }
> 
> -static inline int
> -zero_entry_counter(struct arpt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters *alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
> 
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	 * (other than comefrom, which userspace doesn't care
> @@ -802,30 +753,11 @@ static struct xt_counters *alloc_counter
>  	counters = vmalloc_node(countersize, numa_node_id());
> 
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> +		return ERR_PTR(-ENOMEM);
> 
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
> 
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
> 
>  static int copy_entries_to_user(unsigned int total_size,
> @@ -1165,10 +1097,23 @@ static int do_replace(struct net *net, v
>  	return ret;
>  }
> 
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct arpt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  			   int compat)
>  {
> -	unsigned int i;
> +	unsigned int i, curcpu;
>  	struct xt_counters_info tmp;
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
> @@ -1224,26 +1169,26 @@ static int do_add_counters(struct net *n
>  		goto free;
>  	}
> 
> -	mutex_lock(&t->lock);
> +	local_bh_disable();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
> 
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[smp_processor_id()];
> +	curcpu = smp_processor_id();
> +	loc_cpu_entry = private->entries[curcpu];
> +	xt_info_wrlock(curcpu);
>  	ARPT_ENTRY_ITERATE(loc_cpu_entry,
>  			   private->size,
>  			   add_counter_to_entry,
>  			   paddc,
>  			   &i);
> -	preempt_enable();
> +	xt_info_wrunlock(curcpu);
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> -
> +	local_bh_enable();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 19:46                                                                                     ` Eric Dumazet
@ 2009-04-22  7:35                                                                                       ` Ingo Molnar
  -1 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-22  7:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers


* Eric Dumazet <dada1@cosmosbay.com> wrote:

> Ingo Molnar wrote:
> > 
> > Why not use the obvious solution: a _single_ wrlock for global 
> > access and read_can_lock() plus per cpu locks in the fastpath?
> 
> Obvious is not the qualifier I would use :)
> 
> Brilliant yes :)

thanks :)

> > That way there's no global cacheline bouncing (just the 
> > _reading_ of a global cacheline - which will be nicely localized 
> > - on NUMA too) - and we will hold at most 1-2 locks at once!
> > 
> > Something like:
> > 
> > 	__cacheline_aligned DEFINE_RWLOCK(global_wrlock);
> > 
> > 	DEFINE_PER_CPU(rwlock_t local_lock);
> > 
> > 
> > 	void local_read_lock(void)
> > 	{
> > 	again:
> > 		read_lock(&per_cpu(local_lock, this_cpu));
> 
> Hmm... here we can see global_wrlock locked by one writer, while 
> this cpu already called local_read_lock(), and calls again this 
> function -> Deadlock, because we hold our local_lock locked.

Yeah, indeed.

I wasnt really concentrating on the nested case, i was concentrating 
on the scalability and lock nesting angle. I think the code 
submitted here looks rather incestuous in that regard.

Allowing nested locking _on the same CPU_ is asking for trouble. Use 
short critical sections and if there's any exclusion needed, use an 
irq-safe lock or a softirq-safe lock. Problem solved.

> Very interesting and could be changed to use spinlock + depth per 
> cpu.
> 
> -> we can detect recursion and avoid the deadlock, and we only use 
> one atomic operation per lock/unlock pair in fastpath (this was 
> the reason we tried hard to use a percpu spinlock during this 
> thread)
> 
> 
> __cacheline_aligned DEFINE_RWLOCK(global_wrlock);
> 
> struct ingo_local_lock {
> 	spinlock_t lock;
> 	int depth;
> };
> DEFINE_PER_CPU(struct ingo_local_lock local_lock);
> 
> 
> void local_read_lock(void)
> {
> 	struct ingo_local_lock *lck;
> 
> 	local_bh_and_preempt_disable();
> 	lck = &get_cpu_var(local_lock);
> 	if (++lck->depth > 0) /* already locked */
> 		return;
> again:
> 	spin_lock(&lck->lock);
> 
> 	if (unlikely(!read_can_lock(&global_wrlock))) {
> 		spin_unlock(&lck->lock);
> 		/*
> 		 * Just wait for any global write activity:
> 		 */
> 		read_unlock_wait(&global_wrlock);
> 		goto again;
> 	}
> }
> 
> void global_write_lock(void)
> {
> 	write_lock(&global_wrlock);
> 
> 	for_each_possible_cpu(i)
> 		spin_unlock_wait(&per_cpu(local_lock, i));
> }
> 
> Hmm ?

Yeah, this looks IRQ-nesting safe. But i really have to ask: why 
does this code try so hard to allow same-CPU nesting?

Nesting on the same CPU is _bad_ for several reasons:

1) Performance: it rips apart critical sections cache-wise. Instead 
   of a nice:

       C1 ... C2 ... C3 ... C4

   serial sequence of critical sections, we get:
 
       C1A ... ( C2 ) ... C1B ... C3 ... C4

   Note that C1 got "ripped apart" into C1A and C1B with C2 injected 
   - reducing cache locality between C1A and C1B. We have to execute
   C1B no matter what, so we didnt actually win anything in terms of 
   total work to do, by processing C2 out of order.

   [ Preemption of work (which this kind of nesting is really about)
     is _the anti-thesis of performance_, and we try to delay it as
     much as possible and we try to batch up as much as possible. 
     For example the CPU scheduler will try _real_ hard to not 
     preempt a typical workload, as long as external latency 
     boundaries allow that. ]

2) Locking complexity and robustness. Nested locking is rank #1 in
   terms of introducing regressions into the kernel.

3) Instrumentation/checking complexity. Checking locking 
   dependencies is good and catches a boatload of bugs before they 
   hit upstream, and nested locks are supported but cause an 
   exponential explosion in terms of dependencies to check.

   Also, whenever instrumentation explodes, it is typically the sign of 
   some true, physical complexity that has been introduced into the 
   code. So it often is a canary for a misdesign at a fundamental 
   level, not a failure in the instrumentation framework.

In the past i saw lock nesting often used as a wrong solution when 
the critical sections were too long (causing too long latencies for 
critical work - e.g. delaying hardirq completion processing 
unreasonably), or just plain out of confusion about the items above.

I dont know whether that's the case here - it could be one of the 
rare exceptions calling for a new locking primitive (which should 
then be introduced at the core kernel level IMHO) - i dont know the 
code that well.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-21 21:04                                                                                   ` Stephen Hemminger
@ 2009-04-22  8:00                                                                                     ` Ingo Molnar
  0 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-22  8:00 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Peter Zijlstra, Linus Torvalds, Paul Mackerras, paulmck,
	Eric Dumazet, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers


* Stephen Hemminger <shemminger@vyatta.com> wrote:

> > That way there's no global cacheline bouncing (just the 
> > _reading_ of a global cacheline - which will be nicely localized 
> > - on NUMA too) - and we will hold at most 1-2 locks at once!
> > 
> > Something like:
> > 
> > 	__cacheline_aligned DEFINE_RWLOCK(global_wrlock);
> > 
> > 	DEFINE_PER_CPU(rwlock_t local_lock);
> > 
> > 
> > 	void local_read_lock(void)
> > 	{
> > 	again:
> > 		read_lock(&per_cpu(local_lock, this_cpu));
> > 
> > 		if (unlikely(!read_can_lock(&global_wrlock))) {
> > 			read_unlock(&per_cpu(local_lock, this_cpu));
> > 			/*
> > 			 * Just wait for any global write activity:
> > 			 */
> > 			read_unlock_wait(&global_wrlock);
> > 			goto again;
> > 		}
> > 	}
> 
> Quit trying to be so damn f*cking cool. [...]

You make it quite hard to give reasonable feedback to your code :-/
First you attack me personally here, then - 30 minutes later - in 
the next iteration of your patch, you do:

+       /* wait for each other cpu to see new table */
+       for_each_possible_cpu(i)
+               if (i != smp_processor_id()) {
+                       xt_info_wrlock(i);
+                       xt_info_wrunlock(i);
+               }

... which i have not seen in your previous patch and which looks 
awfully similar to the write_lock_wait() based primitive i 
suggested.

( Just open-coded in an ugly fashion and slower than a real, proper
  wait-unlock would be, because it dirties all those locks 
  needlessly. )

So you must have agreed with me to a certain degree - i just dont 
see that in any of the discussion. (you seem to totally disagree 
with me to the level of ridiculing me.) Which makes it hard to 
discuss this on a rational basis.

> Your version fails for the case of nested local rules. [...]

Yes, as Eric pointed it out, more than an hour before your reply. I 
find the nesting uninteresting (in fact i find it harmful - see my 
reply to Eric). If you were only interested in nesting then a plain 
old-fashioned rwlock would have done the job.

The detail that is interesting here is how to avoid the global 
rwlock cacheline bounce - not the recursion. (the same-CPU recursion 
is avoidable via proper design or workaround-able via a counter in 
so many ways)
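
[ Editorial sketch, not from the thread: one shape the "counter"
  workaround mentioned above could take is a per-cpu recursion depth, so
  the nested call simply skips the lock its own CPU already holds.  The
  names here are purely illustrative; it assumes every user runs with
  bottom halves disabled, so the depth is only ever touched by one
  context per CPU, and that each lock is initialised with
  spin_lock_init() at boot:

	DEFINE_PER_CPU(spinlock_t, xt_lock);
	DEFINE_PER_CPU(unsigned int, xt_lock_depth);

	static void xt_local_lock(void)
	{
		local_bh_disable();
		if (__get_cpu_var(xt_lock_depth)++ == 0)
			spin_lock(&__get_cpu_var(xt_lock));
	}

	static void xt_local_unlock(void)
	{
		if (--__get_cpu_var(xt_lock_depth) == 0)
			spin_unlock(&__get_cpu_var(xt_lock));
		local_bh_enable();
	}

  A table-replacing writer would then take each CPU's xt_lock in turn to
  get the same "wait for all readers" effect as the rwlock loop above. ]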

Anyway, i'm back into lurker mode.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22  7:35                                                                                       ` Ingo Molnar
  (?)
@ 2009-04-22  8:53                                                                                       ` Eric Dumazet
  2009-04-22 10:13                                                                                         ` Jarek Poplawski
                                                                                                           ` (3 more replies)
  -1 siblings, 4 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-22  8:53 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

Ingo Molnar wrote:
> * Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
>> Ingo Molnar wrote:
>>> Why not use the obvious solution: a _single_ wrlock for global 
>>> access and read_can_lock() plus per cpu locks in the fastpath?
>> Obvious is not the qualifier I would use :)
>>
>> Brilliant yes :)
> 
> thanks :)
> 
>>> That way there's no global cacheline bouncing (just the 
>>> _reading_ of a global cacheline - which will be nicely localized 
>>> - on NUMA too) - and we will hold at most 1-2 locks at once!
>>>
>>> Something like:
>>>
>>> 	__cacheline_aligned DEFINE_RWLOCK(global_wrlock);
>>>
>>> 	DEFINE_PER_CPU(rwlock_t local_lock);
>>>
>>>
>>> 	void local_read_lock(void)
>>> 	{
>>> 	again:
>>> 		read_lock(&per_cpu(local_lock, this_cpu));
>> Hmm... here we can see global_wrlock locked by one writer, while 
>> this cpu already called local_read_lock(), and calls again this 
>> function -> Deadlock, because we hold our local_lock locked.
> 
> Yeah, indeed.
> 
> I wasnt really concentrating on the nested case, i was concentrating 
> on the scalability and lock nesting angle. I think the code 
> submitted here looks rather incestuous in that regard.
> 
> Allowing nested locking _on the same CPU_ is asking for trouble. Use 
> short critical sections and if there's any exclusion needed, use an 
> irq-safe lock or a softirq-safe lock. Problem solved.
> 
>> Very interesting and could be changed to use spinlock + depth per 
>> cpu.
>>
>> -> we can detect recursion and avoid the deadlock, and we only use 
>> one atomic operation per lock/unlock pair in fastpath (this was 
>> the reason we tried hard to use a percpu spinlock during this 
>> thread)
>>
>>
>> __cacheline_aligned DEFINE_RWLOCK(global_wrlock);
>>
>> struct ingo_local_lock {
>> 	spinlock_t lock;
>> 	int depth;
>> };
>> DEFINE_PER_CPU(struct ingo_local_lock local_lock);
>>
>>
>> void local_read_lock(void)
>> {
>> 	struct ingo_local_lock *lck;
>>
>> 	local_bh_and_preempt_disable();
>> 	lck = &get_cpu_var(local_lock);
>> 	if (++lck->depth > 0) /* already locked */
>> 		return;
>> again:
>> 	spin_lock(&lck->lock);
>>
>> 	if (unlikely(!read_can_lock(&global_wrlock))) {
>> 		spin_unlock(&lck->lock);
>> 		/*
>> 		 * Just wait for any global write activity:
>> 		 */
>> 		read_unlock_wait(&global_wrlock);
>> 		goto again;
>> 	}
>> }
>>
>> void global_write_lock(void)
>> {
>> 	write_lock(&global_wrlock);
>>
>> 	for_each_possible_cpu(i)
>> 		spin_unlock_wait(&per_cpu(local_lock, i));
>> }
>>
>> Hmm ?
> 
> Yeah, this looks IRQ-nesting safe. But i really have to ask: why 
> does this code try so hard to allow same-CPU nesting?
> 
> Nesting on the same CPU is _bad_ for several reasons:
> 
> 1) Performance: it rips apart critical sections cache-wise. Instead 
>    of a nice:
> 
>        C1 ... C2 ... C3 ... C4
> 
>    serial sequence of critical sections, we get:
>  
>        C1A ... ( C2 ) ... C1B ... C3 ... C4
> 
>    Note that C1 got "ripped apart" into C1A and C1B with C2 injected 
>    - reducing cache locality between C1A and C1B. We have to execute
>    C1B no matter what, so we didnt actually win anything in terms of 
>    total work to do, by processing C2 out of order.
> 
>    [ Preemption of work (which this kind of nesting is really about)
>      is _the anti-thesis of performance_, and we try to delay it as
>      much as possible and we try to batch up as much as possible. 
>      For example the CPU scheduler will try _real_ hard to not 
>      preempt a typical workload, as long as external latency 
>      boundaries allow that. ]
> 
> 2) Locking complexity and robustness. Nested locking is rank #1 in
>    terms of introducing regressions into the kernel.
> 
> 3) Instrumentation/checking complexity. Checking locking 
>    dependencies is good and catches a boatload of bugs before they 
>    hit upstream, and nested locks are supported but cause an 
>    exponential explosion in terms of dependencies to check.
> 
>    Also, whenever instrumentation explodes, it is typically the sign of 
>    some true, physical complexity that has been introduced into the 
>    code. So it often is a canary for a misdesign at a fundamental 
>    level, not a failure in the instrumentation framework.
> 
> In the past i saw lock nesting often used as a wrong solution when 
> the critical sections were too long (causing too long latencies for 
> critical work - e.g. delaying hardirq completion processing 
> unreasonably), or just plain out of confusion about the items above.
> 

I agree with all this Ingo.

> I dont know whether that's the case here - it could be one of the 
> rare exceptions calling for a new locking primitive (which should 
> then be introduced at the core kernel level IMHO) - i dont know the 
> code that well.


The netfilter case is really simple, Ingo (note I did not use "obvious" here ;) )

netfilter in 2.6.2[0-9] used :

CPU 1

softirq handles one packet from a NIC

ipt_do_table() /* to handle INPUT table for example, or FORWARD */
read_lock_bh(&a_central_and_damn_rwlock)
... parse rules
    -> calling some netfilter sub-function
       re-entering network IP stack to send some packet (say a RST packet)
       ...
       ipt_do_table() /* to handle OUTPUT table rules for example */
       read_lock_bh() ; /* WE RECURSE here, but once in a while (if ever) */


This is one of the cases, but others can happen with virtual networks, tunnels, ...
and so on. (Stephen had some cases with KVM, if I remember well.)

If this could be done without recursion, I am pretty sure netfilter
and network guys would have done it. I found Linus reaction quite
shocking IMHO, considering hard work done by all people on this.

I was pleased by your locking scheme; it was *very* interesting, even
if not yet ready.

1) We can discuss how bad recursion is.

We know loopback_xmit() could be way faster if we could avoid queueing packets
to the softirq handler.
(Remember, you and I suggested this patch some months ago? Please remember
David rejected it because of recursion and the possibility of overflowing the stack.)

So yes, people are aware of recursion problems.

2) We can discuss how bad rwlocks are

We did a lot of work in recent months to delete some of the rwlocks we had in the kernel.
The UDP stack, for example, doesn't use them anymore.

We tried to delete them in x_tables, but we must take care of ipt_do_table() nesting,
which is legal in 2.6.30.
Maybe the netfilter guys can work on avoiding this nesting in 2.6.31;
I don't know how hard it is, but it is definitely not 2.6.30 material.

Solutions were discussed many times and Stephen provided 13 versions of the patch.
If this were that obvious, one or two iterations would have been OK.

3) About the last patch (v13)
Stephen did not agree with you (well... maybe after all...);
he only resubmitted a previous version.

With linux-2.6.2[0-9], "iptables -L" used to block all cpus from entering netfilter
because we did a write_lock_bh() on the central rwlock while folding counters.

In v13, we don't try to freeze the whole x_tables context, but do it cpu by cpu.
That's a minor change against previous versions, and should not lead to strange
application behavior. It is so much more scalable that we should accept this change.




^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22  8:53                                                                                       ` Eric Dumazet
@ 2009-04-22 10:13                                                                                         ` Jarek Poplawski
  2009-04-22 11:26                                                                                           ` Ingo Molnar
  2009-04-22 11:18                                                                                         ` Ingo Molnar
                                                                                                           ` (2 subsequent siblings)
  3 siblings, 1 reply; 254+ messages in thread
From: Jarek Poplawski @ 2009-04-22 10:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ingo Molnar, Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

On 22-04-2009 10:53, Eric Dumazet wrote:
...
> If this could be done without recursion, I am pretty sure netfilter
> and network guys would have done it. I found Linus reaction quite
> shocking IMHO, considering hard work done by all people on this.

Right, looks like a 100% (Scandinavian?!) troll. I wonder where the
admin of this mess is...

Jarek P.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22  8:53                                                                                       ` Eric Dumazet
  2009-04-22 10:13                                                                                         ` Jarek Poplawski
@ 2009-04-22 11:18                                                                                         ` Ingo Molnar
  2009-04-22 15:19                                                                                         ` Linus Torvalds
  2009-04-22 17:48                                                                                         ` Ingo Molnar
  3 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-22 11:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers


* Eric Dumazet <dada1@cosmosbay.com> wrote:

> netfilter in 2.6.2[0-9] used :
> 
> CPU 1
> 
> softirq handles one packet from a NIC
> 
> ipt_do_table() /* to handle INPUT table for example, or FORWARD */
> read_lock_bh(&a_central_and_damn_rwlock)
> ... parse rules
>     -> calling some netfilter sub-function
>        re-entering network IP stack to send some packet (say a RST packet)
>        ...
>        ipt_do_table() /* to handle OUTPUT table rules for example */
>        read_lock_bh() ; /* WE RECURSE here, but once in a while (if ever) */
> 
> This is one of the cases, but others can happen with virtual 
> networks, tunnels, ... and so on. (Stephen had some cases with KVM 
> if I remember well)
> 
> If this could be done without recursion, I am pretty sure 
> netfilter and network guys would have done it. I found Linus 
> reaction quite shocking IMHO, considering hard work done by all 
> people on this.

Again ... i dont know this code well, but you yourself describe it 
as "a_central_and_damn_rwlock" above.

"Central and damn" locks of any type, anywhere tend to cause such 
trouble.

Often they are mini-BKLs in the making. Here are some historic
patterns:

 1- First there's a convenient lock around a popular and useful 
    piece of data and code.

 2- As popularity (and reach of code) grows, and some non-trivial 
    interaction ensues, a little bit of self-recursion is added to 
    the mix (this is often easier to do than to fix the root cause 
    of the problem) - making critical sections even easier to grow 
    in size.

 3- Then attempts are made to make it scale all (it's a popular 
    piece of code), it's extended along a per-cpu axis, but 
    complexity of locking explode by taking a ton of locks all at 
    once. The solution: yell at the lock validator for not allowing
    this (or for exploding due to the sheer mathematically 
    large complexity of the locking rules). Frequent requests for
    an exclusion from those pesky validations are made, and various
    hacks are done to work it around. It's all the fault of the lock
    validator, of course.

 4- At this point it's rarely a clean, tidy data lock anymore - it 
    tends to grow into a code lock nobody really knows how it works, 
    except that it better be taken more often than not, badness may
    ensue otherwise. Nobody really knows when to take it, only that 
    it should be taken widely enough, that it should be recursive 
    enough to call even _more_ code from under it. Efforts to reduce 
    critical section length are rebuffed with: "this adds unlocking
    and relocking overhead". And it should then all scale as well.

 5- In the end it's a lock everyone curses but nobody is able to fix 
    anymore. "If it was easy to fix we'd have fixed it long ago 
    already" kind of fatalist thinking becomes widespread.

We saw many examples of this in the past: beyond the BKL (which is a 
bit special as its more of an UP legacy - but which hurt us the 
most), we had the tasklist_lock which was problematic for a long 
time, then there's also the reiser3 locking which was extremely 
monolithic. [ i'm only naming safe examples here, where nobody will 
flame me =B-) ]

Whether this particular case applies is up to you. I see certain 
matches up to phase 3 or so - but i also see certain dissimilarities 
as well. Nor do i claim that it's easy to fix or improve.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22 10:13                                                                                         ` Jarek Poplawski
@ 2009-04-22 11:26                                                                                           ` Ingo Molnar
  2009-04-22 11:39                                                                                             ` Jarek Poplawski
  0 siblings, 1 reply; 254+ messages in thread
From: Ingo Molnar @ 2009-04-22 11:26 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Eric Dumazet, Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers


* Jarek Poplawski <jarkao2@gmail.com> wrote:

> On 22-04-2009 10:53, Eric Dumazet wrote:
> ...
> > If this could be done without recursion, I am pretty sure 
> > netfilter and network guys would have done it. I found Linus 
> > reaction quite shocking IMHO, considering hard work done by all 
> > people on this.
> 
> Right, looks like 100% (Scandinavian?!) troll. I wonder where is 
> the admin of this mess...

You might disagree with me on an honest technical basis, but note 
that this kind of childish, knee-jerk reaction against dissenting 
people is what gives netdev its bad reputation.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22 11:26                                                                                           ` Ingo Molnar
@ 2009-04-22 11:39                                                                                             ` Jarek Poplawski
  0 siblings, 0 replies; 254+ messages in thread
From: Jarek Poplawski @ 2009-04-22 11:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric Dumazet, Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

On Wed, Apr 22, 2009 at 01:26:12PM +0200, Ingo Molnar wrote:
> 
> * Jarek Poplawski <jarkao2@gmail.com> wrote:
> 
> > On 22-04-2009 10:53, Eric Dumazet wrote:
> > ...
> > > If this could be done without recursion, I am pretty sure 
> > > netfilter and network guys would have done it. I found Linus 
> > > reaction quite shocking IMHO, considering hard work done by all 
> > > people on this.
> > 
> > Right, looks like 100% (Scandinavian?!) troll. I wonder where is 
> > the admin of this mess...
> 
> You might disagree with me on an honest technical basis, but note 
> that this kind of childish, knee-jerk reaction against dissenting 
> people is what gives netdev its bad reputation.

How did you find that I disagree with you? There was nothing about you,
nor anything technical. Actually, I forgot to mention that I agree with
Eric about your locking proposal.

Jarek P.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v13)
  2009-04-21 21:39                                                                                   ` [PATCH] netfilter: use per-cpu recursive lock (v13) Stephen Hemminger
  2009-04-22  4:17                                                                                     ` Paul E. McKenney
@ 2009-04-22 14:57                                                                                     ` Eric Dumazet
  2009-04-22 15:32                                                                                     ` Linus Torvalds
  2 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-22 14:57 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ingo Molnar, Linus Torvalds, Paul Mackerras, paulmck,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested.
> 
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters.  This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2). 
> 
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Tested today on various machines and no problems so far.
tbench/oprofile results: 3.7236% of cpu time spent in ipt_do_table, and 0.84% used
on read_lock/read_unlock:

c04a5c30 <ipt_do_table>: /* ipt_do_table total: 217134  3.7236 */
...
   349  0.0060 :c04a5ccc:   call   c04ce380 <_read_lock>
 23914  0.4101 :c04a5cd1:   mov    0xc(%edi),%eax
...
               :c04a5ecb:   call   c04ce5f0 <_read_unlock_bh>
 25279  0.4335 :c04a5ed0:   cmpb   $0x0,-0xd(%ebp)

"iptables -L" fetches its data very fast too.
150 us on a 8 cpus machine, small firewall rules.
400-700 us on same machine, 1000 fw rules set (160000 bytes per cpu)
 depending on network trafic (from light to flood)


Thanks

> 
> ---
> CHANGES 
>   - reader and write now inline
>   - only acquire one cpu write lock at a time
>   - write lock pushed down into get_counters
> 
>  include/linux/netfilter/x_tables.h |   50 +++++++++++++--
>  net/ipv4/netfilter/arp_tables.c    |  121 ++++++++++---------------------------
>  net/ipv4/netfilter/ip_tables.c     |  120 +++++++++---------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  120 ++++++++++--------------------------
>  net/netfilter/x_tables.c           |   55 +++++++++-------
>  5 files changed, 174 insertions(+), 292 deletions(-)
> 


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22  8:53                                                                                       ` Eric Dumazet
  2009-04-22 10:13                                                                                         ` Jarek Poplawski
  2009-04-22 11:18                                                                                         ` Ingo Molnar
@ 2009-04-22 15:19                                                                                         ` Linus Torvalds
  2009-04-22 16:57                                                                                           ` Eric Dumazet
  2009-04-22 17:48                                                                                         ` Ingo Molnar
  3 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-22 15:19 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ingo Molnar, Stephen Hemminger, Peter Zijlstra, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers



On Wed, 22 Apr 2009, Eric Dumazet wrote:
> 
> If this could be done without recursion, I am pretty sure netfilter
> and network guys would have done it. I found Linus reaction quite
> shocking IMHO, considering hard work done by all people on this.

You don't _understand_ do you?

There is a huge difference between recursive code, and a recursive lock.

The netfilter code may need to occasionally re-enter itself. Nobody ever 
contested _that_ part.

What I have disagreed with the whole time is 

 (a) doing local ad-hoc locking primitives without any comments 
     what-so-ever.

 (b) Doing them _wrong_ in many cases

 (c) Calling the _lock_ a "recursive" lock.

The fact that a lock works with recursion doesn't make it "recursive". 
That generally has a very special meaning for locking primitives, and 
means something else.

In contrast, a read-write lock actually has known properties, and we have 
existing locking mechanisms for those. And we call them read-write locks 
DESPITE THE FACT that the reading part can be done recursively. 

If you call a read-write lock a "recursive" lock, then you're a moron. 
It simply is _not_ a recursive lock. And neither is the lock you actually 
implemented, even though you (and Stephen) continually call it that. 

SO STOP CALLING IT A RECURSIVE LOCK. Look at your very own code: you can 
actually only use that lock in a recursive context in a _very_ specific 
place. Notice how it's only "recursive" when taken in the per-CPU context, 
but _not_ recursive when the filter-updating code ("writer") takes it?

Do you understand now? It really shouldn't be so hard for you. 

Naming is important. Locking is important. You did both things wrong. You 
named your locks something incorrect and mis-leading that didn't actually 
describe them, and you did your own private locking code without then 
documenting what the rules for this special lock were.

Maybe in your world that's ok. But no, in mine it's not. I've seen too 
many damn _non-functioning_ locks to ever want to see stuff like that 
again.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v13)
  2009-04-21 21:39                                                                                   ` [PATCH] netfilter: use per-cpu recursive lock (v13) Stephen Hemminger
  2009-04-22  4:17                                                                                     ` Paul E. McKenney
  2009-04-22 14:57                                                                                     ` Eric Dumazet
@ 2009-04-22 15:32                                                                                     ` Linus Torvalds
  2009-04-24  4:09                                                                                       ` [PATCH] netfilter: use per-CPU recursive lock {XIV} Stephen Hemminger
  2 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-22 15:32 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ingo Molnar, Paul Mackerras, paulmck, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers



On Tue, 21 Apr 2009, Stephen Hemminger wrote:
>
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested.

Ack on the code.

But the comment is _still_ crap. Please update. It's not a recursive lock, 
as clearly shown by the code itself. It's a per-cpu read-write lock, and 
only the reader is "recursive" (but that's how read-write locks with in 
Linux, and that has nothing to do with anything).

So make the explanations match the code and the intent. Write it something 
like

	This version of x_tables (ip/ip6/arp) locking uses a per-cpu
	reader-writer lock where the readers can nest.

and don't confuse it with incorrect commit messages. The lock is very much 
not recursive - on purpose - for half the people taking it.

[ That, btw, was always true, even in the original random open-coded 
  version. Because you can't actually do a real recursive lock without 
   having a notion of "current ownership", either by making the count be 
  <per-thread,per-lock> - like the BKL - or by saving the ownership 
  information in the lock. A plain counter simply doesn't do it. ]
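
[ Editorial sketch, not from the thread: the <owner,depth> shape being
  described might look roughly like the following, with purely
  illustrative names.  The unlocked ->owner check is only safe because
  callers are assumed to run with preemption disabled, so
  smp_processor_id() is stable and a CPU can only ever find its own id
  in ->owner if it put it there itself:

	struct recursive_lock {
		spinlock_t	lock;
		int		owner;	/* CPU id, or -1 when unowned */
		unsigned int	depth;
	};

	static void recursive_lock_acquire(struct recursive_lock *l)
	{
		int cpu = smp_processor_id();

		if (l->owner == cpu) {	/* we already hold it: just nest */
			l->depth++;
			return;
		}
		spin_lock(&l->lock);
		l->owner = cpu;
		l->depth = 1;
	}

	static void recursive_lock_release(struct recursive_lock *l)
	{
		if (--l->depth == 0) {
			l->owner = -1;	/* clear before releasing the lock */
			spin_unlock(&l->lock);
		}
	}

  A bare counter, with no ->owner, cannot tell "my own nesting" apart
  from "some other CPU holds it", which is the point being made. ]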

		Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22 15:19                                                                                         ` Linus Torvalds
@ 2009-04-22 16:57                                                                                           ` Eric Dumazet
  2009-04-22 17:18                                                                                             ` Linus Torvalds
  0 siblings, 1 reply; 254+ messages in thread
From: Eric Dumazet @ 2009-04-22 16:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Stephen Hemminger, Peter Zijlstra, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers

Linus Torvalds wrote:
> 
> On Wed, 22 Apr 2009, Eric Dumazet wrote:
>> If this could be done without recursion, I am pretty sure netfilter
>> and network guys would have done it. I found Linus reaction quite
>> shocking IMHO, considering hard work done by all people on this.
> 
> You don't _understand_ do you?
> 
> There is a huge difference between recursive code, and a recursive lock.
> 
> The netfilter code may need to occasionally re-enter itself. Nobody ever 
> contested _that_ part.
> 
> What I have disagreed with the whole time is 
> 
>  (a) doing local ad-hoc locking primitives without any comments 
>      what-so-ever.
> 
>  (b) Doing them _wrong_ in many cases
> 
>  (c) Calling the _lock_ a "recursive" lock.
> 
> The fact that a lock works with recursion doesn't make it "recursive". 
> That generally has a very special meaning for locking primitives, and 
> means something else.
> 
> In contrast, a read-write lock actually has known properties, and we have 
> existing locking mechanisms for those. And we call them read-write locks 
> DESPITE THE FACT that the reading part can be done recursively. 
> 
> If you call a read-write lock a "recursive" lock, then you're a moron. 
> It simply is _not_ a recursive lock. And neither is the lock you actually 
> implemented, even though you (and Stephen) continually call it that. 
> 
> SO STOP CALLING IT A RECURSIVE LOCK. Look at your very own code: you can 
> actually only use that lock in a recursive context in a _very_ specific 
> place. Notice how it's only "recursive" when taken in the per-CPU context, 
> but _not_ recursive when the filter-updating code ("writer") takes it?
> 
> Do you understand now? It really shouldn't be so hard for you. 
> 
> Naming is important. Locking is important. You did both things wrong. You 
> named your locks something incorrect and mis-leading that didn't actually 
> describe them, and you did your own private locking code without then 
> documenting what the rules for this special lock were.
> 
> Maybe in your world that's ok. But no, in mine it's not. I've seen too 
> many damn _non-functioning_ locks to ever want to see stuff like that 
> again.
> 
>

Linus,

I actually sent *one* buggy patch, and you already gave your feedback and NACK.

Fine

I even relayed this to Stephen suggesting him not calling this a recursive lock.
(Note how I use 'suggesting' here)

So, what do you want from me ? Should I copy 100 times :

"I should not call it a recursive lock. I shall not invent new locking infra. I am a moron." 
"I should not call it a recursive lock. I shall not invent new locking infra. I am a moron." 
"I should not call it a recursive lock. I shall not invent new locking infra. I am a moron." 
"I should not call it a recursive lock. I shall not invent new locking infra. I am a moron." 
"I should not call it a recursive lock. I shall not invent new locking infra. I am a moron." 
"I should not call it a recursive lock. I shall not invent new locking infra. I am a moron." 
...

OK done

Can we now proceed and continue ?

Thank you


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22 16:57                                                                                           ` Eric Dumazet
@ 2009-04-22 17:18                                                                                             ` Linus Torvalds
  2009-04-22 20:46                                                                                               ` Jarek Poplawski
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-22 17:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ingo Molnar, Stephen Hemminger, Peter Zijlstra, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers



On Wed, 22 Apr 2009, Eric Dumazet wrote:
> 
> I actually sent *one* buggy patch, and you already gave your feedback 
> and NACK.

Actually, the thing is, I don't think your original patch was even 
buggy. The bug crept in later. I NAK'ed it not because it was buggy, but 
because of the ad-hoc'ness and the naming. Really.

And I actually even said so in my original rant:

 'The fact that code "happens to work by mistake" (and I'm not saying that
  your does - but it might just because of the per-cpu'ness of it [..]'

because your original patch still had the

	rcu_read_lock_bh();

in place before the whole

	rl = &__get_cpu_var(arp_tables_lock);
	if (likely(rl->count++ == 0))
		spin_lock(&rl->lock);

and that should have protected against both BH callers and preemption.

So I actually believe that your original patch probably worked fine (but 
as I said in my reaction to it, I thought it worked almost by mistake and 
I wasn't going to review it).

So the actual _bug_ crept in later, when the RCU lock was removed, and the 
lock was cleaned up and separated into a function of its own.

And in fact, that is kind of my point: "uncommented locking with ad-hoc 
semantics is very fragile". Even _correct_ code ends up not being correct 
in the long run, because people don't realize all the subtle issues.
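
To spell the subtle issue out with a sketch (again, illustration only - 
the names here are invented, not the submitted code): the per-cpu count 
trick is only safe if preemption and BH are already off before the 
per-cpu variable is touched, which is exactly what rcu_read_lock_bh() 
had been providing via local_bh_disable():

	struct xt_cpu_lock {
		spinlock_t lock;
		int count;		/* read-side nesting on this CPU */
	};
	static DEFINE_PER_CPU(struct xt_cpu_lock, xt_cpu_lock);

	static void xt_cpu_lock_acquire(void)
	{
		struct xt_cpu_lock *l;

		/* pins us to this CPU and keeps softirqs from getting in
		 * between the count increment and the spin_lock */
		local_bh_disable();
		l = &__get_cpu_var(xt_cpu_lock);
		if (likely(l->count++ == 0))
			spin_lock(&l->lock);
	}

	static void xt_cpu_lock_release(void)
	{
		struct xt_cpu_lock *l = &__get_cpu_var(xt_cpu_lock);

		if (likely(--l->count == 0))
			spin_unlock(&l->lock);
		local_bh_enable();
	}

Drop the implicit local_bh_disable() and the "correct" code quietly 
stops being correct.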

> I even relayed this to Stephen suggesting him not calling this a recursive lock.
> (Note how I use 'suggesting' here)
> 
> So, what do you want from me ? Should I copy 100 times :

So I consider this thread ended from a technical standpoint.

[ That said, I will not be at all shocked to hear if people decide later 
  that the RCU method was better after all, and that even the per-cpu
  rwlock or spinlock is just too expensive. ]

My problem today (apart from the relatively minor issue of also wanting to 
get the commit log fixed up) is just that I see emails from you finding my 
reaction shocking and from Jarek Poplawski that seem to still think that 
I'm a troll.

Just because I pointed out real technical problems? Is that shocking or 
trolling?

Really - please go back to my _original_ email. No, it was not polite. But 
here's another quote from it:

  "Because even if it works today, it's just a bug waiting to happen."

and dammit, I sent that out _before_ the very next version of the patch 
that actually _did_ introduce that exact bug.

So dammit - what part of my email was "shocking" or "trolling"? The part 
where I was right? Or what?

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22  8:53                                                                                       ` Eric Dumazet
                                                                                                           ` (2 preceding siblings ...)
  2009-04-22 15:19                                                                                         ` Linus Torvalds
@ 2009-04-22 17:48                                                                                         ` Ingo Molnar
  3 siblings, 0 replies; 254+ messages in thread
From: Ingo Molnar @ 2009-04-22 17:48 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Peter Zijlstra, Linus Torvalds,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers


* Eric Dumazet <dada1@cosmosbay.com> wrote:

> If this could be done without recursion, I am pretty sure 
> netfilter and network guys would have done it. I found Linus 
> reaction quite shocking IMHO, considering hard work done by all 
> people on this.

Btw., I didn't find Linus's reaction shocking at all, nor did I 
understand it as any criticism of the prior (and future) good work of 
the people involved. I found it to be what it was: a forceful 
(because repeated) criticism of a bad patch.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-cpu recursive lock (v11)
  2009-04-22 17:18                                                                                             ` Linus Torvalds
@ 2009-04-22 20:46                                                                                               ` Jarek Poplawski
  0 siblings, 0 replies; 254+ messages in thread
From: Jarek Poplawski @ 2009-04-22 20:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Ingo Molnar, Stephen Hemminger, Peter Zijlstra,
	Paul Mackerras, paulmck, Evgeniy Polyakov, David Miller, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, mathieu.desnoyers

Linus Torvalds wrote, On 04/22/2009 07:18 PM:

...
> My problem today (apart from the relatively minor issue of also wanting to 
> get the commit log fixed up) is just that I see emails from you finding my 
> reaction shocking and from Jarek Poplawski that seem to still think that 
> I'm a troll.
> 
> Just because I pointed out real technical problems? Is that shocking or 
> trolling?

Very funny! For newbies only: http://en.wikipedia.org/wiki/Troll_(Internet)


Actually, I admit you could be the best "Ozzy" Osbourne imposter instead.
(At least at netdev.) Stephen's words seem to fit very well:
> Ah a nice day, with Linus giving constructive feedback. Too bad he has
> to channel it out of the dark side.

Jarek P.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH]  netfilter: use per-CPU recursive lock {XIV}
  2009-04-22 15:32                                                                                     ` Linus Torvalds
@ 2009-04-24  4:09                                                                                       ` Stephen Hemminger
  2009-04-24  4:58                                                                                         ` Eric Dumazet
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-24  4:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Paul Mackerras, paulmck, Eric Dumazet,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

In days of old in 2.6.29, netfilter did locketh using a 
lock of the reader kind when doing its table business, and do
a writer when with pen in hand like a overworked accountant
did replace the tables. This sucketh and caused the single
lock to fly back and forth like a poor errant boy.

But then netfilter was blessed with RCU and the performance
was divine, but alas there were those that suffered for
trying to replace their many rules one at a time.

So now RCU must be vanquished from the scene, and better
chastity belts be placed upon this valuable asset most dear.
The locks that were but one are now replaced by one per suitor.

The repair was made after much discussion involving
Eric the wise, and Linus the foul. With flowers springing
up amid the thorns some peace has finally prevailed and
all is soothed. This patch and purple prose was penned
in honor of "Talk like Shakespeare" day.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
What hath changed over the last two setting suns:
  * more words, mostly correct...

  * no need to locketh for writeh on current cpu tis 
    always so

  * the locking of all cpu's on replace is always done as
    part of the get_counters cycle, so the synchronize swip
    in replace tables is gone with only a comment remaining

 include/linux/netfilter/x_tables.h |   55 ++++++++++++++--
 net/ipv4/netfilter/arp_tables.c    |  125 ++++++++++--------------------------
 net/ipv4/netfilter/ip_tables.c     |  126 ++++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    |  123 ++++++++++--------------------------
 net/netfilter/x_tables.c           |   55 ++++++++--------
 5 files changed, 188 insertions(+), 296 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-23 19:59:36.076558199 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-23 20:22:06.566001575 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,56 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+/*
+ * Per-CPU read/write lock associated with per-cpu table entries.
+ * This is not a general solution but makes reader locking fast since
+ * there is no shared variable to cause cache ping-pong; but adds an
+ * additional write-side penalty since update must lock all
+ * possible CPU's.
+ *
+ * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while packet
+ * is being processed. In some cases, the read lock will be acquired
+ * twice on the same CPU; this is okay because read locks handle nesting.
+ *
+ * Write lock is used in two cases:
+ *    1. reading counter values
+ *       all readers need to be stopped and the per-CPU values are summed.
+ *
+ *    2. replacing tables
+ *       any readers that are using the old tables have to complete
+ *       before freeing the old table. This is handled by reading
+ *	  as a side effect of reading counters
+ */
+DECLARE_PER_CPU(rwlock_t, xt_info_locks);
+
+static inline void xt_info_rdlock_bh(void)
+{
+	/*
+	 * Note: can not use read_lock_bh(&__get_cpu_var(xt_info_locks))
+	 * because need to ensure that preemption is disable before
+	 * acquiring per-cpu-variable, so do it as a two step process
+	 */
+	local_bh_disable();
+	read_lock(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+	read_unlock_bh(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_wrlock(unsigned int cpu)
+{
+	write_lock(&per_cpu(xt_info_locks, cpu));
+}
+
+static inline void xt_info_wrunlock(unsigned int cpu)
+{
+
+	write_unlock(&per_cpu(xt_info_locks, cpu));
+}
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-23 19:59:36.055558989 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-23 20:38:48.270239729 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
-
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-23 19:59:36.065578531 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-23 20:14:24.483579537 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info 
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 * with data used by 'current' CPU.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info 
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-23 19:59:36.047577942 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-23 20:46:47.356237966 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info 
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
+		return ERR_PTR(-ENOMEM);
 
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	xt_info_wrlock(curcpu);
+	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-23 19:59:36.038563216 -0700
+++ b/net/netfilter/x_tables.c	2009-04-23 20:09:04.809576277 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(rwlock_t, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	local_bh_disable();
 	private = table->private;
+
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		local_bh_enable();
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
 
-	synchronize_net();
-	return oldinfo;
+	table->private = newinfo;
+	newinfo->initial_entries = private->initial_entries;
+
+	/*
+	 * Even though table entries have now been swapped, other CPU's
+	 * may still be using the old entries. This is okay, because
+	 * resynchronization happens because of the locking done
+	 * during the get_counters() routine.
+	 */
+	local_bh_enable();
+
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1147,7 +1143,16 @@ static struct pernet_operations xt_net_o
 
 static int __init xt_init(void)
 {
-	int i, rv;
+	unsigned int i;
+	int rv;
+	static struct lock_class_key xt_lock_key[NR_CPUS];
+
+	for_each_possible_cpu(i) {
+		rwlock_t *lock = &per_cpu(xt_info_locks, i);
+
+		rwlock_init(lock);
+		lockdep_set_class(lock, xt_lock_key+i);
+	}
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)

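In short, the locking pattern the hunks above add, condensed into two 
illustrative functions (the function names are invented; iteration 
macros and error handling are elided):

	/* packet path: per-cpu reader, may nest on the same CPU */
	static void example_packet_path(struct xt_table *table)
	{
		const struct xt_table_info *private;
		const void *table_base;

		xt_info_rdlock_bh();	/* BH off + this CPU's read lock */
		private = table->private;
		table_base = private->entries[smp_processor_id()];
		/* ... walk the ruleset, ADD_COUNTER() on matches ... */
		xt_info_rdunlock_bh();
	}

	/* counter snapshot / replace path: writer, one CPU at a time */
	static void example_counter_path(const struct xt_table_info *t)
	{
		unsigned int cpu, curcpu;

		local_bh_disable();
		curcpu = smp_processor_id();
		/* no write lock needed for our own CPU: BH is off, so the
		 * only possible reader here is us */
		/* ... set counters from t->entries[curcpu] ... */
		for_each_possible_cpu(cpu) {
			if (cpu == curcpu)
				continue;
			xt_info_wrlock(cpu);	/* stop that CPU's readers */
			/* ... add t->entries[cpu] into counters ... */
			xt_info_wrunlock(cpu);
		}
		local_bh_enable();
	}
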
^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH]  netfilter: use per-CPU recursive lock {XIV}
  2009-04-24  4:09                                                                                       ` [PATCH] netfilter: use per-CPU recursive lock {XIV} Stephen Hemminger
@ 2009-04-24  4:58                                                                                         ` Eric Dumazet
  2009-04-24 15:33                                                                                             ` Patrick McHardy
  2009-04-24 16:18                                                                                             ` Stephen Hemminger
  0 siblings, 2 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-24  4:58 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Linus Torvalds, Ingo Molnar, Paul Mackerras, paulmck,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Stephen Hemminger wrote:
> In days of old in 2.6.29, netfilter did locketh using a 
> lock of the reader kind when doing its table business, and do
> a writer when with pen in hand like a overworked accountant
> did replace the tables. This sucketh and caused the single
> lock to fly back and forth like a poor errant boy.
> 
> But then netfilter was blessed with RCU and the performance
> was divine, but alas there were those that suffered for
> trying to replace their many rules one at a time.
> 
> So now RCU must be vanquished from the scene, and better
> chastity belts be placed upon this valuable asset most dear.
> The locks that were but one are now replaced by one per suitor.
> 
> The repair was made after much discussion involving
> Eric the wise, and Linus the foul. With flowers springing
> up amid the thorns some peace has finally prevailed and
> all is soothed. This patch and purple prose was penned by
> in honor of "Talk like Shakespeare" day.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


Philip Davis of the university’s School of English said :

  "Shakespeare surprises the brain and catches it off guard in
  a manner that produces a sudden burst of activity - a sense 
  of drama created out of the simplest of things."

http://www.physorg.com/news85664210.html

> 
> ---
> What hath changed over the last two setting suns:
>   * more words, mostly correct...
> 
>   * no need to locketh for writeh on current cpu tis 
>     always so
> 
>   * the locking of all cpu's on replace is always done as
>     part of the get_counters cycle, so the sychronize swip
>     in replace tables is gone with only a comment remaing
> 
>  include/linux/netfilter/x_tables.h |   55 ++++++++++++++--
>  net/ipv4/netfilter/arp_tables.c    |  125 ++++++++++--------------------------
>  net/ipv4/netfilter/ip_tables.c     |  126 ++++++++++---------------------------
>  net/ipv6/netfilter/ip6_tables.c    |  123 ++++++++++--------------------------
>  net/netfilter/x_tables.c           |   55 ++++++++--------
>  5 files changed, 188 insertions(+), 296 deletions(-)
> 


>  
>  static int __init xt_init(void)
>  {
> -	int i, rv;
> +	unsigned int i;
> +	int rv;
> +	static struct lock_class_key xt_lock_key[NR_CPUS];

Could we avoid this [NR_CPUS] thing ?

> +
> +	for_each_possible_cpu(i) {
> +		rwlock_t *lock = &per_cpu(xt_info_locks, i);
> +
> +		rwlock_init(lock);
> +		lockdep_set_class(lock, xt_lock_key+i);
> +	}


Did you tried :

static DECLARE_PER_CPU(struct lock_class_key, xt_locks_key);

static int __init xt_init(void)
 {
	unsigned int i;
	int rv;

	for_each_possible_cpu(i) {
		rwlock_t *lock = &per_cpu(xt_info_locks, i);

		rwlock_init(lock);
		lockdep_set_class(lock, &per_cpu(xt_locks_key, i));
	}
...

Thanks


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH]  netfilter: use per-CPU recursive lock {XIV}
  2009-04-24  4:58                                                                                         ` Eric Dumazet
@ 2009-04-24 15:33                                                                                             ` Patrick McHardy
  2009-04-24 16:18                                                                                             ` Stephen Hemminger
  1 sibling, 0 replies; 254+ messages in thread
From: Patrick McHardy @ 2009-04-24 15:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Linus Torvalds, Ingo Molnar, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

Eric Dumazet wrote:
> Stephen Hemminger a écrit :
>> In days of old in 2.6.29, netfilter did locketh using a 
>> lock of the reader kind when doing its table business, and do
>> a writer when with pen in hand like a overworked accountant
>> did replace the tables. This sucketh and caused the single
>> lock to fly back and forth like a poor errant boy.
>>
>> But then netfilter was blessed with RCU and the performance
>> was divine, but alas there were those that suffered for
>> trying to replace their many rules one at a time.
>>
>> So now RCU must be vanquished from the scene, and better
>> chastity belts be placed upon this valuable asset most dear.
>> The locks that were but one are now replaced by one per suitor.
>>
>> The repair was made after much discussion involving
>> Eric the wise, and Linus the foul. With flowers springing
>> up amid the thorns some peace has finally prevailed and
>> all is soothed. This patch and purple prose was penned by
>> in honor of "Talk like Shakespeare" day.

Hehe.

>>  static int __init xt_init(void)
>>  {
>> -	int i, rv;
>> +	unsigned int i;
>> +	int rv;
>> +	static struct lock_class_key xt_lock_key[NR_CPUS];
> 
> Could we avoid this [NR_CPUS] thing ?
> 
>> +
>> +	for_each_possible_cpu(i) {
>> +		rwlock_t *lock = &per_cpu(xt_info_locks, i);
>> +
>> +		rwlock_init(lock);
>> +		lockdep_set_class(lock, xt_lock_key+i);
>> +	}
> 
> 
> Did you tried :
> 
> static DECLARE_PER_CPU(struct lock_class_key, xt_locks_key);

Either way is fine with me, I'll wait for Stephen to state his opinion.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH]  netfilter: use per-CPU recursive lock {XIV}
  2009-04-24  4:58                                                                                         ` Eric Dumazet
@ 2009-04-24 16:18                                                                                             ` Stephen Hemminger
  2009-04-24 16:18                                                                                             ` Stephen Hemminger
  1 sibling, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-24 16:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Ingo Molnar, Paul Mackerras, paulmck,
	Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

On Fri, 24 Apr 2009 06:58:39 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger a écrit :
> > In days of old in 2.6.29, netfilter did locketh using a 
> > lock of the reader kind when doing its table business, and do
> > a writer when with pen in hand like a overworked accountant
> > did replace the tables. This sucketh and caused the single
> > lock to fly back and forth like a poor errant boy.
> > 
> > But then netfilter was blessed with RCU and the performance
> > was divine, but alas there were those that suffered for
> > trying to replace their many rules one at a time.
> > 
> > So now RCU must be vanquished from the scene, and better
> > chastity belts be placed upon this valuable asset most dear.
> > The locks that were but one are now replaced by one per suitor.
> > 
> > The repair was made after much discussion involving
> > Eric the wise, and Linus the foul. With flowers springing
> > up amid the thorns some peace has finally prevailed and
> > all is soothed. This patch and purple prose was penned by
> > in honor of "Talk like Shakespeare" day.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> 
> Philip Davis of the university’s School of English said :
> 
>   "Shakespeare surprises the brain and catches it off guard in
>   a manner that produces a sudden burst of activity - a sense 
>   of drama created out of the simplest of things."
> 
> http://www.physorg.com/news85664210.html
> 
> > 
> > ---
> > What hath changed over the last two setting suns:
> >   * more words, mostly correct...
> > 
> >   * no need to locketh for writeh on current cpu tis 
> >     always so
> > 
> >   * the locking of all cpu's on replace is always done as
> >     part of the get_counters cycle, so the sychronize swip
> >     in replace tables is gone with only a comment remaing
> > 
> >  include/linux/netfilter/x_tables.h |   55 ++++++++++++++--
> >  net/ipv4/netfilter/arp_tables.c    |  125 ++++++++++--------------------------
> >  net/ipv4/netfilter/ip_tables.c     |  126 ++++++++++---------------------------
> >  net/ipv6/netfilter/ip6_tables.c    |  123 ++++++++++--------------------------
> >  net/netfilter/x_tables.c           |   55 ++++++++--------
> >  5 files changed, 188 insertions(+), 296 deletions(-)
> > 
> 
> 
> >  
> >  static int __init xt_init(void)
> >  {
> > -	int i, rv;
> > +	unsigned int i;
> > +	int rv;
> > +	static struct lock_class_key xt_lock_key[NR_CPUS];
> 
> Could we avoid this [NR_CPUS] thing ?
> 
> > +
> > +	for_each_possible_cpu(i) {
> > +		rwlock_t *lock = &per_cpu(xt_info_locks, i);
> > +
> > +		rwlock_init(lock);
> > +		lockdep_set_class(lock, xt_lock_key+i);
> > +	}
> 
> 
> Did you tried :
> 
> static DECLARE_PER_CPU(struct lock_class_key, xt_locks_key);
> 

The lock keys are really only used by lock dep, and I thought per cpu
space was more scarce on some arch.


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH]  netfilter: use per-CPU recursive lock {XIV}
  2009-04-24 16:18                                                                                             ` Stephen Hemminger
  (?)
@ 2009-04-24 20:43                                                                                             ` Jarek Poplawski
  2009-04-25 20:30                                                                                               ` [PATCH] netfilter: iptables no lockdep is needed Stephen Hemminger
  -1 siblings, 1 reply; 254+ messages in thread
From: Jarek Poplawski @ 2009-04-24 20:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Linus Torvalds, Ingo Molnar, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers

Stephen Hemminger wrote, On 04/24/2009 06:18 PM:

> On Fri, 24 Apr 2009 06:58:39 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
>> Stephen Hemminger a écrit :
>>> In days of old in 2.6.29, netfilter did locketh using a 
>>> lock of the reader kind when doing its table business, and do
>>> a writer when with pen in hand like a overworked accountant
>>> did replace the tables. This sucketh and caused the single
>>> lock to fly back and forth like a poor errant boy.
>>>
>>> But then netfilter was blessed with RCU and the performance
>>> was divine, but alas there were those that suffered for
>>> trying to replace their many rules one at a time.
>>>
>>> So now RCU must be vanquished from the scene, and better
>>> chastity belts be placed upon this valuable asset most dear.
>>> The locks that were but one are now replaced by one per suitor.
>>>
>>> The repair was made after much discussion involving
>>> Eric the wise, and Linus the foul. With flowers springing
>>> up amid the thorns some peace has finally prevailed and
>>> all is soothed. This patch and purple prose was penned by
>>> in honor of "Talk like Shakespeare" day.
>>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>> Philip Davis of the university’s School of English said :
>>
>>   "Shakespeare surprises the brain and catches it off guard in
>>   a manner that produces a sudden burst of activity - a sense 
>>   of drama created out of the simplest of things."
>>
>> http://www.physorg.com/news85664210.html
>>
>>> ---
>>> What hath changed over the last two setting suns:
>>>   * more words, mostly correct...
>>>
>>>   * no need to locketh for writeh on current cpu tis 
>>>     always so
>>>
>>>   * the locking of all cpu's on replace is always done as
>>>     part of the get_counters cycle, so the sychronize swip
>>>     in replace tables is gone with only a comment remaing
>>>
>>>  include/linux/netfilter/x_tables.h |   55 ++++++++++++++--
>>>  net/ipv4/netfilter/arp_tables.c    |  125 ++++++++++--------------------------
>>>  net/ipv4/netfilter/ip_tables.c     |  126 ++++++++++---------------------------
>>>  net/ipv6/netfilter/ip6_tables.c    |  123 ++++++++++--------------------------
>>>  net/netfilter/x_tables.c           |   55 ++++++++--------
>>>  5 files changed, 188 insertions(+), 296 deletions(-)
>>>
>>
>>>  
>>>  static int __init xt_init(void)
>>>  {
>>> -	int i, rv;
>>> +	unsigned int i;
>>> +	int rv;
>>> +	static struct lock_class_key xt_lock_key[NR_CPUS];
>> Could we avoid this [NR_CPUS] thing ?
>>
>>> +
>>> +	for_each_possible_cpu(i) {
>>> +		rwlock_t *lock = &per_cpu(xt_info_locks, i);
>>> +
>>> +		rwlock_init(lock);
>>> +		lockdep_set_class(lock, xt_lock_key+i);
>>> +	}
>>
>> Did you tried :
>>
>> static DECLARE_PER_CPU(struct lock_class_key, xt_locks_key);
>>
> 
> The lock keys are really only used by lock dep, and I thought per cpu
> space was more scarce on some arch.
> 

Maybe I'm wrong but after this change: "- only acquire one cpu write
lock at a time" lockdep_set_class() might be unnecessary. Alas I'm
not able to test it.

Jarek P.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: iptables no lockdep is needed..
  2009-04-24 20:43                                                                                             ` Jarek Poplawski
@ 2009-04-25 20:30                                                                                               ` Stephen Hemminger
  2009-04-26  8:18                                                                                                 ` Jarek Poplawski
  2009-04-26 18:24                                                                                                 ` [PATCH] netfilter: use per-CPU recursive lock {XV} Eric Dumazet
  0 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-25 20:30 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Eric Dumazet, Linus Torvalds, Ingo Molnar, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers

Epilogue due to master Jarek. Lockdep carest not about the locking
doth bestowed. Therefore no keys are needed.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 net/netfilter/x_tables.c |    9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

--- a/net/netfilter/x_tables.c	2009-04-25 13:25:48.115026283 -0700
+++ b/net/netfilter/x_tables.c	2009-04-25 13:26:15.646215635 -0700
@@ -1145,14 +1145,9 @@ static int __init xt_init(void)
 {
 	unsigned int i;
 	int rv;
-	static struct lock_class_key xt_lock_key[NR_CPUS];
 
-	for_each_possible_cpu(i) {
-		rwlock_t *lock = &per_cpu(xt_info_locks, i);
-
-		rwlock_init(lock);
-		lockdep_set_class(lock, xt_lock_key+i);
-	}
+	for_each_possible_cpu(i)
+		rwlock_init(&per_cpu(xt_info_locks, i));
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: iptables no lockdep is needed..
  2009-04-25 20:30                                                                                               ` [PATCH] netfilter: iptables no lockdep is needed Stephen Hemminger
@ 2009-04-26  8:18                                                                                                 ` Jarek Poplawski
  2009-04-26 18:24                                                                                                 ` [PATCH] netfilter: use per-CPU recursive lock {XV} Eric Dumazet
  1 sibling, 0 replies; 254+ messages in thread
From: Jarek Poplawski @ 2009-04-26  8:18 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, Linus Torvalds, Ingo Molnar, Paul Mackerras,
	paulmck, Evgeniy Polyakov, David Miller, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh, mathieu.desnoyers

On Sat, Apr 25, 2009 at 01:30:52PM -0700, Stephen Hemminger wrote:
> Epilogue due to master Jarek. Lockdep carest not about the locking
> doth bestowed. Therefore no keys are needed.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Very nice! I guess this Shakespeare guy Will sign off this all too. ;-)

Thanks,
Jarek P.

> 
> ---
>  net/netfilter/x_tables.c |    9 ++-------
>  1 file changed, 2 insertions(+), 7 deletions(-)
> 
> --- a/net/netfilter/x_tables.c	2009-04-25 13:25:48.115026283 -0700
> +++ b/net/netfilter/x_tables.c	2009-04-25 13:26:15.646215635 -0700
> @@ -1145,14 +1145,9 @@ static int __init xt_init(void)
>  {
>  	unsigned int i;
>  	int rv;
> -	static struct lock_class_key xt_lock_key[NR_CPUS];
>  
> -	for_each_possible_cpu(i) {
> -		rwlock_t *lock = &per_cpu(xt_info_locks, i);
> -
> -		rwlock_init(lock);
> -		lockdep_set_class(lock, xt_lock_key+i);
> -	}
> +	for_each_possible_cpu(i)
> +		rwlock_init(&per_cpu(xt_info_locks, i));
>  
>  	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
>  	if (!xt)

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-25 20:30                                                                                               ` [PATCH] netfilter: iptables no lockdep is needed Stephen Hemminger
  2009-04-26  8:18                                                                                                 ` Jarek Poplawski
@ 2009-04-26 18:24                                                                                                 ` Eric Dumazet
  2009-04-26 18:56                                                                                                   ` Mathieu Desnoyers
  2009-04-26 19:31                                                                                                   ` [PATCH] netfilter: use per-CPU recursive " Mathieu Desnoyers
  1 sibling, 2 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-26 18:24 UTC (permalink / raw)
  To: Stephen Hemminger, David Miller
  Cc: Jarek Poplawski, Linus Torvalds, Ingo Molnar, Paul Mackerras,
	paulmck, Evgeniy Polyakov, kaber, jeff.chua.linux, laijs,
	jengelh, r000n, linux-kernel, netfilter-devel, netdev, benh,
	mathieu.desnoyers

From: Stephen Hemminger <shemminger@vyatta.com>

> Epilogue due to master Jarek. Lockdep carest not about the locking
> doth bestowed. Therefore no keys are needed.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

So far, so good, should be ready for inclusion now, nobody complained :)

I include the final patch, merge of your last two patches.

David, could you please review it once again and apply it if it's OK ?

Thanks to all for your help and patience

[PATCH] netfilter: use per-CPU recursive lock {XV}

In days of old in 2.6.29, netfilter did locketh using a 
lock of the reader kind when doing its table business, and do
a writer when with pen in hand like an overworked accountant
did replace the tables. This sucketh and caused the single
lock to fly back and forth like a poor errant boy.

But then netfilter was blessed with RCU and the performance
was divine, but alas there were those that suffered for
trying to replace their many rules one at a time.

So now RCU must be vanquished from the scene, and better
chastity belts be placed upon this valuable asset most dear.
The locks that were but one are now replaced by one per suitor.

The repair was made after much discussion involving
Eric the wise, and Linus the foul. With flowers springing
up amid the thorns some peace has finally prevailed and
all is soothed. This patch and purple prose were penned
in honor of "Talk like Shakespeare" day.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
What hath changed over the last two setting suns:

  * more words, mostly correct...

  * no need to locketh for writeh on current cpu tis 
    always so

  * the locking of all cpu's on replace is always done as
    part of the get_counters cycle, so the synchronize swip
    in replace tables is gone with only a comment remaining

  * Epilogue due to master Jarek. Lockdep carest not about
    the locking doth bestowed. Therefore no keys are needed.
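
As a rough sketch of how the new helpers are meant to be used
(the example_* names below are made up; the real rule walking and
error handling are left out):

#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/netfilter.h>
#include <linux/netfilter/x_tables.h>

/* Packet path: per-CPU read lock; may nest on the same CPU. */
static unsigned int example_do_table(struct xt_table *table)
{
        struct xt_table_info *private;
        void *table_base;

        xt_info_rdlock_bh();    /* local_bh_disable() + read_lock() */
        private = table->private;
        table_base = private->entries[smp_processor_id()];
        /* ... walk the rules at table_base, bump e->counters ... */
        xt_info_rdunlock_bh();

        return NF_ACCEPT;
}

/* Control path: stop one CPU at a time while summing its counters. */
static void example_sum_counters(const struct xt_table_info *t)
{
        unsigned int cpu, curcpu;

        local_bh_disable();
        curcpu = smp_processor_id();
        /* set the result from t->entries[curcpu] without a lock:
         * bottom halves are off, so no reader runs on this CPU
         */
        for_each_possible_cpu(cpu) {
                if (cpu == curcpu)
                        continue;
                xt_info_wrlock(cpu);
                /* ... add t->entries[cpu] counters into the result ... */
                xt_info_wrunlock(cpu);
        }
        local_bh_enable();
}

This is the same shape as the arpt/ipt/ip6t hunks and the
get_counters() change below.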

 include/linux/netfilter/x_tables.h |   55 ++++++++++-
 net/ipv4/netfilter/arp_tables.c    |  125 +++++++-------------------
 net/ipv4/netfilter/ip_tables.c     |  126 +++++++--------------------
 net/ipv6/netfilter/ip6_tables.c    |  123 +++++++-------------------
 net/netfilter/x_tables.c           |   50 +++++-----
 5 files changed, 183 insertions(+), 296 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7b1a652..511debb 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,56 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+/*
+ * Per-CPU read/write lock associated with per-cpu table entries.
+ * This is not a general solution but makes reader locking fast since
+ * there is no shared variable to cause cache ping-pong; but adds an
+ * additional write-side penalty since update must lock all
+ * possible CPU's.
+ *
+ * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while packet
+ * is being processed. In some cases, the read lock will be acquired
+ * twice on the same CPU; this is okay because read locks handle nesting.
+ *
+ * Write lock is used in two cases:
+ *    1. reading counter values
+ *       all readers need to be stopped and the per-CPU values are summed.
+ *
+ *    2. replacing tables
+ *       any readers that are using the old tables have to complete
+ *       before freeing the old table. This is handled by reading
+ *	  as a side effect of reading counters
+ */
+DECLARE_PER_CPU(rwlock_t, xt_info_locks);
+
+static inline void xt_info_rdlock_bh(void)
+{
+	/*
+	 * Note: can not use read_lock_bh(&__get_cpu_var(xt_info_locks))
+	 * because need to ensure that preemption is disable before
+	 * acquiring per-cpu-variable, so do it as a two step process
+	 */
+	local_bh_disable();
+	read_lock(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+	read_unlock_bh(&__get_cpu_var(xt_info_locks));
+}
+
+static inline void xt_info_wrlock(unsigned int cpu)
+{
+	write_lock(&per_cpu(xt_info_locks, cpu));
+}
+
+static inline void xt_info_wrunlock(unsigned int cpu)
+{
+
+	write_unlock(&per_cpu(xt_info_locks, cpu));
+}
 
 /*
  * This helper is performance critical and must be inlined
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d..831fe18 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
-
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b6..2ec8d72 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 * with data used by 'current' CPU.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 800ae85..219e165 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t,
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	xt_info_wrlock(curcpu);
+	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 509a956..5807a4d 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(rwlock_t, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	local_bh_disable();
 	private = table->private;
+
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		local_bh_enable();
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
 
-	synchronize_net();
-	return oldinfo;
+	table->private = newinfo;
+	newinfo->initial_entries = private->initial_entries;
+
+	/*
+	 * Even though table entries have now been swapped, other CPU's
+	 * may still be using the old entries. This is okay, because
+	 * resynchronization happens because of the locking done
+	 * during the get_counters() routine.
+	 */
+	local_bh_enable();
+
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1147,7 +1143,11 @@ static struct pernet_operations xt_net_ops = {
 
 static int __init xt_init(void)
 {
-	int i, rv;
+	unsigned int i;
+	int rv;
+
+	for_each_possible_cpu(i)
+		rwlock_init(&per_cpu(xt_info_locks, i));
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)

^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 18:24                                                                                                 ` [PATCH] netfilter: use per-CPU recursive lock {XV} Eric Dumazet
@ 2009-04-26 18:56                                                                                                   ` Mathieu Desnoyers
  2009-04-26 21:57                                                                                                     ` Stephen Hemminger
  2009-04-26 19:31                                                                                                   ` [PATCH] netfilter: use per-CPU recursive " Mathieu Desnoyers
  1 sibling, 1 reply; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-26 18:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Jarek Poplawski, Linus Torvalds,
	Ingo Molnar, Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

* Eric Dumazet (dada1@cosmosbay.com) wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> 
> > Epilogue due to master Jarek. Lockdep carest not about the locking
> > doth bestowed. Therefore no keys are needed.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> So far, so good, should be ready for inclusion now, nobody complained :)
> 
> I include the final patch, merge of your last two patches.
> 
> David, could you please review it once again and apply it if it's OK ?
> 
> Thanks to all for your help and patience
> 
> [PATCH] netfilter: use per-CPU recursive lock {XV}

Hi Eric,

Please... could you rename this patch according to Linus' comments ?

Suitable name would probably be :

[PATCH] netfilter: use bh disabling with per-cpu read-write lock

> 
> In days of old in 2.6.29, netfilter did locketh using a 
> lock of the reader kind when doing its table business, and do
> a writer when with pen in hand like an overworked accountant
> did replace the tables. This sucketh and caused the single
> lock to fly back and forth like a poor errant boy.
> 
> But then netfilter was blessed with RCU and the performance
> was divine, but alas there were those that suffered for
> trying to replace their many rules one at a time.
> 
> So now RCU must be vanquished from the scene, and better
> chastity belts be placed upon this valuable asset most dear.
> The locks that were but one are now replaced by one per suitor.
> 
> The repair was made after much discussion involving
> Eric the wise, and Linus the foul. With flowers springing
> up amid the thorns some peace has finally prevailed and
> all is soothed. This patch and purple prose were penned
> in honor of "Talk like Shakespeare" day.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
> What hath changed over the last two setting suns:
> 
>   * more words, mostly correct...
> 
>   * no need to locketh for writeh on current cpu tis 
>     always so
> 
>   * the locking of all cpu's on replace is always done as
>     part of the get_counters cycle, so the synchronize swip
>     in replace tables is gone with only a comment remaining
> 
>   * Epilogue due to master Jarek. Lockdep carest not about
>     the locking doth bestowed. Therefore no keys are needed.
> 
>  include/linux/netfilter/x_tables.h |   55 ++++++++++-
>  net/ipv4/netfilter/arp_tables.c    |  125 +++++++-------------------
>  net/ipv4/netfilter/ip_tables.c     |  126 +++++++--------------------
>  net/ipv6/netfilter/ip6_tables.c    |  123 +++++++-------------------
>  net/netfilter/x_tables.c           |   50 +++++-----
>  5 files changed, 183 insertions(+), 296 deletions(-)
> 
> diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
> index 7b1a652..511debb 100644
> --- a/include/linux/netfilter/x_tables.h
> +++ b/include/linux/netfilter/x_tables.h
> @@ -354,9 +354,6 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
>  
> -	/* Lock for the curtain */
> -	struct mutex lock;
> -
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
>  
> @@ -434,8 +431,56 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
>  
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
> -				    struct xt_table_info *new);
> +
> +/*
> + * Per-CPU read/write lock associated with per-cpu table entries.
> + * This is not a general solution but makes reader locking fast since
> + * there is no shared variable to cause cache ping-pong; but adds an
> + * additional write-side penalty since update must lock all
> + * possible CPU's.
> + *
> + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
> + * It needs to ensure that the rules are not being changed while packet
> + * is being processed. In some cases, the read lock will be acquired
> + * twice on the same CPU; this is okay because read locks handle nesting.
> + *
> + * Write lock is used in two cases:
> + *    1. reading counter values
> + *       all readers need to be stopped and the per-CPU values are summed.
> + *
> + *    2. replacing tables
> + *       any readers that are using the old tables have to complete
> + *       before freeing the old table. This is handled by reading
> + *	  as a side effect of reading counters

Stating that the write lock must _always_ be taken with bh disabled
might not hurt here.

> + */
> +DECLARE_PER_CPU(rwlock_t, xt_info_locks);
> +
> +static inline void xt_info_rdlock_bh(void)
> +{
> +	/*
> +	 * Note: can not use read_lock_bh(&__get_cpu_var(xt_info_locks))
> +	 * because need to ensure that preemption is disable before
> +	 * acquiring per-cpu-variable, so do it as a two step process
> +	 */
> +	local_bh_disable();
> +	read_lock(&__get_cpu_var(xt_info_locks));
> +}
> +
> +static inline void xt_info_rdunlock_bh(void)
> +{
> +	read_unlock_bh(&__get_cpu_var(xt_info_locks));
> +}
> +
> +static inline void xt_info_wrlock(unsigned int cpu)
> +{
> +	write_lock(&per_cpu(xt_info_locks, cpu));
> +}
> +
> +static inline void xt_info_wrunlock(unsigned int cpu)
> +{
> +
> +	write_unlock(&per_cpu(xt_info_locks, cpu));
> +}
>  
>  /*
>   * This helper is performance critical and must be inlined
> diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
> index 5ba533d..831fe18 100644
> --- a/net/ipv4/netfilter/arp_tables.c
> +++ b/net/ipv4/netfilter/arp_tables.c
> @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  	indev = in ? in->name : nulldevname;
>  	outdev = out ? out->name : nulldevname;
>  
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  	back = get_entry(table_base, private->underflow[hook]);
> @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  
>  			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
>  				(2 * skb->dev->addr_len);
> +

This is not a whitespace cleanup patch.

>  			ADD_COUNTER(e->counters, hdr_len, 1);
>  
>  			t = arpt_get_target(e);
> @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -

Whitespace cleanup ?

> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
>  
>  	if (hotdrop)
>  		return NF_DROP;
> @@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
> +	 *
> +	 * Bottom half has to be disabled to prevent deadlock
> +	 * if new softirq were to run and call ipt_do_table
>  	 */
> -	curcpu = raw_smp_processor_id();
> +	local_bh_disable();
> +	curcpu = smp_processor_id();
>  
>  	i = 0;
>  	ARPT_ENTRY_ITERATE(t->entries[curcpu],
> @@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		xt_info_wrlock(cpu);
>  		ARPT_ENTRY_ITERATE(t->entries[cpu],
>  				   t->size,
>  				   add_entry_to_counter,
>  				   counters,
>  				   &i);
> +		xt_info_wrunlock(cpu);
>  	}
> -}
> -
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct arpt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	ARPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
>  	local_bh_enable();
>  }

Did you really need to move add_counter_to_entry and put_counters in
this patch ? This also seems more like a cleanup to me, if it is even
one. It does make the patch harder to follow though.

>  
> -static inline int
> -zero_entry_counter(struct arpt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters *alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	 * (other than comefrom, which userspace doesn't care
> @@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> -
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> +		return ERR_PTR(-ENOMEM);
>  
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int copy_entries_to_user(unsigned int total_size,
> @@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
>  	    (newinfo->number <= oldinfo->initial_entries))
>  		module_put(t->me);
>  
> -	/* Get the old counters. */
> +	/* Get the old counters, and synchronize with replace */
>  	get_counters(oldinfo, counters);
> +
>  	/* Decrease module usage counts and free resource */
>  	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
>  	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
> @@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct arpt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  			   int compat)
>  {
> -	unsigned int i;
> +	unsigned int i, curcpu;
>  	struct xt_counters_info tmp;
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
> @@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
> +	local_bh_disable();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[smp_processor_id()];
> +	curcpu = smp_processor_id();
> +	loc_cpu_entry = private->entries[curcpu];
> +	xt_info_wrlock(curcpu);
>  	ARPT_ENTRY_ITERATE(loc_cpu_entry,
>  			   private->size,
>  			   add_counter_to_entry,
>  			   paddc,
>  			   &i);
> -	preempt_enable();
> +	xt_info_wrunlock(curcpu);
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> -
> +	local_bh_enable();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
> index 810c0b6..2ec8d72 100644
> --- a/net/ipv4/netfilter/ip_tables.c
> +++ b/net/ipv4/netfilter/ip_tables.c
> @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
>  	tgpar.hooknum = hook;
>  
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
>  
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
> -	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
> +	 * with data used by 'current' CPU.
> +	 *
> +	 * Bottom half has to be disabled to prevent deadlock
> +	 * if new softirq were to run and call ipt_do_table
>  	 */
> -	curcpu = raw_smp_processor_id();
> +	local_bh_disable();
> +	curcpu = smp_processor_id();
>  
>  	i = 0;
>  	IPT_ENTRY_ITERATE(t->entries[curcpu],
> @@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		xt_info_wrlock(cpu);
>  		IPT_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> +		xt_info_wrunlock(cpu);
>  	}
> -
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[cpu],
> -			  t->size,
> -			  add_counter_to_entry,
> -			  counters,
> -			  &i);
>  	local_bh_enable();
>  }
>  
> -
> -static inline int
> -zero_entry_counter(struct ipt_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				  zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters * alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> +		return ERR_PTR(-ENOMEM);
>  
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
>  	    (newinfo->number <= oldinfo->initial_entries))
>  		module_put(t->me);
>  
> -	/* Get the old counters. */
> +	/* Get the old counters, and synchronize with replace */
>  	get_counters(oldinfo, counters);
> +
>  	/* Decrease module usage counts and free resource */
>  	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
>  	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
> @@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
>  
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
>  {
> -	unsigned int i;
> +	unsigned int i, curcpu;
>  	struct xt_counters_info tmp;
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
> @@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
> +	local_bh_disable();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	curcpu = smp_processor_id();
> +	loc_cpu_entry = private->entries[curcpu];
> +	xt_info_wrlock(curcpu);
>  	IPT_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	xt_info_wrunlock(curcpu);
>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> +	local_bh_enable();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
> index 800ae85..219e165 100644
> --- a/net/ipv6/netfilter/ip6_tables.c
> +++ b/net/ipv6/netfilter/ip6_tables.c
> @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
>  
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>  
> -	rcu_read_lock_bh();
> -	private = rcu_dereference(table->private);
> -	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +	xt_info_rdlock_bh();
> +	private = table->private;
> +	table_base = private->entries[smp_processor_id()];
>  
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
> @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
>  #ifdef CONFIG_NETFILTER_DEBUG
>  	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
>  #endif
> -	rcu_read_unlock_bh();
> +	xt_info_rdunlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t,
>  	/* Instead of clearing (by a previous call to memset())
>  	 * the counters and using adds, we set the counters
>  	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
> +	 *
> +	 * Bottom half has to be disabled to prevent deadlock
> +	 * if new softirq were to run and call ipt_do_table
>  	 */
> -	curcpu = raw_smp_processor_id();
> +	local_bh_disable();
> +	curcpu = smp_processor_id();
>  
>  	i = 0;
>  	IP6T_ENTRY_ITERATE(t->entries[curcpu],
> @@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t,
>  		if (cpu == curcpu)
>  			continue;
>  		i = 0;
> +		xt_info_wrlock(cpu);
>  		IP6T_ENTRY_ITERATE(t->entries[cpu],
>  				  t->size,
>  				  add_entry_to_counter,
>  				  counters,
>  				  &i);
> +		xt_info_wrunlock(cpu);
>  	}
> -}
> -
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ip6t_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
> -
> -/* Take values from counters and add them back onto the current cpu */
> -static void put_counters(struct xt_table_info *t,
> -			 const struct xt_counters counters[])
> -{
> -	unsigned int i, cpu;
> -
> -	local_bh_disable();
> -	cpu = smp_processor_id();
> -	i = 0;
> -	IP6T_ENTRY_ITERATE(t->entries[cpu],
> -			   t->size,
> -			   add_counter_to_entry,
> -			   counters,
> -			   &i);
>  	local_bh_enable();
>  }
>  
> -static inline int
> -zero_entry_counter(struct ip6t_entry *e, void *arg)
> -{
> -	e->counters.bcnt = 0;
> -	e->counters.pcnt = 0;
> -	return 0;
> -}
> -
> -static void
> -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
> -{
> -	unsigned int cpu;
> -	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
> -
> -	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
> -	for_each_possible_cpu(cpu) {
> -		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
> -		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
> -				   zero_entry_counter, NULL);
> -	}
> -}
> -
>  static struct xt_counters *alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
>  	struct xt_table_info *private = table->private;
> -	struct xt_table_info *info;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		goto nomem;
> +		return ERR_PTR(-ENOMEM);
>  
> -	info = xt_alloc_table_info(private->size);
> -	if (!info)
> -		goto free_counters;
> -
> -	clone_counters(info, private);
> -
> -	mutex_lock(&table->lock);
> -	xt_table_entry_swap_rcu(private, info);
> -	synchronize_net();	/* Wait until smoke has cleared */
> -
> -	get_counters(info, counters);
> -	put_counters(private, counters);
> -	mutex_unlock(&table->lock);
> -
> -	xt_free_table_info(info);
> +	get_counters(private, counters);
>  
>  	return counters;
> -
> - free_counters:
> -	vfree(counters);
> - nomem:
> -	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
>  	    (newinfo->number <= oldinfo->initial_entries))
>  		module_put(t->me);
>  
> -	/* Get the old counters. */
> +	/* Get the old counters, and synchronize with replace */
>  	get_counters(oldinfo, counters);
> +
>  	/* Decrease module usage counts and free resource */
>  	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
>  	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
> @@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len)
>  	return ret;
>  }
>  
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ip6t_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		int compat)
>  {
> -	unsigned int i;
> +	unsigned int i, curcpu;
>  	struct xt_counters_info tmp;
>  	struct xt_counters *paddc;
>  	unsigned int num_counters;
> @@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
>  		goto free;
>  	}
>  
> -	mutex_lock(&t->lock);
> +

Incorrect whiteline added.

> +	local_bh_disable();
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> -	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
> -	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> +	curcpu = smp_processor_id();
> +	xt_info_wrlock(curcpu);
> +	loc_cpu_entry = private->entries[curcpu];
>  	IP6T_ENTRY_ITERATE(loc_cpu_entry,
>  			  private->size,
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> -	preempt_enable();
> +	xt_info_wrunlock(curcpu);
> +

Inconsistent whiteline.

>   unlock_up_free:
> -	mutex_unlock(&t->lock);
> +	local_bh_enable();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
> index 509a956..5807a4d 100644
> --- a/net/netfilter/x_tables.c
> +++ b/net/netfilter/x_tables.c
> @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
>  }
>  EXPORT_SYMBOL(xt_free_table_info);
>  
> -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
> -			     struct xt_table_info *newinfo)
> -{
> -	unsigned int cpu;
> -
> -	for_each_possible_cpu(cpu) {
> -		void *p = oldinfo->entries[cpu];
> -		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
> -		newinfo->entries[cpu] = p;
> -	}
> -
> -}
> -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
> -
>  /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
>  struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
>  				    const char *name)
> @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
>  EXPORT_SYMBOL_GPL(xt_compat_unlock);
>  #endif
>  
> +DEFINE_PER_CPU(rwlock_t, xt_info_locks);
> +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
> +
> +
>  struct xt_table_info *
>  xt_replace_table(struct xt_table *table,
>  	      unsigned int num_counters,
>  	      struct xt_table_info *newinfo,
>  	      int *error)
>  {
> -	struct xt_table_info *oldinfo, *private;
> +	struct xt_table_info *private;
>  
>  	/* Do the substitution. */
> -	mutex_lock(&table->lock);
> +	local_bh_disable();
>  	private = table->private;
> +
>  	/* Check inside lock: is the old number correct? */
>  	if (num_counters != private->number) {
>  		duprintf("num_counters != table->private->number (%u/%u)\n",
>  			 num_counters, private->number);
> -		mutex_unlock(&table->lock);
> +		local_bh_enable();
>  		*error = -EAGAIN;
>  		return NULL;
>  	}
> -	oldinfo = private;
> -	rcu_assign_pointer(table->private, newinfo);
> -	newinfo->initial_entries = oldinfo->initial_entries;
> -	mutex_unlock(&table->lock);
>  

Whiteline.....

Mathieu

> -	synchronize_net();
> -	return oldinfo;
> +	table->private = newinfo;
> +	newinfo->initial_entries = private->initial_entries;
> +
> +	/*
> +	 * Even though table entries have now been swapped, other CPU's
> +	 * may still be using the old entries. This is okay, because
> +	 * resynchronization happens because of the locking done
> +	 * during the get_counters() routine.
> +	 */
> +	local_bh_enable();
> +
> +	return private;
>  }
>  EXPORT_SYMBOL_GPL(xt_replace_table);
>  
> @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
>  
>  	/* Simplifies replace_table code. */
>  	table->private = bootstrap;
> -	mutex_init(&table->lock);
>  
>  	if (!xt_replace_table(table, 0, newinfo, &ret))
>  		goto unlock;
> @@ -1147,7 +1143,11 @@ static struct pernet_operations xt_net_ops = {
>  
>  static int __init xt_init(void)
>  {
> -	int i, rv;
> +	unsigned int i;
> +	int rv;
> +
> +	for_each_possible_cpu(i)
> +		rwlock_init(&per_cpu(xt_info_locks, i));
>  
>  	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
>  	if (!xt)

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 18:24                                                                                                 ` [PATCH] netfilter: use per-CPU recursive lock {XV} Eric Dumazet
  2009-04-26 18:56                                                                                                   ` Mathieu Desnoyers
@ 2009-04-26 19:31                                                                                                   ` Mathieu Desnoyers
  2009-04-26 20:55                                                                                                       ` Eric Dumazet
  1 sibling, 1 reply; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-26 19:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Jarek Poplawski, Linus Torvalds,
	Ingo Molnar, Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

* Eric Dumazet (dada1@cosmosbay.com) wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> 
> > Epilogue due to master Jarek. Lockdep carest not about the locking
> > doth bestowed. Therefore no keys are needed.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> So far, so good, should be ready for inclusion now, nobody complained :)
> 
> I include the final patch, merge of your last two patches.
> 
> David, could you please review it once again and apply it if it's OK ?
> 
[...]
> +/*
> + * Per-CPU read/write lock associated with per-cpu table entries.
> + * This is not a general solution but makes reader locking fast since
> + * there is no shared variable to cause cache ping-pong; but adds an
> + * additional write-side penalty since update must lock all
> + * possible CPU's.
> + *
> + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
> + * It needs to ensure that the rules are not being changed while packet
> + * is being processed. In some cases, the read lock will be acquired
> + * twice on the same CPU; this is okay because read locks handle nesting.
> + *
> + * Write lock is used in two cases:
> + *    1. reading counter values
> + *       all readers need to be stopped and the per-CPU values are summed.
> + *
> + *    2. replacing tables
> + *       any readers that are using the old tables have to complete
> + *       before freeing the old table. This is handled by reading
> + *	  as a side effect of reading counters
> + */
> +DECLARE_PER_CPU(rwlock_t, xt_info_locks);
> +
> +static inline void xt_info_rdlock_bh(void)
> +{
> +	/*
> +	 * Note: can not use read_lock_bh(&__get_cpu_var(xt_info_locks))
> +	 * because need to ensure that preemption is disable before
> +	 * acquiring per-cpu-variable, so do it as a two step process
> +	 */
> +	local_bh_disable();

Why do you need to disable bottom halves on the read-side? You could
probably just disable preemption, given this lock is nestable on the
read-side anyway. Or am I missing something obvious?

> +	read_lock(&__get_cpu_var(xt_info_locks));
> +}
> +
> +static inline void xt_info_rdunlock_bh(void)
> +{
> +	read_unlock_bh(&__get_cpu_var(xt_info_locks));
> +}
> +
> +static inline void xt_info_wrlock(unsigned int cpu)
> +{
> +	write_lock(&per_cpu(xt_info_locks, cpu));
> +}
> +
> +static inline void xt_info_wrunlock(unsigned int cpu)
> +{
> +
> +	write_unlock(&per_cpu(xt_info_locks, cpu));
> +}
>  

[...]

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 19:31                                                                                                   ` [PATCH] netfilter: use per-CPU recursive " Mathieu Desnoyers
@ 2009-04-26 20:55                                                                                                       ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-26 20:55 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Stephen Hemminger, David Miller, Jarek Poplawski, Linus Torvalds,
	Ingo Molnar, Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

Mathieu Desnoyers wrote:
> * Eric Dumazet (dada1@cosmosbay.com) wrote:
>> From: Stephen Hemminger <shemminger@vyatta.com>
>>
>>> Epilogue due to master Jarek. Lockdep carest not about the locking
>>> doth bestowed. Therefore no keys are needed.
>>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>> So far, so good, should be ready for inclusion now, nobody complained :)
>>
>> I include the final patch, merge of your last two patches.
>>
>> David, could you please review it once again and apply it if it's OK ?
>>
> [...]
>> +/*
>> + * Per-CPU read/write lock associated with per-cpu table entries.
>> + * This is not a general solution but makes reader locking fast since
>> + * there is no shared variable to cause cache ping-pong; but adds an
>> + * additional write-side penalty since update must lock all
>> + * possible CPU's.
>> + *
>> + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
>> + * It needs to ensure that the rules are not being changed while packet
>> + * is being processed. In some cases, the read lock will be acquired
>> + * twice on the same CPU; this is okay because read locks handle nesting.
>> + *
>> + * Write lock is used in two cases:
>> + *    1. reading counter values
>> + *       all readers need to be stopped and the per-CPU values are summed.
>> + *
>> + *    2. replacing tables
>> + *       any readers that are using the old tables have to complete
>> + *       before freeing the old table. This is handled by reading
>> + *	  as a side effect of reading counters
>> + */
>> +DECLARE_PER_CPU(rwlock_t, xt_info_locks);
>> +
>> +static inline void xt_info_rdlock_bh(void)
>> +{
>> +	/*
>> +	 * Note: can not use read_lock_bh(&__get_cpu_var(xt_info_locks))
>> +	 * because need to ensure that preemption is disable before
>> +	 * acquiring per-cpu-variable, so do it as a two step process
>> +	 */
>> +	local_bh_disable();
> 
> Why do you need to disable bottom halves on the read-side ? You could
> probably just disable preemption, given this lock is nestable on the
> read-side anyway. Or I'm missing something obvious ?

It may not be obvious, but this subject was already raised on this list, so I'll
try to be as precise as possible. (I may be wrong on some points; I'll
let Patrick correct me if necessary.)

ipt_do_table() is not a readonly function returning a verdict.

1) It handles a stack (check how next->comefrom is used) that seems to
be stored on the rules themselves. (This is how I understand this code.)
This is safe as each cpu has its own copy of rules/counters, and it is BH protected.

2) It also updates two 64-bit counters (bytes/packets) on each matched rule.

3) Some netfilter matches/targets probably rely on the fact that their handlers
are run with BH disabled by their caller (ipt_do_table()/arp/ip6...)

These must be BH protected (and preempt disabled too), or else:

1) A softirq could interrupt a process in the middle of ipt_do_table()
and corrupt its "stack".

2) A softirq could interrupt a process in ipt_do_table() in the middle
 of the ADD_COUNTER(). Some counters could be corrupted.

3) Some netfilter extensions would break.

Previous Linux versions already used a read_lock_bh() here, on a single
shared rwlock; there is nothing new about this BH locking AFAIK.
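
To make point 2 concrete, here is a minimal kernel-style sketch of the
hazard (illustrative only, not taken from the patch; the add_counter()
helper name is made up, it just expands the ADD_COUNTER() macro quoted
earlier in this thread):

/* Illustrative only: why the counter update needs BH disabled.
 * On a 32-bit machine each 64-bit addition compiles to more than one
 * store, and bcnt/pcnt are two separate fields anyway.  If a softirq
 * runs ipt_do_table() on this CPU in the middle, this rule's per-cpu
 * counters can be left corrupted.
 */
static inline void add_counter(struct xt_counters *c, u64 bytes, u64 pkts)
{
	c->bcnt += bytes;	/* may be torn by a softirq on 32-bit */
	c->pcnt += pkts;	/* a softirq may also run between the two updates */
}

/* Hence the read side does
 *	local_bh_disable();
 *	read_lock(&__get_cpu_var(xt_info_locks));
 * so that no softirq can enter ipt_do_table() on this CPU meanwhile.
 */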

Thank you


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 20:55                                                                                                       ` Eric Dumazet
  (?)
@ 2009-04-26 21:39                                                                                                       ` Mathieu Desnoyers
  -1 siblings, 0 replies; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-26 21:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Jarek Poplawski, Linus Torvalds,
	Ingo Molnar, Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

* Eric Dumazet (dada1@cosmosbay.com) wrote:
> Mathieu Desnoyers a écrit :
> > * Eric Dumazet (dada1@cosmosbay.com) wrote:
> >> From: Stephen Hemminger <shemminger@vyatta.com>
> >>
> >>> Epilogue due to master Jarek. Lockdep carest not about the locking
> >>> doth bestowed. Therefore no keys are needed.
> >>>
> >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >> So far, so good, should be ready for inclusion now, nobody complained :)
> >>
> >> I include the final patch, merge of your last two patches.
> >>
> >> David, could you please review it once again and apply it if it's OK ?
> >>
> > [...]
> >> +/*
> >> + * Per-CPU read/write lock associated with per-cpu table entries.
> >> + * This is not a general solution but makes reader locking fast since
> >> + * there is no shared variable to cause cache ping-pong; but adds an
> >> + * additional write-side penalty since update must lock all
> >> + * possible CPU's.
> >> + *
> >> + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu.
> >> + * It needs to ensure that the rules are not being changed while packet
> >> + * is being processed. In some cases, the read lock will be acquired
> >> + * twice on the same CPU; this is okay because read locks handle nesting.
> >> + *
> >> + * Write lock is used in two cases:
> >> + *    1. reading counter values
> >> + *       all readers need to be stopped and the per-CPU values are summed.
> >> + *
> >> + *    2. replacing tables
> >> + *       any readers that are using the old tables have to complete
> >> + *       before freeing the old table. This is handled by reading
> >> + *	  as a side effect of reading counters
> >> + */
> >> +DECLARE_PER_CPU(rwlock_t, xt_info_locks);
> >> +
> >> +static inline void xt_info_rdlock_bh(void)
> >> +{
> >> +	/*
> >> +	 * Note: can not use read_lock_bh(&__get_cpu_var(xt_info_locks))
> >> +	 * because need to ensure that preemption is disable before
> >> +	 * acquiring per-cpu-variable, so do it as a two step process
> >> +	 */
> >> +	local_bh_disable();
> > 
> > Why do you need to disable bottom halves on the read-side ? You could
> > probably just disable preemption, given this lock is nestable on the
> > read-side anyway. Or I'm missing something obvious ?
> 
> It may not be obvious, but subject already raised on this list, so I'll
> try to be as precise as possible (But may be wrong on some points, I'll
> let Patrick correct me if necessary)
> 
> ipt_do_table() is not a readonly function returning a verdict.
> 
> 1) It handles a stack (check how is used next->comefrom) that seems to
> be stored on rules themselves. (This is how I understand this code)
> This is safe as each cpu has its own copy of rules/counters, and BH protected.
> 
> 2) It also updates two 64 bit counters (bytes/packets) on each matched rule.
> 
> 3) Some netfilter matches/targets probably rely on the fact their handlers
> are run with BH disabled by their caller (ipt_do_table()/arp/ip6...)
> 
> These must be BH protected (and preempt disabled too), or else :
> 
> 1) A softirq could interrupt a process in the middle of ipt_do_table()
> and corrupt its "stack".
> 
> 2) A softirq could interrupt a process in ipt_do_table() in the middle
>  of the ADD_COUNTER(). Some counters could be corrupted.
> 
> 3) Some netfiler extensions would break.
> 
> Previous linux versions already used a read_lock_bh() here, on a single
> and shared rwlock, there is nothing new on this BH locking AFAIK.
> 
> Thank you

Thanks for the explanation. It might help to document the role of BH
disabling for the reader in a supplementary code comment; otherwise one
might think it was put there to match the bottom-half disabling used
on the write-side, which has the additional role of making sure BH
will not deadlock (and that precise behavior is not usually needed on
the read-side).

One more point:

 *    1. reading counter values
 *       all readers need to be stopped and the per-CPU values are summed.

Maybe it's just me, but this sentence does not seem to clearly indicate
that we have:

for_each_cpu()
  write lock()
  read data
  write unlock()

One might interpret it as:

for_each_cpu()
  write lock()

read data

for_each_cpu()
  write unlock()

Or maybe it's just my understanding of English that's not perfect.
Anyhow, rewording this sentence might not hurt. Something along the
lines of:

"reading counter values
 all readers are iteratively stopped to have their per-CPU values
 summed"

This is an important difference, as this behaves more like an RCU-based
mechanism than a global per-cpu read/write lock where all the write
locks would be taken at once.
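
To make the iteration explicit, here is a simplified sketch of the
pattern being described, loosely following the get_counters() path of
the patch under discussion (the sum_entries() helper is hypothetical;
the real code uses the entry-iterate macros shown earlier in the thread):

/* Counters are harvested one CPU at a time: only that CPU's write lock
 * is held while its private copy is summed, so packet processing on the
 * other CPUs is never stopped, and the per-cpu write locks are never all
 * held at once.
 */
static void sum_counters(const struct xt_table_info *t,
			 struct xt_counters *counters)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		xt_info_wrlock(cpu);		/* stop readers on this cpu only */
		sum_entries(t->entries[cpu], t->size, counters);
		xt_info_wrunlock(cpu);		/* readers on this cpu resume */
	}
}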

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 18:56                                                                                                   ` Mathieu Desnoyers
@ 2009-04-26 21:57                                                                                                     ` Stephen Hemminger
  2009-04-26 22:32                                                                                                       ` Mathieu Desnoyers
  2009-04-27 17:44                                                                                                       ` Peter Zijlstra
  0 siblings, 2 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-26 21:57 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Eric Dumazet, David Miller, Jarek Poplawski, Linus Torvalds,
	Ingo Molnar, Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh

On Sun, 26 Apr 2009 14:56:46 -0400
Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:

> * Eric Dumazet (dada1@cosmosbay.com) wrote:
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > > Epilogue due to master Jarek. Lockdep carest not about the locking
> > > doth bestowed. Therefore no keys are needed.
> > > 
> > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > So far, so good, should be ready for inclusion now, nobody complained :)
> > 
> > I include the final patch, merge of your last two patches.
> > 
> > David, could you please review it once again and apply it if it's OK ?
> 
> > Thanks to all for your help and patience
> > 
> > [PATCH] netfilter: use per-CPU recursive lock {XV}
> 
> Hi Eric, 
> 
> Suitable name would probably be :
> 

But Linus is trying to delude himself.

This usage is recursive even if he doesn't like the terminology.
The same CPU has to be able to reacquire the read lock without deadlocking.
If reader/writer locks were implemented in a pure writer-gets-priority
method, then this code would break!  So yes, read locks can be used recursively
now in Linux, but if they were implemented differently then this code
would break.  For example, the -rt kernel turns all read/write locks into
mutexes, so the -rt kernel developers will have to address this.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 21:57                                                                                                     ` Stephen Hemminger
@ 2009-04-26 22:32                                                                                                       ` Mathieu Desnoyers
  2009-04-27 17:44                                                                                                       ` Peter Zijlstra
  1 sibling, 0 replies; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-26 22:32 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, David Miller, Jarek Poplawski, Linus Torvalds,
	Ingo Molnar, Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, jengelh, r000n, linux-kernel,
	netfilter-devel, netdev, benh, Thomas Gleixner, Steven Rostedt

* Stephen Hemminger (shemminger@vyatta.com) wrote:
> On Sun, 26 Apr 2009 14:56:46 -0400
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> > * Eric Dumazet (dada1@cosmosbay.com) wrote:
> > > From: Stephen Hemminger <shemminger@vyatta.com>
> > > 
> > > > Epilogue due to master Jarek. Lockdep carest not about the locking
> > > > doth bestowed. Therefore no keys are needed.
> > > > 
> > > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > > 
> > > So far, so good, should be ready for inclusion now, nobody complained :)
> > > 
> > > I include the final patch, merge of your last two patches.
> > > 
> > > David, could you please review it once again and apply it if it's OK ?
> > 
> > > Thanks to all for your help and patience
> > > 
> > > [PATCH] netfilter: use per-CPU recursive lock {XV}
> > 
> > Hi Eric, 
> > 
> > Suitable name would probably be :
> > 

Hi Stephen,

[I see that you have cut my name proposal from the original email,
which might make it difficult for others to follow. I will assume you
did it by mistake.]

(re-added)
[PATCH] netfilter: use bh disabling with per-cpu read-write lock

> 
> But Linus is trying to delude himself.
> 
> This usage is recursive even if he doesn't like the terminology.
> The same CPU has to be able to reacquire the read lock without deadlocking.
> If reader/writer locks were implemented in a pure writer gets priority
> method, then this code would break!  So yes read locks can be used recursively
> now in Linux, but if the were implemented differently then this code
> would break.  For example, the -rt kernel turns all read/write locks into
> mutexs, so the -rt kernel developers will have to address this.

Reading Documentation/spinlocks.txt, which states the lock usage
guidelines:

"Note that you can be clever with read-write locks and interrupts. For
example, if you know that the interrupt only ever gets a read-lock, then
you can use a non-irq version of read locks everywhere - because they
don't block on each other (and thus there is no dead-lock wrt interrupts.
But when you do the write-lock, you have to use the irq-safe version."

So it's assumed in the kernel-wide read lock usage that nested read
locks are OK. I'm adding Thomas and Steven in CC, but I'm quite sure
they must have dealt with nested read-lock transformation into mutexes
by detecting nesting in some way in -rt. But I'll let them confirm this.

So I don't see why you are dreaming about a different semantic than
that of the primitives you are using. I guess I'll leave the semantics to
you. I just find it astonishing that you persist in saying that everybody is
wrong on a topic like semantics, which is in the end a means to
communicate ideas clearly within the overall community you disagree
with.

Good luck !

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU recursive lock {XV}
  2009-04-26 21:57                                                                                                     ` Stephen Hemminger
  2009-04-26 22:32                                                                                                       ` Mathieu Desnoyers
@ 2009-04-27 17:44                                                                                                       ` Peter Zijlstra
  2009-04-27 18:30                                                                                                         ` [PATCH] netfilter: use per-CPU r**ursive " Stephen Hemminger
  1 sibling, 1 reply; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-27 17:44 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Linus Torvalds, Ingo Molnar, Paul Mackerras, paulmck,
	Evgeniy Polyakov, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Sun, 2009-04-26 at 14:57 -0700, Stephen Hemminger wrote:
> On Sun, 26 Apr 2009 14:56:46 -0400
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> 
> > * Eric Dumazet (dada1@cosmosbay.com) wrote:
> > > From: Stephen Hemminger <shemminger@vyatta.com>
> > > 
> > > > Epilogue due to master Jarek. Lockdep carest not about the locking
> > > > doth bestowed. Therefore no keys are needed.
> > > > 
> > > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > > 
> > > So far, so good, should be ready for inclusion now, nobody complained :)
> > > 
> > > I include the final patch, merge of your last two patches.
> > > 
> > > David, could you please review it once again and apply it if it's OK ?
> > 
> > > Thanks to all for your help and patience
> > > 
> > > [PATCH] netfilter: use per-CPU recursive lock {XV}
> > 
> > Hi Eric, 
> > 
> > Suitable name would probably be :
> > 
> 
> But Linus is trying to delude himself.
> 
> This usage is recursive even if he doesn't like the terminology.
> The same CPU has to be able to reacquire the read lock without deadlocking.
> If reader/writer locks were implemented in a pure writer gets priority
> method, then this code would break!  So yes read locks can be used recursively
> now in Linux, but if the were implemented differently then this code
> would break.  For example, the -rt kernel turns all read/write locks into
> mutexs, so the -rt kernel developers will have to address this.

A recursive lock has the property:

lock()
{
  if (lock->owner == current) {
    lock->depth++;
    return;
  }

  /* regular lock stuff */
}

unlock()
{
  if (!--lock->depth)
    /* regular unlock */
}

none of the Linux kernel locking primitives have this -- with the
possible exception of the cpu-hotplug lock.

What rwlock_t has is reader bias to the point where you can utterly
starve writers, with the side effect that you can obtain multiple read
ownerships without causing a deadlock.

This is not what is called a recursive lock. A recursive lock would have
each owner only once; this rwlock_t thing is simply so unfair that it
can have unlimited owners, including multiple copies of the same one.

rwsem has FIFO fairness, and therefore can deadlock in this scenario:
suppose thread A does a read, thread B tries a write and blocks, then
thread A recurses and tries to obtain another read ownership --
deadlock, as the FIFO fairness will demand that the second read ownership
wait on the pending writer, which will wait on the outstanding read
owner.

Now if rwsem were a FIFO-fair recursive lock, the above would not
deadlock, since it would detect that the task already had (read)
ownership and simply increment the depth, instead of trying to acquire a
second ownership.

This is all common and well-understood terminology, not something Linus
invented just to harass you with.

Generally speaking we do not condone recursive locking strategies -- and
afaik reiserfs (as per the BKL) and the network code (as per abusing
rwlock_t unfairness) are the only offenders.

Like Linus stated, recursive locking is generally poor taste and
indicates you basically gave up on trying to find a proper locking
scheme. We should very much work towards getting rid of these
aberrations instead of adding new ones.

Linus is very much right in what he said, and you calling him delusional
only highlights your ignorance on the issue.

[ PS. -rt implements rwlock_t as a proper recursive lock (either a mutex
  or a full multi-owner reader-writer lock with PI fairness) so if
  anybody abuses rwlock_t unfairness in a way that is not strict owner
  recursive we have a problem. ]
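
To make the owner/depth pattern above concrete, here is a minimal
userspace sketch on top of a plain pthread mutex (illustrative only;
error handling and the memory-ordering care a production version would
need are omitted):

#include <pthread.h>

struct rec_lock {
	pthread_mutex_t mutex;	/* the underlying plain lock */
	pthread_t owner;	/* meaningful only while depth > 0 */
	int depth;		/* how many times the owner holds it */
};

static void rec_lock_acquire(struct rec_lock *l)
{
	if (l->depth > 0 && pthread_equal(l->owner, pthread_self())) {
		l->depth++;		/* same owner: just count */
		return;
	}
	pthread_mutex_lock(&l->mutex);	/* regular lock path */
	l->owner = pthread_self();
	l->depth = 1;
}

static void rec_lock_release(struct rec_lock *l)
{
	if (--l->depth == 0)
		pthread_mutex_unlock(&l->mutex);	/* regular unlock */
}

Note that only the owning thread may re-enter; any other thread still
blocks in pthread_mutex_lock(), which is exactly what distinguishes this
from the rwlock_t read side.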

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 17:44                                                                                                       ` Peter Zijlstra
@ 2009-04-27 18:30                                                                                                         ` Stephen Hemminger
  2009-04-27 18:54                                                                                                           ` Ingo Molnar
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-27 18:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Linus Torvalds, Ingo Molnar, Paul Mackerras, paulmck,
	Evgeniy Polyakov, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Mon, 27 Apr 2009 19:44:57 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Sun, 2009-04-26 at 14:57 -0700, Stephen Hemminger wrote:
> > On Sun, 26 Apr 2009 14:56:46 -0400
> > Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> wrote:
> > 
> > > * Eric Dumazet (dada1@cosmosbay.com) wrote:
> > > > From: Stephen Hemminger <shemminger@vyatta.com>
> > > > 
> > > > > Epilogue due to master Jarek. Lockdep carest not about the locking
> > > > > doth bestowed. Therefore no keys are needed.
> > > > > 
> > > > > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > > > 
> > > > So far, so good, should be ready for inclusion now, nobody complained :)
> > > > 
> > > > I include the final patch, merge of your last two patches.
> > > > 
> > > > David, could you please review it once again and apply it if it's OK ?
> > > 
> > > > Thanks to all for your help and patience
> > > > 
> > > > [PATCH] netfilter: use per-CPU recursive lock {XV}
> > > 
> > > Hi Eric, 
> > > 
> > > Suitable name would probably be :
> > > 
> > 
> > But Linus is trying to delude himself.
> > 
> > This usage is recursive even if he doesn't like the terminology.
> > The same CPU has to be able to reacquire the read lock without deadlocking.
> > If reader/writer locks were implemented in a pure writer gets priority
> > method, then this code would break!  So yes read locks can be used recursively
> > now in Linux, but if the were implemented differently then this code
> > would break.  For example, the -rt kernel turns all read/write locks into
> > mutexs, so the -rt kernel developers will have to address this.
> 
> A recursive lock has the property:
> 
> lock()
> {
>   if (lock->owner == current) {
>     lock->depth++;
>     return;
>   }
> 
>   /* regular lock stuff */
> }
> 
> unlock()
> {
>   if (!--lock->depth)
>     /* regular unlock */
> }

Only on Linux, and only because you look at locking from
the point of view of the magic "current" process variable.

> non of the linux kernel locking primitives have this -- with the
> possible exception of the cpu-hotplug lock.
> 
> What rwlock_t has, is reader bias to the point where you can utterly
> starve writers, with the side effect that you can obtain multiple read
> ownerships without causing a deadlock.

But what happens when this side effect disappears?
 
> This is not what is called a recursive lock. A recursive lock would have
> each owner only once, this rwlock_t thing is simply so unfair that it
> can have unlimited owners, including multiple copies of the same one.
> 
> rwsem has fifo fairness, and therefore can deadlock in this scenario,
> suppose thread A does a read, thread B tries a write and blocks, then
> thread A recurses and tries to obtain another read ownership --
> deadlock, as the FIFO fairness will demand the second read ownership
> will wait on the pending writer, which will wait on the outstanding read
> owner.
> 
> Now if rwsem were a fifo-fair recursive lock, the above would not
> deadlock, since it would detect that the task already had (read)
> ownership and simply increment the depth, instead of trying to acquire a
> second ownership.
> 
> This is all common and well understood terminology, not something Linus
> invented just to harras you with.

In Documentation/? Online? Where is the definition? The only reference
I see is indirectly in DocBook/kernel-locking.tmpl.

> Generally speaking we do not condone recursive locking strategies -- and
> afaik reiserfs (as per the BKL) and the network code (as per abusing
> rwlock_t unfairness) are the only offenders.
> 
> Like Linus stated, recursive locking is generally poor taste and
> indicates you basically gave up on trying to find a proper locking
> scheme. We should very much work towards getting rid of these
> abberations instead of adding new ones.

The people complaining about naming never seem to be the ones providing
workable suggestions or patches.

> Linus is very much right on what he said, and you calling him delusional
> only high-lights your ignorance on the issue.
> 
> [ PS. -rt implements rwlock_t as a proper recursive lock (either a mutex
>   or a full multi-owner reader-writer lock with PI fairness) so if
>   anybody abuses rwlock_t unfairness in a way that is not strict owner
>   recursive we have a problem. ]

Name it "dog's breath locking" for all I care. I am not bothering
with arguments over names; there is real work to do elsewhere.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 18:30                                                                                                         ` [PATCH] netfilter: use per-CPU r**ursive " Stephen Hemminger
@ 2009-04-27 18:54                                                                                                           ` Ingo Molnar
  2009-04-27 19:06                                                                                                             ` Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: Ingo Molnar @ 2009-04-27 18:54 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Peter Zijlstra, Mathieu Desnoyers, Eric Dumazet, David Miller,
	Jarek Poplawski, Linus Torvalds, Paul Mackerras, paulmck,
	Evgeniy Polyakov, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh


* Stephen Hemminger <shemminger@vyatta.com> wrote:

> > non of the linux kernel locking primitives have this -- with the 
> > possible exception of the cpu-hotplug lock.
> > 
> > What rwlock_t has, is reader bias to the point where you can 
> > utterly starve writers, with the side effect that you can obtain 
> > multiple read ownerships without causing a deadlock.
> 
> But what happens when this side effect disappears?

Then well written code works, badly written code breaks.

> > [...]
> >
> > This is all common and well understood terminology, not 
> > something Linus invented just to harras you with.
> 
> In Documentation/ ?  online ?  Where is the definition? The only 
> reference I se is indirectly in DocBook/kernel-locking.tmpl.

Sure, see:

    http://tinyurl.com/c6fakc

> > Generally speaking we do not condone recursive locking 
> > strategies -- and afaik reiserfs (as per the BKL) and the 
> > network code (as per abusing rwlock_t unfairness) are the only 
> > offenders.
> > 
> > Like Linus stated, recursive locking is generally poor taste and 
> > indicates you basically gave up on trying to find a proper 
> > locking scheme. We should very much work towards getting rid of 
> > these abberations instead of adding new ones.
> 
> The people complaining about naming never seem to be the ones 
> providing workable suggestions or patches.

The thing is, while you now have named your locking primitive 
correctly, you are still abusing it by using it recursively.

So it's not 'just about naming'. You should not use read-locks as 
recursive locks. It's poor code.

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 18:54                                                                                                           ` Ingo Molnar
@ 2009-04-27 19:06                                                                                                             ` Stephen Hemminger
  2009-04-27 19:46                                                                                                               ` Linus Torvalds
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-27 19:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Mathieu Desnoyers, Eric Dumazet, David Miller,
	Jarek Poplawski, Linus Torvalds, Paul Mackerras, paulmck,
	Evgeniy Polyakov, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Mon, 27 Apr 2009 20:54:23 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Stephen Hemminger <shemminger@vyatta.com> wrote:
> 
> > > non of the linux kernel locking primitives have this -- with the 
> > > possible exception of the cpu-hotplug lock.
> > > 
> > > What rwlock_t has, is reader bias to the point where you can 
> > > utterly starve writers, with the side effect that you can obtain 
> > > multiple read ownerships without causing a deadlock.
> > 
> > But what happens when this side effect disappears?
> 
> Then well written code works, badly written code breaks.
> 
> > > [...]
> > >
> > > This is all common and well understood terminology, not 
> > > something Linus invented just to harras you with.
> > 
> > In Documentation/ ?  online ?  Where is the definition? The only 
> > reference I se is indirectly in DocBook/kernel-locking.tmpl.
> 
> Sure, see:
> 
>     http://tinyurl.com/c6fakc

All those references support my argument that the lock is being
used recursively in this case. It is being acquired multiple
times by the same CPU. This is not new; it has always been
acquired multiple times, so my change does not break anything.

If other implementations of reader locks do not nest the same
way, then on those systems iptables can deadlock. Nothing was
changed by this.

> > > Generally speaking we do not condone recursive locking 
> > > strategies -- and afaik reiserfs (as per the BKL) and the 
> > > network code (as per abusing rwlock_t unfairness) are the only 
> > > offenders.
> > > 
> > > Like Linus stated, recursive locking is generally poor taste and 
> > > indicates you basically gave up on trying to find a proper 
> > > locking scheme. We should very much work towards getting rid of 
> > > these abberations instead of adding new ones.
> > 
> > The people complaining about naming never seem to be the ones 
> > providing workable suggestions or patches.
> 
> The thing is, while you now have named your locking primitive 
> correctly, you are still abusing it by using it recursively.
> 
> So it's not 'just about naming'. You should not use read-locks as 
> recursive locks. It's poor code.

If you don't like the proposal, please think of a better alternative,
not just pseudo code that is broken, with handwaving arguments.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 19:06                                                                                                             ` Stephen Hemminger
@ 2009-04-27 19:46                                                                                                               ` Linus Torvalds
  2009-04-27 19:48                                                                                                                 ` Linus Torvalds
                                                                                                                                   ` (2 more replies)
  0 siblings, 3 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 19:46 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Eric Dumazet,
	David Miller, Jarek Poplawski, Paul Mackerras, paulmck,
	Evgeniy Polyakov, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Mon, 27 Apr 2009, Stephen Hemminger wrote:
>
> All those references support my argument that the lock is being
> used recursively in this case.

What's so hard about understanding the difference between "used 
recursively" and "recursive lock"?

THEY MEAN TWO TOTALLY DIFFERENT THINGS!

The fact that you don't seem to understand that is one of the things I've 
been complaining about all along.

So here's a final set of clue-bat:

Clue bat #1:

 - "can be used in a recursive context" is not the same as "is recursive".

   Analogy time: Look at "strtok_r()", and the difference between that and 
   "strtok()". "strtok_r()" can be used in a threaded environment. Does 
   that mean that 'strtok_r()' is threaded? No. When you call 
   'strtok_r()', it's still signle-threaded - it's just that it _can_ be 
   called in a threaded environment and then still has sane semantics.

   Now, replace "threaded" with "recursive". Do you get it?

Clue bat #2:

 - a lock that can count does not mean that it is recursive. 

   Counting and recursion are TWO TOTALLY DIFFERENT ISSUES. The 
   "countingness" means that there can be multiple users inside of it, and 
   that, in turn, implies that maybe it can be used in a recursive 
   context. But again, counting and recursion are not about the same thing 
   at all.

   Put another way: a read-write lock is not a "recursive lock" despite 
   the fact that you can recursively take the lock for reading. It just 
   happens to count readers, and allow more than one (from _any_ context, 
   not just from the "same context").

Clue bat #3:

 - A recursive lock is very much _about_ recursion. It is explicitly about 
   the fact that the _same_ thread re-enters the lock, not about another 
   thread being in the locked region at the same time.

   See the difference? Big, big difference. A recursive lock will lock out 
   other thread contexts, even if it allows the current one to recurse. 
   Notice how the _only_ thing a recursive lock allows is that recursive 
   behavior, and nothing else.

   IOW, a "recursive lock" is explicitly designed for recursion. But that 
   doesn't mean that recursive algorithms cannot use functions that 
   aren't.

   Can you use "memcpy()" in a recursive algorithm? Yes. Does that mean 
   that "memcpy()" is suddenly a "recursive memory copy"? No.

   See the difference?

Clue bat #3:

 - if you do not understand the difference between these two things, don't 
   then try to claim that somebody _else_ who does understand it is 
   "deluding himself".

   Analogy time: Ethernet and a modem line can both get you on the
   internet. Now, let's say that Mr Peter Paste-Eater has heard of 
   ethernet, and knows you can get on the internet with an ethernet 
   connection, but he happens to use a modem line to do it.

   Now, Peter Paste-Eater talks to you, and tells you he is connecting to 
   the internet with ethernet, and proudly shows you his serial line and 
   modem, and tells you how he uses ethernet to get onto the internet. You 
   correct him, and tell him it's not ethernet.  He argues for several 
   days about how he gets on the internet, and that it must thus be 
   ethernet, and that you're obviously just "deluding yourself".

Now, can you see why people react badly to you talking about "recursive 
locks"? You're acting like Peter Paste-Eater calling his serial line 
ethernet. The fact that two _different_ things can be used for the same 
end result DOES NOT MAKE THEM THE SAME THING.

In other words:

 - "Recursive locks" are different from reader-writer locks.

 - The ability to count is different from recursion, although in a lock it 
   can obviously be _related_ to whether it can be used in a recursive 
   environment or not. If you don't have a counter, you probably cannot 
   recurse, but it's also not true that a counter implies that you always 
   can.

   A traditional counting lock is the old-fashioned 'semaphore' we have, 
   where the count allows for more than just simple mutual exclusion, and 
   is used when you might want to allow concurrency, but need to limit it
   to some number 'n' (although, almost always, n==1)

 - What the netfilter code wants is simply not a recursive lock. It wants 
   a form of locking that allows recursive _use_, but as mentioned, that 
   is totally and utterly irrelevant from what we call it.

   You _could_ use a recursive lock for it. BUT NONE OF THE PATCHES THAT 
   HAVE EVER BEEN POSTED HAVE BEEN RECURSIVE LOCKS! Nada. None. Zero. 
   Zilch.

So don't talk about recursive locks.

Get it? Finally? Or are you going to continue to be that Paste-Eater guy 
who refuses to understand that he is talking about something else than 
ethernet?

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 19:46                                                                                                               ` Linus Torvalds
@ 2009-04-27 19:48                                                                                                                 ` Linus Torvalds
  2009-04-27 20:36                                                                                                                 ` Evgeniy Polyakov
  2009-04-28  7:42                                                                                                                 ` Jan Engelhardt
  2 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 19:48 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Eric Dumazet,
	David Miller, Jarek Poplawski, Paul Mackerras, paulmck,
	Evgeniy Polyakov, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Mon, 27 Apr 2009, Linus Torvalds wrote:
> 
> Clue bat #1:
> 
> Clue bat #2:
> 
> Clue bat #3:
> 
> Clue bat #3:

Note to self: learn to count beyond three one of these days.

		Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 19:46                                                                                                               ` Linus Torvalds
  2009-04-27 19:48                                                                                                                 ` Linus Torvalds
@ 2009-04-27 20:36                                                                                                                 ` Evgeniy Polyakov
  2009-04-27 20:58                                                                                                                   ` Linus Torvalds
  2009-04-28  7:42                                                                                                                 ` Jan Engelhardt
  2 siblings, 1 reply; 254+ messages in thread
From: Evgeniy Polyakov @ 2009-04-27 20:36 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Ingo Molnar, Peter Zijlstra,
	Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Paul Mackerras, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh

Hi.

On Mon, Apr 27, 2009 at 12:46:48PM -0700, Linus Torvalds (torvalds@linux-foundation.org) wrote:
> > All those references support my argument that the lock is being
> > used recursively in this case.
> 
> What's so hard between understanding the difference between "used 
> recursively" and "recursive lock"?
> 
> THEY MEAN TWO TOTALLY DIFFERENT THINGS!
> 
> The fact that you don't seem to understand that is one of the things I've 
> been complaining about all along.

Just to be sure readers will not lose the discussion topic: do you object
to the naming or the realization? If it's about the former, does the 'dog's
breath lock' proposed by Stephen fit?

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 20:36                                                                                                                 ` Evgeniy Polyakov
@ 2009-04-27 20:58                                                                                                                   ` Linus Torvalds
  2009-04-27 21:40                                                                                                                       ` Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 20:58 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Stephen Hemminger, Ingo Molnar, Peter Zijlstra,
	Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Paul Mackerras, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh



On Tue, 28 Apr 2009, Evgeniy Polyakov wrote:
> 
> Just ot be sure readers will not lose the discssion topic: do you object
> against naming or realizaion?

I said _long_ ago that I thought the patch was fine. 

What I object to is people mis-representing the lock, and apparently 
having a really hard time admitting that having a good grounding in 
networking doesn't necessarily mean that you know everything about 
something as basic as locking.

> If its about the former, does 'dog's breath lock' proposed by Stephen 
> fit?

Is that just an attempt to avoid admitting that they were wrong about lock 
naming? And then trying to trivialize it by calling things by a 
_different_ wrong name, but silly enough that they hope people won't call 
them on it?

Why not just use the correct name? I think it was Mathieu that just 
suggested:

	[PATCH] netfilter: use bh disabling with per-cpu read-write lock

or just call it "netfilter: use per-CPU read-write lock".

Why are people so against calling things by their correct names? Why do 
certain network people seem to force a stupid and incorrect description, 
when they have been told (a) that it's wrong and (b) why it's wrong 
several times?

What's so hard with just doing the TechnicallyRightThing(tm) and 
describing it as such?

And btw - don't get me wrong. There are _other_ ways to do that locking 
too. You don't have to use a rwlock. You can do it with explicit counting, 
the way Eric's original patch did. But it would be wrong to call that one 
"recursive lock" too.

Or you _could_ use a recursive lock, but nobody has actually posted such 
patches. It would work. No question about it. And if it actually _were_ a 
recursive lock, I wouldn't have objected about the description saying it 
is (although I would probably have objected to it being unnecessarily 
complex, when a simpler rwlock or the explicit count thing would be 
sufficient).

But since the current simple patch is using a rwlock, why not just say 
that? Why call it something incorrect ("recursive lock") or nonsensical 
("dog's breath lock").

As I tried to explain with an analogy, networking people would (quite 
correctly) object to me calling a serial line an "ethernet cable". Why is 
it so hard for netfilter people to then see why it's wrong to call a 
rwlock a "recursive lock".

I mean, guys, if you don't want to read up on decades of locking work, 
just google for it. Google for "recursive lock" (_with_ the quotes). At 
least for me, the very first hit gives a reasonable explanation for it, 
and it says:

  "POSIX allows mutexes to be recursive. That means the same thread can 
   lock the same mutex twice and won't deadlock."

and realize that the "same thread" part is very much a keyword, not just 
a random implementation detail (the first answer to the question is 
better than the question, but even the question at the top really does 
get at the basics).
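
For what it's worth, that POSIX behavior looks like this in a minimal
pthreads sketch (purely illustrative; compile with -pthread):

#include <pthread.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
	pthread_mutex_init(&m, &attr);

	pthread_mutex_lock(&m);
	pthread_mutex_lock(&m);		/* same thread: no deadlock, depth is now 2 */
	/* any *other* thread calling pthread_mutex_lock(&m) here would block */
	pthread_mutex_unlock(&m);
	pthread_mutex_unlock(&m);	/* only now is the mutex actually released */

	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return 0;
}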

And please do realize that neither rwlocks nor the counting locks from 
Dumazet's original patch do that. Never did. They simply aren't recursive 
locks.

So just don't call them that. But is "dog's breath" any better? Yes, maybe 
it's less _misleading_, but it sure as hell isn't any more descriptive.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 20:58                                                                                                                   ` Linus Torvalds
@ 2009-04-27 21:40                                                                                                                       ` Stephen Hemminger
  0 siblings, 0 replies; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-27 21:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Eric Dumazet, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Mon, 27 Apr 2009 13:58:48 -0700 (PDT)
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> 
> 
> On Tue, 28 Apr 2009, Evgeniy Polyakov wrote:
> > 
> > Just ot be sure readers will not lose the discssion topic: do you object
> > against naming or realizaion?
> 
> I said _long_ ago that I thought the patch was fine. 
> 
> What I object to is people mis-representing the lock, and apparently 
> having a really hard time admitting that having a good grounding in 
> networking doesn't necessarily mean that you know everything about 
> something as basic as locking.
> 
> > If its about the former, does 'dog's breath lock' proposed by Stephen 
> > fit?
> 
> Is that just an attempt to avoid admitting that they were wrong about lock 
> naming? And then trying to trivialize it by calling things by a 
> _different_ wrong name, but silly enough that they hope people won't call 
> them on it?

The part that concerns me is that the reader lock used in a nested manner on
the same CPU may still be broken on -rt. Other than that it is just language
lawyering; violent agreement that the lock gets used multiple times by the
same CPU. I never had occasion to address this before
(and avoided such usage), but this legacy code exists and needs to
be dealt with.


> Why not just use the correct name? I think it was Mathieu that just 
> suggested:
> 
> 	[PATCH] netfilter: use bh disabling with per-cpu read-write lock
> 
> or just call it "netfilter: use per-CPU read-write lock".

[PATCH] netfilter: Ceci n'est pas une serrure récurrente ("this is not a recursive lock")

I don't care. I don't care. Don't you get the point yet?



> 
> Why are people so against calling things by their correct names? Why do 
> certain network people seem to force a stupid and incorrect description, 
> when they have been told (a) that it's wrong and (b) why it's wrong 
> several times?

Because meaning comes from context, and my meaning comes from a different
context. So we disagree on the correct names. 

> What's so hard with just doing the TechnicallyRightThing(tm) and 
> describing it as such?
> 
> And btw - don't get me wrong. There are _other_ ways to do that locking 
> too. You don't have to use a rwlock. You can do it with explicit counting, 
> the way Eric's original patch did. But it would be wrong to call that one 
> "recursive lock" too.
> 
> Or you _could_ use a recursive lock, but nobody has actually posted such 
> patches. It would work. No question about it. And if it actually _were_ a 
> recursive lock, I wouldn't have objected about the description saying it 
> is (although I would probably have objected to it being unnecessarily 
> complex, when a simpler rwlock or the explicit count thing would be 
> sufficient).
> 
> But since the current simple patch is using a rwlock, why not just say 
> that? Why call it something incorrect ("recursive lock") or nonsensical 
> ("dog's breath lock").
> 
> As I tried to explain with an analogy, networking people would (quite 
> correctly) object to me calling a serial line an "ethernet cable". Why is 
> it so hard for netfilter people to then see why it's wrong to call a 
> rwlock a "recursive lock".
> 
> I mean, guys, if you don't want to read up on decades of locking work, 
> just google for it. Google for "recursive lock" (_with_ the quotes). At 
> least for me, the very first hit gives a reasonable explanation for it, 
> and it says:
> 
>   "POSIX allows mutexes to be recursive. That means the same thread can 
>    lock the same mutex twice and won't deadlock."
> 
> and realize that the "same thread" part is very much a keyword, not just 
> a random implementation detail (the first answer to the question is 
> better than the question, but even the question at the top really does 
> get at the basics).
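
(Aside, not from the thread: a minimal user-space sketch of the POSIX behaviour
quoted above, using the standard pthread API; the function names here are only
illustrative.)

#include <pthread.h>

static pthread_mutex_t m;

static void example_init(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
	pthread_mutex_init(&m, &attr);
	pthread_mutexattr_destroy(&attr);
}

static void example_nested(void)
{
	pthread_mutex_lock(&m);
	pthread_mutex_lock(&m);		/* same thread: no deadlock */
	pthread_mutex_unlock(&m);	/* must unlock once per lock */
	pthread_mutex_unlock(&m);
}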
> 
> And please do realize that neither rwlocks nor the counting locks from 
> Dumazet's original patch do that. Never did. They simply aren't recursive 
> locks.
> 
> So just don't call them that. But is "dog's breath" any better? Yes, maybe 
> it's less _misleading_, but it sure as hell isn't any more descriptive.
> 
> 			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 21:40                                                                                                                       ` Stephen Hemminger
@ 2009-04-27 22:24                                                                                                                       ` Linus Torvalds
  2009-04-27 23:01                                                                                                                         ` Linus Torvalds
  -1 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 22:24 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Eric Dumazet, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Mon, 27 Apr 2009, Stephen Hemminger wrote:
>
> The part that concerns me is that a reader lock used in a nested manner on
> the same cpu may still be broken on -rt.

I think that's a valid concern, and I don't actually object to not using a 
rwlock, but using the "explicit counting + spinlock" that we had at one 
point. It _might_ even be faster, since a spinlock can be faster than a 
rwlock, and in the (rare) case where you recurse you can then avoid the 
atomic entirely.

But EVEN IF YOU DO THAT, it's still wrong to call it a "recursive lock". 
Because it still wouldn't be one.

That's kind of my point, and always has been. It was why I objected to the 
original patch description by Dumazet. It wasn't a recursive lock back 
then _either_. For all the reasons I tried to explain to you, and you seem 
to not care about.

Btw, if you do use the "explicit count" case, then please please please 
make sure it's documented and bug-free. And dammit, correct documentation 
in this case very much talks about how it is _not_ a "recursive lock", but 
a spinlock that then has an explicit counter that avoids taking the lock 
entirely in one very specific path (that happens to be recursive).

The thing is, using a rwlock kind of implicitly documents all of that, 
because you can tell somebody who doesn't even read the code that it's a 
"per-cpu rwlock", and they then know what the semantics are (given that 
they know the kernel semantics for locking in the first place).

But once you start doing your own locking rules, you really have to 
explain why it works, and what it does. And you do have to be very 
careful, because it's so easy to get locking wrong.

> I don't care. I don't care. Don't you get the point yet.

If you don't care about the naming, then use the right one. 

And if you don't care about the naming, why do you then say I'm deluding 
myself, when I'm _correct_, and I _do_ happen to care about correct 
naming.

Locking really is important.  Code that gets locking wrong breaks in
really subtle and nasty ways.  And it sadly tends to "work" in testing,
since the breakage cases require just the right timing.  So locking
should be robust and as "obviously correct" as possible.

And naming really is important.  Misnaming things makes people make
assumptions that aren't true.  If you talk about recursive locks, it
should be reasonable that people who know how recursive locks work would
then make assumptions about them.  If the code then doesn't actually
match those rules, that's bad. 

It's like having a comment in front of a piece of code that says something 
totally different than what the code actually does. It's _bad_. That's why 
naming matters so much - because naming is commentary.  If you mis-name 
things on purpose, it's simply a bug.

Do _you_ get the point?

You _do_ care about bugs, don't you?

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 22:24                                                                                                                       ` Linus Torvalds
@ 2009-04-27 23:01                                                                                                                         ` Linus Torvalds
  2009-04-27 23:03                                                                                                                           ` Linus Torvalds
  2009-04-27 23:32                                                                                                                           ` Linus Torvalds
  0 siblings, 2 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 23:01 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Eric Dumazet, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh


On Mon, 27 Apr 2009, Linus Torvalds wrote:
> 
> Btw, if you do use the "explicit count" case, then please please please 
> make sure it's documented and bug-free. And dammit, correct documentation 
> in this case very much talks about how it is _not_ a "recursive lock", but 
> a spinlock that then has an explicit counter that avoids taking the lock 
> entirely in one very specific path (that happens to be recursive).

So, just as an example, I would not object to the counter approach, as 
long as it really talks about the issues, and why it works (and as long 
as that doesn't call the locks "recursive locks").

So if you are just nervous about rwlocks, then something like this might
work (needs testing, and other people to sanity-check it).

I left the commentary about "readers" and "writers", because in many
ways it's correct, and what the code actually does is very much to
emulate a reader-writer lock.  I put quotes around the uses in the
comments to high-light that it largely _acts_ as a reader-writer lock. 

Now, it would actually be a really _bad_ reader-writer lock in general, 
exactly because it's not very "atomic" (ie this would be a truly sucky and 
broken lock if we didn't have the strict per-cpu usage rules).

So it just so happens that the per-cpu'ness of the lock, together with the 
very limited way in which it is used, make that sucky implementation 
possible - and indeed possibly faster than the standard (and generic) 
kernel rwlock implementation.

So it's basically a special-case rwlock that happens to work due to the
specific rules. 

And exactly because it really wouldn't work in the absence of those
rules, those rules really do need to have big comments on them so that 
people don't then later forget about the limitations.

BTW: THIS IS TOTALLY UNTESTED.  I just cut-and-pasted the existing
rwlock version from one of the later patches, used the counting approach
from one of the earlier ones, and basically just added what I think
would be minimal required comments for this special case.  All of it was
written inside the mail reader, none of it has been compiled or tested
in any other way. 

Because it's exactly the "lock semantics awareness" that I think is so
important here (and that's why all the naming and commentary is so
critical). 

Btw, the "counter" could probably be just a single bit, since apparently
the nesting level is always supposed to be just one.  I made it
"unsigned char" just because sometimes spinlocks can be small, and it
seemed the simplest type.  But perhaps it would be better as an
"unsigned int", since some architectures potentially could do that
faster (eg old alphas).

			Linus

---
/*
 * Per-CPU spinlock associated with per-cpu table entries, and
 * with a counter for the "reading" side that allows a recursive
 * reader to avoid taking the lock and deadlocking.
 *
 * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
 * It needs to ensure that the rules are not being changed while the packet
 * is being processed. In some cases, the read lock will be acquired
 * twice on the same CPU; this is okay because of the count.
 *
 * The write lock is used in two cases:
 *    1. reading counter values
 *       all rule processing need to be stopped and the per-CPU values are summed.
 *
 *    2. replacing tables
 *       any readers that are using the old tables have to complete
 *       before freeing the old table. This is handled by reading
 *       as a side effect of reading counters
 */
struct xt_info_lock {
	spinlock_t lock;
	unsigned char readers;
};
DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);

/*
 * Note: we need to ensure that preemption is disabled before acquiring
 * the per-cpu-variable, so we do it as a two step process rather than
 * using "spin_lock_bh()". 
 *
 * We _also_ need to disable bottom half processing before updating our
 * nesting count, to make sure that the only kind of re-entrancy is this
 * code being called by itself: since the count+lock is not an atomic
 * operation, we can allow no races. 
 *
 * _Only_ that special combination of being per-cpu and never getting
 * re-entered asynchronously means that the count is safe. 
 */
static inline void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	local_bh_disable();
	lock = &__get_cpu_var(xt_info_locks);
	if (!lock->readers++)
		spin_lock(&lock->lock);
}

static inline void xt_info_rdunlock_bh(void)
{
	struct xt_info_lock *lock;

	lock = &__get_cpu_var(xt_info_locks);
	if (!--lock->readers)
		spin_unlock(&lock->lock);
}

/*
 * The "writer" side needs to get exclusive access to the lock,
 * regardless of readers.  This must be called with bottom half
 * processing (and thus also preemption) disabled. 
 */
static inline void xt_info_wrlock(unsigned int cpu)
{
	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
}
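
A sketch of how the two sides would be used, modelled on the x_tables
conversion that appears later in this thread (the example_* function names
are made up, and details such as the current-cpu handling are simplified):

/* "reader" side: per-cpu rule processing, cf. ipt_do_table() below */
static unsigned int example_do_table(struct xt_table *table)
{
	const struct xt_table_info *private;
	const void *table_base;

	xt_info_rdlock_bh();
	private = table->private;
	table_base = private->entries[smp_processor_id()];
	/* ... walk the rules at table_base, ADD_COUNTER() as we go ... */
	xt_info_rdunlock_bh();

	return NF_ACCEPT;
}

/* "writer" side: summing counters takes each cpu's lock in turn */
static void example_get_counters(const struct xt_table_info *t,
				 struct xt_counters counters[])
{
	unsigned int cpu;

	local_bh_disable();
	for_each_possible_cpu(cpu) {
		xt_info_wrlock(cpu);
		/* ... fold t->entries[cpu] into counters[] ... */
		xt_info_wrunlock(cpu);
	}
	local_bh_enable();
}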

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 23:01                                                                                                                         ` Linus Torvalds
@ 2009-04-27 23:03                                                                                                                           ` Linus Torvalds
  2009-04-28  6:58                                                                                                                               ` Eric Dumazet
  2009-04-27 23:32                                                                                                                           ` Linus Torvalds
  1 sibling, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 23:03 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Eric Dumazet, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Mon, 27 Apr 2009, Linus Torvalds wrote:
> 
> BTW: THIS IS TOTALLY UNTESTED.

Gaah. I should have read through it one more time before sending.

> static inline void xt_info_rdunlock_bh(void)
> {
> 	struct xt_info_lock *lock;
> 
> 	lock = &__get_cpu_var(xt_info_locks);
> 	if (!--lock->readers)
> 		spin_unlock(&lock->lock);
> }

This one was missing the "local_bh_enable()" at the end.

There may be other bugs, but that's the one I noticed immediately when 
reading what I sent out. Oops.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 23:01                                                                                                                         ` Linus Torvalds
  2009-04-27 23:03                                                                                                                           ` Linus Torvalds
@ 2009-04-27 23:32                                                                                                                           ` Linus Torvalds
  2009-04-28  7:41                                                                                                                             ` Peter Zijlstra
  1 sibling, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-27 23:32 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Eric Dumazet, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Mon, 27 Apr 2009, Linus Torvalds wrote:
> 
> I left the commentary about "readers" and "writers", because in many
> ways it's correct, and what the code actually does is very much to
> emulate a reader-writer lock.  I put quotes around the uses in the
> comments to high-light that it largely _acts_ as a reader-writer lock. 

Btw, I think it was Paul who pointed out that technically it's probably 
better to call them "local" and "global" lockers instead of "readers" and 
"writers".

That also probably clarifies the rules on when you use one over the other 
(ie reading off all the statistics is a "global" operation, as is 
obviously replacing the tables).

Of course, "readers" and "writers" is something most Linux lock people are 
more used to. Or "brlock" for the old-timers, but that involves a heavy 
dose of bad taste. The new use is much nicer, especially since it never 
takes the global lock on _all_ cpu's (which was really a killer in so 
many ways).

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 23:03                                                                                                                           ` Linus Torvalds
@ 2009-04-28  6:58                                                                                                                               ` Eric Dumazet
  0 siblings, 0 replies; 254+ messages in thread
From: Eric Dumazet @ 2009-04-28  6:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra,
	Mathieu Desnoyers, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

Linus Torvalds wrote:
> 
> On Mon, 27 Apr 2009, Linus Torvalds wrote:
>> BTW: THIS IS TOTALLY UNTESTED.
> 
> Gaah. I should have read through it one more time before sending.
> 
>> static inline void xt_info_rdunlock_bh(void)
>> {
>> 	struct xt_info_lock *lock;
>>
>> 	lock = &__get_cpu_var(xt_info_locks);
>> 	if (!--lock->readers)
>> 		spin_unlock(&lock->lock);
>> }
> 
> This one was missing the "local_bh_enable()" at the end.
> 
> There may be other bugs, but that's the one I noticed immediately when 
> reading what I sent out. Oops.

I am not sure my day job will permit me to polish a patch mixing all
the bits and comments. But I am glad we eventually got back to spinlocks,
which are probably better than rwlocks for implementing this stuff.

Instead of submitting a full patch again, we could first submit a new
include file containing all the comments and inline functions?

This include file could be local to netfilter, with a big stick on
it to forbid its use in other areas (no changes in Documentation/).

Then, as soon as we can go back to a pure RCU solution, we can safely
delete this controversial-locking-nesting-per-cpu thing?


As for the local/global naming that Paul suggested: that was about
'global' locking of all locks at the same time, so it is not the right
name any more IMHO.

Maybe something like local/remote or owner/foreigner?

xt_info_owner_lock_bh(), xt_info_owner_unlock_bh()
xt_info_foreigner_lock(), xt_info_foreigner_unlock()

One comment about this comment you wrote:

/*
 * The "writer" side needs to get exclusive access to the lock,
 * regardless of readers.  This must be called with bottom half
 * processing (and thus also preemption) disabled. 
 */
static inline void xt_info_wrlock(unsigned int cpu)
{
	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
}

It's true that BH should be disabled if the caller runs
on the cpu it wants to lock.
For the other ones (true foreigners), there is
no requirement about BH (the current cpu could be interrupted
by a softirq and packets could fly).

We could use the following construct and not require disabling BH
for more than a short period of time
(but with preemption disabled for the whole duration):

preempt_disable(); // could be cpu_migration_disable();

int curcpu = smp_processor_id();
/*
 * Gather stats for current cpu : must disable BH
 * before trying to lock.
 */
local_bh_disable();
xt_info_wrlock(curcpu);
// copy stats of this cpu on my private data (not shown here)
xt_info_wrunlock(curcpu);
local_bh_enable();

for_each_possible_cpu(cpu) {
	if (cpu == curcpu)
		continue;
	xt_info_wrlock(cpu);
	// fold stats of "cpu" on my private data (not shown here)
	xt_info_wrunlock(cpu);
}
preempt_enable(); // could be cpu_migration_enable();


So your initial comment could be changed to:

/*
 * The "writer" side needs to get exclusive access to the lock,
 * regardless of readers. If the caller is about to lock its own
 * cpu's lock, it must have disabled BH beforehand. For other cpus,
 * no special care is needed beyond disabling preemption to
 * guarantee no cpu migration.
 */

Back to work now :)


^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 23:32                                                                                                                           ` Linus Torvalds
@ 2009-04-28  7:41                                                                                                                             ` Peter Zijlstra
  2009-04-28 14:22                                                                                                                               ` Paul E. McKenney
  0 siblings, 1 reply; 254+ messages in thread
From: Peter Zijlstra @ 2009-04-28  7:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Evgeniy Polyakov, Ingo Molnar,
	Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Paul Mackerras, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh

On Mon, 2009-04-27 at 16:32 -0700, Linus Torvalds wrote:
> 
> On Mon, 27 Apr 2009, Linus Torvalds wrote:
> > 
> > I left the commentary about "readers" and "writers", because in many
> > ways it's correct, and what the code actually does is very much to
> > emulate a reader-writer lock.  I put quotes around the uses in the
> > comments to high-light that it largely _acts_ as a reader-writer lock. 
> 
> Btw, I think it was Paul who pointed out that technically it's probably 
> better to call them "local" and "global" lockers instead of "readers" and 
> "writers".

exclusive vs non-exclusive is what the literature would call them in
most cases I think.

> That also probably clarifies the rules on when you use one over the other 
> (ie reading off all the statistics is a "global" operation, as is 
> obviously replacing the tables).
> 
> Of course, "readers" and "writers" is something most Linux lock people are 
> more used to. Or "brlock" for the old-timers, but that involves a heavy 
> dose of bad taste. The new use is much nicer, especially since it never 
> takes the global lock on _all_ cpu's (which was really a killer in so 
> many ways).
> 
> 			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-27 19:46                                                                                                               ` Linus Torvalds
  2009-04-27 19:48                                                                                                                 ` Linus Torvalds
  2009-04-27 20:36                                                                                                                 ` Evgeniy Polyakov
@ 2009-04-28  7:42                                                                                                                 ` Jan Engelhardt
  2 siblings, 0 replies; 254+ messages in thread
From: Jan Engelhardt @ 2009-04-28  7:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Stephen Hemminger, Ingo Molnar, Peter Zijlstra,
	Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Paul Mackerras, paulmck, Evgeniy Polyakov, kaber,
	jeff.chua.linux, laijs, r000n, linux-kernel, netfilter-devel,
	netdev, benh


On Monday 2009-04-27 21:46, Linus Torvalds wrote:

>Clue bat #3 [sic #4]:
>
> - if you do not understand the difference between these two things, don't 
>   then try to claim that somebody _else_ who does understand it is 
>   "deluding himself".
>
>   Analogy time: Ethernet and a modem line can both get you on the
>   internet. Now, let's say that Mr Peter Paste-Eater has heard of 
>   ethernet, and knows you can get on the internet with an ethernet 
>   connection, but he happens to use a modem line to do it.
>
>   Now, Peter Paste-Eater talks to you, and tells you he is connecting to 
>   the internet with ethernet, and proudly shows you his serial line and 
>   modem, and tells you how he uses ethernet to get onto the internet. You 
>   correct him, and tell him it's not ethernet.  He argues for several 
>   days about how he gets on the internet, and that it must thus be 
>   ethernet, and that you're obviously just "deluding yourself".
>
>Now, can you see why people react badly to you talking about "recursive 
>locks"? You're acting like Peter Paste-Eater calling his serial line 
>ethernet.

It could be worse. He could be running Ethernet over serial, e.g. L2TP.
Or his serial line is a TP cable with RJ45 plugs - consumers like
to call that Ethernet (cable) too.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28  6:58                                                                                                                               ` Eric Dumazet
@ 2009-04-28 11:53                                                                                                                               ` David Miller
  2009-04-28 12:40                                                                                                                                 ` Ingo Molnar
  -1 siblings, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-28 11:53 UTC (permalink / raw)
  To: dada1
  Cc: torvalds, shemminger, zbr, mingo, peterz, mathieu.desnoyers,
	jarkao2, paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 28 Apr 2009 08:58:05 +0200

> I am not sure my day job will permit me to polish a patch mixing all
> the bits and comments. But I am glad we eventually got back to spinlocks,
> which are probably better than rwlocks for implementing this stuff.
> 
> Instead of submitting a full patch again, we could first submit a new
> include file containing all the comments and inline functions?
> 
> This include file could be local to netfilter, with a big stick on
> it to forbid its use in other areas (no changes in Documentation/).
> 
> Then, as soon as we can go back to a pure RCU solution, we can safely
> delete this controversial-locking-nesting-per-cpu thing?

I say we merge Linus's locking idea into the XV patch, fixup the
commit message wording, and move on with life.

For something that's going to get deleted as soon as the faster grace
period RCU stuff is available, it has consumed an inordinate amount of
our time :-)

I might take a stab at this before hitting bed tonight, no promises :)

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 11:53                                                                                                                               ` David Miller
@ 2009-04-28 12:40                                                                                                                                 ` Ingo Molnar
  2009-04-28 13:43                                                                                                                                   ` David Miller
  0 siblings, 1 reply; 254+ messages in thread
From: Ingo Molnar @ 2009-04-28 12:40 UTC (permalink / raw)
  To: David Miller
  Cc: dada1, torvalds, shemminger, zbr, peterz, mathieu.desnoyers,
	jarkao2, paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh


* David Miller <davem@davemloft.net> wrote:

> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Tue, 28 Apr 2009 08:58:05 +0200
> 
> > I am not sure my day job will permit me to polish a patch mixing all
> > the bits and comments. But I am glad we eventually got back to spinlocks,
> > which are probably better than rwlocks for implementing this stuff.
> > 
> > Instead of submitting a full patch again, we could first submit a new
> > include file containing all the comments and inline functions?
> > 
> > This include file could be local to netfilter, with a big stick on
> > it to forbid its use in other areas (no changes in Documentation/).
> > 
> > Then, as soon as we can go back to a pure RCU solution, we can 
> > safely delete this controversial-locking-nesting-per-cpu thing?
> 
> I say we merge Linus's locking idea into the XV patch, fixup the 
> commit message wording, and move on with life.
> 
> For something that's going to get deleted as soon as the faster 
> grace period RCU stuff is available, it has consumed an inordinate 
> amount of our time :-)

One more reason to factor out this code into general locking code.

The latest code looks a bit similar to the old big-reader-locks hack 
(which got dropped for good many eons ago, and with which i deny any 
involvement, such as having authored it. [oh, did i say that 
out loud? crap.]), implemented cleanly and properly.

IMHO this locking construct should be considered for 
linux/local_lock.h and kernel/local_lock.c. Even if the netfilter 
code drops its use soon afterwards ;-)

[ The _only_ thing i am worried about is the apparent fact that
  there's so much confusion about recursion versus read-access.
  Recursion might be hard to factor out of the netfilter code, and
  maybe it's not even possible realistically (we fought for years with
  the BKL and are still fighting it), but if its harms are not even
  _realized_, that difficult task turns into an impossible task ;-) ]

	Ingo

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 12:40                                                                                                                                 ` Ingo Molnar
@ 2009-04-28 13:43                                                                                                                                   ` David Miller
  2009-04-28 13:52                                                                                                                                     ` Mathieu Desnoyers
  2009-04-28 15:42                                                                                                                                     ` [PATCH] netfilter: use per-CPU r**ursive lock {XV} Paul E. McKenney
  0 siblings, 2 replies; 254+ messages in thread
From: David Miller @ 2009-04-28 13:43 UTC (permalink / raw)
  To: mingo
  Cc: dada1, torvalds, shemminger, zbr, peterz, mathieu.desnoyers,
	jarkao2, paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh

From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 28 Apr 2009 14:40:33 +0200

> IMHO this locking construct should be considered for 
> linux/local_lock.h and kernel/local_lock.c. Even if the netfilter 
> code drops its use soon afterwards ;-)

If you can show me how to pass a per-cpu variable (the variable,
not a dereference of it) as an argument to an inline function,
I'll implement this :-)

It has to be dereferenced after local_bh_disable() for the
read side acquisition.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 13:43                                                                                                                                   ` David Miller
@ 2009-04-28 13:52                                                                                                                                     ` Mathieu Desnoyers
  2009-04-28 14:37                                                                                                                                       ` David Miller
  2009-04-28 15:42                                                                                                                                     ` [PATCH] netfilter: use per-CPU r**ursive lock {XV} Paul E. McKenney
  1 sibling, 1 reply; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-28 13:52 UTC (permalink / raw)
  To: David Miller
  Cc: mingo, dada1, torvalds, shemminger, zbr, peterz, jarkao2, paulus,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

* David Miller (davem@davemloft.net) wrote:
> From: Ingo Molnar <mingo@elte.hu>
> Date: Tue, 28 Apr 2009 14:40:33 +0200
> 
> > IMHO this locking construct should be considered for 
> > linux/local_lock.h and kernel/local_lock.c. Even if the netfilter 
> > code drops its use soon afterwards ;-)
> 
> If you can show me have to pass a per-cpu variable (the variable,
> not a dereference of it) as an argument to an inline function,
> I'll implement this :-)
> 
> It has to be dereferenced after local_bh_disable() for the
> read side acquisition.

The local_bh_disable() could be outside of the locking construct. This
would make it easier to adapt it to various users (irq disable, bh
disable, preempt disable) depending on the contexts from which they must
be protected.

And if it still does not work for some reason, using a #define is
discouraged, but could work.
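
As a rough illustration only (macro names invented here, mirroring
xt_info_rdlock_bh() from the patch above): a #define can take the per-cpu
variable itself, because its name is substituted textually and is therefore
only dereferenced after local_bh_disable().

/* hypothetical sketch, not a proposed patch */
#define local_rdlock_bh(lockvar)				\
	do {							\
		struct xt_info_lock *__l;			\
								\
		local_bh_disable();				\
		__l = &__get_cpu_var(lockvar);			\
		if (!__l->readers++)				\
			spin_lock(&__l->lock);			\
	} while (0)

#define local_rdunlock_bh(lockvar)				\
	do {							\
		struct xt_info_lock *__l;			\
								\
		__l = &__get_cpu_var(lockvar);			\
		if (!--__l->readers)				\
			spin_unlock(&__l->lock);		\
		local_bh_enable();				\
	} while (0)

A caller would then write local_rdlock_bh(xt_info_locks); passing the
variable by name is exactly what an inline function cannot do.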

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28  7:41                                                                                                                             ` Peter Zijlstra
@ 2009-04-28 14:22                                                                                                                               ` Paul E. McKenney
  0 siblings, 0 replies; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-28 14:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Stephen Hemminger, Evgeniy Polyakov, Ingo Molnar,
	Mathieu Desnoyers, Eric Dumazet, David Miller, Jarek Poplawski,
	Paul Mackerras, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

On Tue, Apr 28, 2009 at 09:41:08AM +0200, Peter Zijlstra wrote:
> On Mon, 2009-04-27 at 16:32 -0700, Linus Torvalds wrote:
> > 
> > On Mon, 27 Apr 2009, Linus Torvalds wrote:
> > > 
> > > I left the commentary about "readers" and "writers", because in many
> > > ways it's correct, and what the code actually does is very much to
> > > emulate a reader-writer lock.  I put quotes around the uses in the
> > > comments to high-light that it largely _acts_ as a reader-writer lock. 
> > 
> > Btw, I think it was Paul who pointed out that technically it's probably 
> > better to call them "local" and "global" lockers instead of "readers" and 
> > "writers".
> 
> exclusive vs non-exclusive is what the literature would call them in
> most cases I think.

I would argue that the non-exclusive category includes both reader-writer
locking and local-global locking.  That said, we have an unusual variant
of local-global in this case, as the global processing acquires only one
of the locks at a time.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 13:52                                                                                                                                     ` Mathieu Desnoyers
@ 2009-04-28 14:37                                                                                                                                       ` David Miller
  2009-04-28 14:49                                                                                                                                         ` Mathieu Desnoyers
  0 siblings, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-28 14:37 UTC (permalink / raw)
  To: mathieu.desnoyers
  Cc: mingo, dada1, torvalds, shemminger, zbr, peterz, jarkao2, paulus,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 28 Apr 2009 09:52:19 -0400

> The local_bh_disable() could be outside of the locking construct. This
> would make it easier to adapt it to various users (irq disable, bh
> disable, preempt disable) depending on the contexts from which they must
> be protected.
> 
> And if it still does not work for some reason, using a #define is
> discouraged, but could work.

That's what I was hoping to avoid, things like macros and having
the callers of this thing expand the two parts of the operation.

What's the point in making this generic if it ends up being ugly
as hell?

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 14:37                                                                                                                                       ` David Miller
@ 2009-04-28 14:49                                                                                                                                         ` Mathieu Desnoyers
  2009-04-28 15:00                                                                                                                                           ` David Miller
  0 siblings, 1 reply; 254+ messages in thread
From: Mathieu Desnoyers @ 2009-04-28 14:49 UTC (permalink / raw)
  To: David Miller
  Cc: mingo, dada1, torvalds, shemminger, zbr, peterz, jarkao2, paulus,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

* David Miller (davem@davemloft.net) wrote:
> From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Date: Tue, 28 Apr 2009 09:52:19 -0400
> 
> > The local_bh_disable() could be outside of the locking construct. This
> > would make it easier to adapt it to various users (irq disable, bh
> > disable, preempt disable) depending on the contexts from which they must
> > be protected.
> > 
> > And if it still does not work for some reason, using a #define is
> > discouraged, but could work.
> 
> That's what I was hoping to avoid, things like macros and having
> the callers of this thing expand the two parts of the operation.
> 
> What's the point in making this generic if it ends up being ugly
> as hell?

.. and what's the point in making it generic if it can be replaced
by a proper RCU implementation? :-) I am not convinced of the added
value we get in making it a generic header this soon. I would wait for
other users to express similar needs; otherwise this could soon become
an orphaned piece of locking code.

Mathieu

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 14:49                                                                                                                                         ` Mathieu Desnoyers
@ 2009-04-28 15:00                                                                                                                                           ` David Miller
  2009-04-28 16:24                                                                                                                                             ` [PATCH] netfilter: revised locking for x_tables Stephen Hemminger
  0 siblings, 1 reply; 254+ messages in thread
From: David Miller @ 2009-04-28 15:00 UTC (permalink / raw)
  To: mathieu.desnoyers
  Cc: mingo, dada1, torvalds, shemminger, zbr, peterz, jarkao2, paulus,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 28 Apr 2009 10:49:20 -0400

> .. and what's the point in making it generic if it can be replaced
> by a proper RCU implementation? :-) I am not convinced of the added
> value we get in making it a generic header this soon. I would wait for
> other users to express similar needs; otherwise this could soon become
> an orphaned piece of locking code.

That is my opinion as well.

Anyway, here is a patch that builds; I haven't started working
on the commit message yet.

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7b1a652..086e976 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,79 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+/*
+ * Per-CPU spinlock associated with per-cpu table entries, and
+ * with a counter for the "reading" side that allows a recursive
+ * reader to avoid taking the lock and deadlocking.
+ *
+ * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while the packet
+ * is being processed. In some cases, the read lock will be acquired
+ * twice on the same CPU; this is okay because of the count.
+ *
+ * The write lock is used in two cases:
+ *    1. reading counter values
+ *       all rule processing need to be stopped and the per-CPU values are summed.
+ *
+ *    2. replacing tables
+ *       any readers that are using the old tables have to complete
+ *       before freeing the old table. This is handled by reading
+ *       as a side effect of reading counters
+ */
+struct xt_info_lock {
+	spinlock_t lock;
+	unsigned char readers;
+};
+DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
+
+/*
+ * Note: we need to ensure that preemption is disabled before acquiring
+ * the per-cpu-variable, so we do it as a two step process rather than
+ * using "spin_lock_bh()". 
+ *
+ * We _also_ need to disable bottom half processing before updating our
+ * nesting count, to make sure that the only kind of re-entrancy is this
+ * code being called by itself: since the count+lock is not an atomic
+ * operation, we can allow no races. 
+ *
+ * _Only_ that special combination of being per-cpu and never getting
+ * re-entered asynchronously means that the count is safe. 
+ */
+static inline void xt_info_rdlock_bh(void)
+{
+	struct xt_info_lock *lock;
+
+	local_bh_disable();
+	lock = &__get_cpu_var(xt_info_locks);
+	if (!lock->readers++)
+		spin_lock(&lock->lock);
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+	struct xt_info_lock *lock;
+
+	lock = &__get_cpu_var(xt_info_locks);
+	if (!--lock->readers)
+		spin_unlock(&lock->lock);
+	local_bh_enable();
+}
+
+/*
+ * The "writer" side needs to get exclusive access to the lock,
+ * regardless of readers.  This must be called with bottom half
+ * processing (and thus also preemption) disabled. 
+ */
+static inline void xt_info_wrlock(unsigned int cpu)
+{
+	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+}
+
+static inline void xt_info_wrunlock(unsigned int cpu)
+{
+	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+}
 
 /*
  * This helper is performance critical and must be inlined
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d..831fe18 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
+		return ERR_PTR(-ENOMEM);
 
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
-
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b6..2ec8d72 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 * with data used by 'current' CPU.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 800ae85..219e165 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t,
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t,
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len)
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	xt_info_wrlock(curcpu);
+	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 509a956..020e97b 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	local_bh_disable();
 	private = table->private;
+
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		local_bh_enable();
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
 
-	synchronize_net();
-	return oldinfo;
+	table->private = newinfo;
+	newinfo->initial_entries = private->initial_entries;
+
+	/*
+	 * Even though table entries have now been swapped, other CPU's
+	 * may still be using the old entries. This is okay, because
+	 * resynchronization happens because of the locking done
+	 * during the get_counters() routine.
+	 */
+	local_bh_enable();
+
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1147,7 +1143,11 @@ static struct pernet_operations xt_net_ops = {
 
 static int __init xt_init(void)
 {
-	int i, rv;
+	unsigned int i;
+	int rv;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(xt_info_locks, i).lock);
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)

^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28  6:58                                                                                                                               ` Eric Dumazet
  (?)
  (?)
@ 2009-04-28 15:09                                                                                                                               ` Linus Torvalds
  -1 siblings, 0 replies; 254+ messages in thread
From: Linus Torvalds @ 2009-04-28 15:09 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Evgeniy Polyakov, Ingo Molnar, Peter Zijlstra,
	Mathieu Desnoyers, David Miller, Jarek Poplawski, Paul Mackerras,
	paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh



On Tue, 28 Apr 2009, Eric Dumazet wrote:
> 
> Instead of submitting a full patch again, we could first submit a new
>  include file containing all comments and inline functions ?

Well, I actually already suggested to David that he should just merge the 
last patch I saw floating around (with the "recursive" -> "readwrite" fix 
in the comment ;), so that we can at least get the basic issue fixed, and 
then we can tweak it a bit with smaller patches flying around.

And at least right now, the difference between the rwlock and the 
"count+spinlock" should be basically almost unnoticeable, and a very small 
implementation issue. They're entirely interchangeable, after all.
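
For illustration, a minimal sketch of the per-cpu rwlock formulation being
referred to (illustrative only, not a patch from this mail; each lock would
be initialized with rwlock_init() for every possible cpu at init time):

#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(rwlock_t, xt_info_rwlocks);

static inline void xt_info_rdlock_bh(void)
{
	local_bh_disable();
	/* classic rwlocks let readers nest, so a softirq re-entering
	 * table processing on the same cpu cannot deadlock here */
	read_lock(&__get_cpu_var(xt_info_rwlocks));
}

static inline void xt_info_rdunlock_bh(void)
{
	read_unlock(&__get_cpu_var(xt_info_rwlocks));
	local_bh_enable();
}

static inline void xt_info_wrlock(unsigned int cpu)
{
	write_lock(&per_cpu(xt_info_rwlocks, cpu));
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
	write_unlock(&per_cpu(xt_info_rwlocks, cpu));
}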

> This include file could be local to netfilter, with a big stick on
> it to forbid its use in other areas (no changes in Documentation/).
>
> Then, as soon as we can go back to pure RCU solution, we can safely
> delete this controversial-locking-nesting-per-cpu-thing ?

I don't have any strong preferences, but I'd almost prefer not to abstract 
things out even that much. It's already pretty well hidden inside 
<netfilter/x_tables.h>; I'd hate to add a new file just for this. 

As to just adding more commenting that it must not be used anywhere else, 
I certainly agree with that.

> Instead of the local/global naming that Paul suggested: that was about
> 'global' locking all locks at the same time, so it is no longer the
> right name IMHO
> 
> Maybe something like local/remote or owner/foreigner ?

local/remote works for me, and yes, since we only take the remote side one 
CPU at a time, I guess "global" is misleading. But "owner/foreigner" 
sounds pretty odd.

> One comment about this comment you wrote :
> 
> /*
>  * The "writer" side needs to get exclusive access to the lock,
>  * regardless of readers.  This must be called with bottom half
>  * processing (and thus also preemption) disabled. 
>  */
> 
> It's true that BH should be disabled if the caller runs
> on the cpu it wants to lock.
> For the other ones (true foreigners), there is
> no requirement on BH (the current cpu could be interrupted
> by a softirq and packets could fly)

Yes. Other CPUs just require preemption protection. 

> We could use following construct and not require disabling BH
> more than a short period of time.
> (But preemption disabled for the whole duration)
> 
> preempt_disable(); // could be cpu_migration_disable();
> 
> int curcpu = smp_processor_id();
> /*
>  * Gather stats for current cpu : must disable BH
>  * before trying to lock.
>  */
> local_bh_disable();
> xt_info_wrlock(curcpu);
> // copy stats of this cpu on my private data (not shown here)
> xt_info_wrunlock(curcpu);
> local_bh_enable();
> 
> for_each_possible_cpu(cpu) {
> 	if (cpu == curcpu)
> 		continue;
> 	xt_info_wrlock(cpu);
> 	// fold stats of "cpu" on my private data (not shown here)
> 	xt_info_wrunlock(cpu);
> }
> preempt_enable(); // could be cpu_migration_enable();

Agreed. 

> So your initial comment could be changed to :
> 
> /*
>  * The "writer" side needs to get exclusive access to the lock,
>  * regardless of readers. If the caller is about to lock its own lock,
>  * it must have disabled BH beforehand. For other cpus, no special
>  * care is needed beyond disabling preemption to guarantee no cpu migration.
>  */

Ack.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 13:43                                                                                                                                   ` David Miller
  2009-04-28 13:52                                                                                                                                     ` Mathieu Desnoyers
@ 2009-04-28 15:42                                                                                                                                     ` Paul E. McKenney
  2009-04-28 17:35                                                                                                                                       ` Christoph Lameter
  1 sibling, 1 reply; 254+ messages in thread
From: Paul E. McKenney @ 2009-04-28 15:42 UTC (permalink / raw)
  To: David Miller
  Cc: mingo, dada1, torvalds, shemminger, zbr, peterz,
	mathieu.desnoyers, jarkao2, paulus, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh

On Tue, Apr 28, 2009 at 06:43:40AM -0700, David Miller wrote:
> From: Ingo Molnar <mingo@elte.hu>
> Date: Tue, 28 Apr 2009 14:40:33 +0200
> 
> > IMHO this locking construct should be considered for 
> > linux/local_lock.h and kernel/local_lock.c. Even if the netfilter 
> > code drops its use soon afterwards ;-)
> 
> If you can show me how to pass a per-cpu variable (the variable,
> not a dereference of it) as an argument to an inline function,
> I'll implement this :-)
> 
> It has to be dereferenced after local_bh_disable() for the
> read side acquisition.

The way I did this in treercu.c was to create an array of references
to the per-CPU data in question.  Not necessarily recommended, but
one way of doing it.  That said, one could argue that we should wait
until we have at least three users before creating a generic primitive.
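
As a rough sketch of that "array of references" idea (hypothetical names,
not the treercu.c code): the per-cpu structures are registered in an
ordinary pointer array once at init, and that array can then be passed to
an inline function and dereferenced only after local_bh_disable():

#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>

struct xt_info_lock {
	spinlock_t lock;
	unsigned char readers;
};

static DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);

/* one plain pointer per possible cpu, filled in once at init time */
static struct xt_info_lock *xt_info_lock_ptrs[NR_CPUS];

static void xt_info_lock_ptrs_init(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		xt_info_lock_ptrs[cpu] = &per_cpu(xt_info_locks, cpu);
}

static inline void xt_info_rdlock_bh_via(struct xt_info_lock **locks)
{
	struct xt_info_lock *lock;

	local_bh_disable();
	lock = locks[smp_processor_id()];	/* dereference only after BH off */
	if (!lock->readers++)
		spin_lock(&lock->lock);
}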

And I just know that I am going to regret this deeply, but I cannot
resist posting the following URL:

http://en.wikipedia.org/wiki/Wikipedia:Avoid_Parkinson's_Bicycle_Shed_Effect

							Thanx, Paul

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: revised locking for x_tables
  2009-04-28 15:00                                                                                                                                           ` David Miller
@ 2009-04-28 16:24                                                                                                                                             ` Stephen Hemminger
  2009-04-28 16:50                                                                                                                                               ` Linus Torvalds
  0 siblings, 1 reply; 254+ messages in thread
From: Stephen Hemminger @ 2009-04-28 16:24 UTC (permalink / raw)
  To: David Miller
  Cc: mathieu.desnoyers, mingo, dada1, torvalds, zbr, peterz, jarkao2,
	paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh, r000n,
	linux-kernel, netfilter-devel, netdev, benh

The x_tables are organized with a table structure and per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table, which was a performance bottleneck. In 2.6.30-rc this
was converted to use RCU for the table and counters/rules, which solved
the performance problem for do_table but made replacing rules much
slower because of the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters to allow
table processing to proceed without the cache thrashing of a global
reader lock, and keeps the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
Probably same as dave/linus but with different comments.

 include/linux/netfilter/x_tables.h |   73 +++++++++++++++++++--
 net/ipv4/netfilter/arp_tables.c    |  125 ++++++++++--------------------------
 net/ipv4/netfilter/ip_tables.c     |  126 ++++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    |  123 ++++++++++--------------------------
 net/netfilter/x_tables.c           |   53 ++++++++-------
 5 files changed, 204 insertions(+), 296 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-04-28 08:01:59.942151297 -0700
+++ b/include/linux/netfilter/x_tables.h	2009-04-28 09:15:09.240990339 -0700
@@ -354,9 +354,6 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	struct mutex lock;
-
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
 
@@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-				    struct xt_table_info *new);
+
+/*
+ * Per-CPU spinlock associated with per-cpu table entries, and
+ * with a counter for the "reading" side that allows a recursive
+ * reader to avoid taking the lock and deadlocking.
+ *
+ * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while the packet
+ * is being processed. In some cases, the read lock will be acquired
+ * twice on the same CPU; this is okay because of the count.
+ *
+ * "writing" is used when reading counters.
+ *  During replace any readers that are using the old tables have to complete
+ *  before freeing the old table. This is handled by the write locking
+ *  necessary for reading the counters.
+ */
+struct xt_info_lock {
+	spinlock_t lock;
+	unsigned char readers;
+};
+DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
+
+/*
+ * Note: we need to ensure that preemption is disabled before acquiring
+ * the per-cpu-variable, so we do it as a two step process rather than
+ * using "spin_lock_bh()".
+ *
+ * We _also_ need to disable bottom half processing before updating our
+ * nesting count, to make sure that the only kind of re-entrancy is this
+ * code being called by itself: since the count+lock is not an atomic
+ * operation, we can allow no races.
+ *
+ * _Only_ that special combination of being per-cpu and never getting
+ * re-entered asynchronously means that the count is safe.
+ */
+static inline void xt_info_rdlock_bh(void)
+{
+	struct xt_info_lock *lock;
+
+	local_bh_disable();
+	lock = &__get_cpu_var(xt_info_locks);
+	if (!lock->readers++)
+		spin_lock(&lock->lock);
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+
+	if (!--lock->readers)
+		spin_unlock(&lock->lock);
+	local_bh_enable();
+}
+
+/*
+ * The "writer" side needs to get exclusive access to the lock,
+ * regardless of readers.  This must be called with bottom half
+ * processing (and thus also preemption) disabled.
+ */
+static inline void xt_info_wrlock(unsigned int cpu)
+{
+	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+}
+
+static inline void xt_info_wrunlock(unsigned int cpu)
+{
+	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+}
 
 /*
  * This helper is performance critical and must be inlined
--- a/net/ipv4/netfilter/arp_tables.c	2009-04-28 08:01:59.925950000 -0700
+++ b/net/ipv4/netfilter/arp_tables.c	2009-04-28 08:02:15.211234592 -0700
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+
 			ADD_COUNTER(e->counters, hdr_len, 1);
 
 			t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
+		return ERR_PTR(-ENOMEM);
 
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, v
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	ARPT_ENTRY_ITERATE(loc_cpu_entry,
 			   private->size,
 			   add_counter_to_entry,
 			   paddc,
 			   &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
-
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/ip_tables.c	2009-04-28 08:01:59.933149333 -0700
+++ b/net/ipv4/netfilter/ip_tables.c	2009-04-28 08:02:15.221802628 -0700
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
 	tgpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info 
 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 * with data used by 'current' CPU.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info 
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[cpu],
-			  t->size,
-			  add_counter_to_entry,
-			  counters,
-			  &i);
 	local_bh_enable();
 }
 
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				  zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
+		return ERR_PTR(-ENOMEM);
 
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
-
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	loc_cpu_entry = private->entries[curcpu];
+	xt_info_wrlock(curcpu);
 	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv6/netfilter/ip6_tables.c	2009-04-28 08:01:59.920112241 -0700
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-04-28 08:02:15.239235774 -0700
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	rcu_read_lock_bh();
-	private = rcu_dereference(table->private);
-	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+	xt_info_rdlock_bh();
+	private = table->private;
+	table_base = private->entries[smp_processor_id()];
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	rcu_read_unlock_bh();
+	xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info 
 	/* Instead of clearing (by a previous call to memset())
 	 * the counters and using adds, we set the counters
 	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
+	 *
+	 * Bottom half has to be disabled to prevent deadlock
+	 * if new softirq were to run and call ipt_do_table
 	 */
-	curcpu = raw_smp_processor_id();
+	local_bh_disable();
+	curcpu = smp_processor_id();
 
 	i = 0;
 	IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info 
 		if (cpu == curcpu)
 			continue;
 		i = 0;
+		xt_info_wrlock(cpu);
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
+		xt_info_wrunlock(cpu);
 	}
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-			 const struct xt_counters counters[])
-{
-	unsigned int i, cpu;
-
-	local_bh_disable();
-	cpu = smp_processor_id();
-	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[cpu],
-			   t->size,
-			   add_counter_to_entry,
-			   counters,
-			   &i);
 	local_bh_enable();
 }
 
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-	e->counters.bcnt = 0;
-	e->counters.pcnt = 0;
-	return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-	unsigned int cpu;
-	const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-	for_each_possible_cpu(cpu) {
-		memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-		IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-				   zero_entry_counter, NULL);
-	}
-}
-
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
 	struct xt_table_info *private = table->private;
-	struct xt_table_info *info;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		goto nomem;
-
-	info = xt_alloc_table_info(private->size);
-	if (!info)
-		goto free_counters;
-
-	clone_counters(info, private);
-
-	mutex_lock(&table->lock);
-	xt_table_entry_swap_rcu(private, info);
-	synchronize_net();	/* Wait until smoke has cleared */
-
-	get_counters(info, counters);
-	put_counters(private, counters);
-	mutex_unlock(&table->lock);
+		return ERR_PTR(-ENOMEM);
 
-	xt_free_table_info(info);
+	get_counters(private, counters);
 
 	return counters;
-
- free_counters:
-	vfree(counters);
- nomem:
-	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
-	/* Get the old counters. */
+	/* Get the old counters, and synchronize with replace */
 	get_counters(oldinfo, counters);
+
 	/* Decrease module usage counts and free resource */
 	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
 	IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user 
 	return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i;
+	unsigned int i, curcpu;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	mutex_lock(&t->lock);
+
+	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
-	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	curcpu = smp_processor_id();
+	xt_info_wrlock(curcpu);
+	loc_cpu_entry = private->entries[curcpu];
 	IP6T_ENTRY_ITERATE(loc_cpu_entry,
 			  private->size,
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
-	preempt_enable();
+	xt_info_wrunlock(curcpu);
+
  unlock_up_free:
-	mutex_unlock(&t->lock);
+	local_bh_enable();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-04-28 08:01:59.909139984 -0700
+++ b/net/netfilter/x_tables.c	2009-04-28 09:21:33.947865827 -0700
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-			     struct xt_table_info *newinfo)
-{
-	unsigned int cpu;
-
-	for_each_possible_cpu(cpu) {
-		void *p = oldinfo->entries[cpu];
-		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-		newinfo->entries[cpu] = p;
-	}
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 				    const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
 	      struct xt_table_info *newinfo,
 	      int *error)
 {
-	struct xt_table_info *oldinfo, *private;
+	struct xt_table_info *private;
 
 	/* Do the substitution. */
-	mutex_lock(&table->lock);
+	local_bh_disable();
 	private = table->private;
+
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		mutex_unlock(&table->lock);
+		local_bh_enable();
 		*error = -EAGAIN;
 		return NULL;
 	}
-	oldinfo = private;
-	rcu_assign_pointer(table->private, newinfo);
-	newinfo->initial_entries = oldinfo->initial_entries;
-	mutex_unlock(&table->lock);
 
-	synchronize_net();
-	return oldinfo;
+	table->private = newinfo;
+	newinfo->initial_entries = private->initial_entries;
+
+	/*
+	 * Even though table entries have now been swapped, other CPU's
+	 * may still be using the old entries. This is okay, because
+	 * resynchronization happens because of the locking done
+	 * during the get_counters() routine.
+	 */
+	local_bh_enable();
+
+	return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	mutex_init(&table->lock);
 
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
@@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_o
 
 static int __init xt_init(void)
 {
-	int i, rv;
+	unsigned int i;
+	int rv;
+
+	for_each_possible_cpu(i) {
+		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
+		spin_lock_init(&lock->lock);
+		lock->readers = 0;
+	}
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
 	if (!xt)

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: revised locking for x_tables
  2009-04-28 16:24                                                                                                                                             ` [PATCH] netfilter: revised locking for x_tables Stephen Hemminger
@ 2009-04-28 16:50                                                                                                                                               ` Linus Torvalds
  2009-04-28 16:55                                                                                                                                                 ` Linus Torvalds
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-28 16:50 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, mathieu.desnoyers, mingo, dada1, zbr, peterz,
	jarkao2, paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh



Ack. 

It could do with the update from Eric about how non-current CPU writelocks 
only require preempt-disable around get_counters() (and then the 
local_bh_disable() only around the current-CPU case).

I _think_ get_counters() is the only case that can use that optimization, 
but it's quite possible that it's worth doing especially for machines with 
lots of cores, if BH latency is an issue (and it might be).
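
A sketch of that incremental step, following Eric's outline quoted earlier
in the thread (illustrative only, not the code that was merged): a variant
of ip_tables.c's get_counters() using the existing set_entry_to_counter()
and add_entry_to_counter() callbacks.

static void get_counters(const struct xt_table_info *t,
			 struct xt_counters counters[])
{
	unsigned int cpu, curcpu, i;

	preempt_disable();		/* no cpu migration while walking cpus */
	curcpu = smp_processor_id();

	/* Our own lock can also be taken by softirq table processing
	 * on this cpu, so BH must be off while we hold it. */
	local_bh_disable();
	xt_info_wrlock(curcpu);
	i = 0;
	IPT_ENTRY_ITERATE(t->entries[curcpu], t->size,
			  set_entry_to_counter, counters, &i);
	xt_info_wrunlock(curcpu);
	local_bh_enable();

	/* Remote cpus: their softirqs spin on their own lock until we
	 * release it, so only preemption needs to stay disabled. */
	for_each_possible_cpu(cpu) {
		if (cpu == curcpu)
			continue;
		i = 0;
		xt_info_wrlock(cpu);
		IPT_ENTRY_ITERATE(t->entries[cpu], t->size,
				  add_entry_to_counter, counters, &i);
		xt_info_wrunlock(cpu);
	}
	preempt_enable();
}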

Of course, for the lots-and-lots of cores case, even the preemption 
disable might be an issue. And then it really does get much more 
complicated. At that point, you probably want the RCU thing.

			Linus			

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: revised locking for x_tables
  2009-04-28 16:50                                                                                                                                               ` Linus Torvalds
@ 2009-04-28 16:55                                                                                                                                                 ` Linus Torvalds
  2009-04-29  5:37                                                                                                                                                   ` David Miller
  0 siblings, 1 reply; 254+ messages in thread
From: Linus Torvalds @ 2009-04-28 16:55 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, mathieu.desnoyers, mingo, dada1, zbr, peterz,
	jarkao2, paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh



On Tue, 28 Apr 2009, Linus Torvalds wrote:
>
> Ack. 
> 
> It could do with the update from Eric about how non-current CPU writelocks 
> only require preempt-disable around get_counters() (and then the 
> local_bh_disable() only around the current-CPU case).

Btw, regardless, that's an incremental improvement, and does not negate 
the "Ack" part.

			Linus

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use per-CPU r**ursive lock {XV}
  2009-04-28 15:42                                                                                                                                     ` [PATCH] netfilter: use per-CPU r**ursive lock {XV} Paul E. McKenney
@ 2009-04-28 17:35                                                                                                                                       ` Christoph Lameter
  0 siblings, 0 replies; 254+ messages in thread
From: Christoph Lameter @ 2009-04-28 17:35 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: David Miller, mingo, dada1, torvalds, shemminger, zbr, peterz,
	mathieu.desnoyers, jarkao2, paulus, kaber, jeff.chua.linux,
	laijs, jengelh, r000n, linux-kernel, netfilter-devel, netdev,
	benh

On Tue, 28 Apr 2009, Paul E. McKenney wrote:

> > If you can show me how to pass a per-cpu variable,
> > not a dereference of it) as an argument to an inline function,
> > I'll implement this :-)
> >
> > It has to be dereferenced after local_bh_disable() for the
> > read side acquisition.
>
> The way I did this in treercu.c was to create an array of references
> to the per-CPU data in question.  Not necessarily recommended, but
> one way of doing it.  That said, one could argue that we should wait
> until we have at least three users before creating a generic primitive.

The new percpu allocator allows you to create a per cpu pointer and pass
it to functions.

per_cpu_ptr(pointer,cpu) is used to select an instance.
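
A minimal usage sketch of that interface (generic example with made-up
names, not code from this thread):

#include <linux/percpu.h>
#include <linux/errno.h>

struct pkt_stats {
	unsigned long packets;
	unsigned long bytes;
};

static struct pkt_stats *pkt_stats;	/* per-cpu pointer from alloc_percpu() */

static int pkt_stats_alloc(void)
{
	/* one zero-initialized instance per possible cpu */
	pkt_stats = alloc_percpu(struct pkt_stats);
	return pkt_stats ? 0 : -ENOMEM;
}

/* The per-cpu pointer itself is passed around like any other argument ... */
static unsigned long fold_packets(struct pkt_stats *stats)
{
	unsigned long sum = 0;
	int cpu;

	/* ... and per_cpu_ptr() selects the instance of a given cpu. */
	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(stats, cpu)->packets;
	return sum;
}

static void pkt_stats_free(void)
{
	free_percpu(pkt_stats);
}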

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: revised locking for x_tables
  2009-04-28 16:55                                                                                                                                                 ` Linus Torvalds
@ 2009-04-29  5:37                                                                                                                                                   ` David Miller
  2009-04-30  3:26                                                                                                                                                       ` Jeff Chua
  2009-05-01  8:38                                                                                                                                                     ` [PATCH] netfilter: use likely() in xt_info_rdlock_bh() Eric Dumazet
  0 siblings, 2 replies; 254+ messages in thread
From: David Miller @ 2009-04-29  5:37 UTC (permalink / raw)
  To: torvalds
  Cc: shemminger, mathieu.desnoyers, mingo, dada1, zbr, peterz,
	jarkao2, paulus, paulmck, kaber, jeff.chua.linux, laijs, jengelh,
	r000n, linux-kernel, netfilter-devel, netdev, benh

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 28 Apr 2009 09:55:32 -0700 (PDT)

> On Tue, 28 Apr 2009, Linus Torvalds wrote:
>>
>> Ack. 
>> 
>> It could do with the update from Eric about how non-current CPU writelocks 
>> only require preempt-disable around get_counters() (and then the 
>> local_bh_disable() only around the current-CPU case).
> 
> Btw, regardless, that's an incremental improvement, and does not negate 
> the "Ack" part.

I've applied this, thanks everyone!

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: revised locking for x_tables
@ 2009-04-30  3:26                                                                                                                                                       ` Jeff Chua
  0 siblings, 0 replies; 254+ messages in thread
From: Jeff Chua @ 2009-04-30  3:26 UTC (permalink / raw)
  To: David Miller
  Cc: torvalds, mathieu.desnoyers, mingo, dada1, zbr, peterz, jarkao2,
	paulus, paulmck, kaber, laijs, jengelh, r000n, netfilter-devel,
	netdev, benh, Rafael J. Wysocki, Linux Kernel Mailing List,
	Kernel Testers List, Stephen Hemminger

On Wed, Apr 29, 2009 at 1:37 PM, David Miller <davem@davemloft.net> wrote:
> I've applied this, thanks everyone!

I see the patch is already in Linus's tree that I just git pulled.
Tested with 200 iptables rules ... as fast as before the slowdown.

real    0m0.211s
user    0m0.060s
sys     0m0.144s

Thank you all for fixing this bug!


On Sun, Apr 26, 2009 at 3:31 AM, Rafael J. Wysocki <rjw@sisk.pl> wrote:
> Bug-Entry       : http://bugzilla.kernel.org/show_bug.cgi?id=13118
> Subject         : iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49
> Submitter       : Jeff Chua <jeff.chua.linux@gmail.com>
> Date            : 2009-04-10 16:05 (16 days old)
> References      : http://lkml.org/lkml/2009/4/10/111
> Handled-By      : Eric Dumazet <dada1@cosmosbay.com>

Rafael, it's fixed. Please close the case.


Thanks,
Jeff.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: revised locking for x_tables
@ 2009-04-30  3:31                                                                                                                                                         ` David Miller
  0 siblings, 0 replies; 254+ messages in thread
From: David Miller @ 2009-04-30  3:31 UTC (permalink / raw)
  To: jeff.chua.linux
  Cc: torvalds, mathieu.desnoyers, mingo, dada1, zbr, peterz, jarkao2,
	paulus, paulmck, kaber, laijs, jengelh, r000n, netfilter-devel,
	netdev, benh, rjw, linux-kernel, kernel-testers, shemminger

From: Jeff Chua <jeff.chua.linux@gmail.com>
Date: Thu, 30 Apr 2009 11:26:40 +0800

> On Wed, Apr 29, 2009 at 1:37 PM, David Miller <davem@davemloft.net> wrote:
>> I've applied this, thanks everyone!
> 
> I see the patch is already in Linus's tree, which I just git pulled.
> Tested with 200 iptables rules ... as fast as before the slowdown.
> 
> real    0m0.211s
> user    0m0.060s
> sys     0m0.144s
> 
> Thank you all for fixing this bug!

Thanks for testing.

^ permalink raw reply	[flat|nested] 254+ messages in thread

* [PATCH] netfilter: use likely() in xt_info_rdlock_bh()
  2009-04-29  5:37                                                                                                                                                   ` David Miller
  2009-04-30  3:26                                                                                                                                                       ` Jeff Chua
@ 2009-05-01  8:38                                                                                                                                                     ` Eric Dumazet
  2009-05-01 16:10                                                                                                                                                       ` David Miller
  1 sibling, 1 reply; 254+ messages in thread
From: Eric Dumazet @ 2009-05-01  8:38 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, kaber, netfilter-devel, netdev

David Miller wrote:
> From: Linus Torvalds <torvalds@linux-foundation.org>
> Date: Tue, 28 Apr 2009 09:55:32 -0700 (PDT)
> 
>> On Tue, 28 Apr 2009, Linus Torvalds wrote:
>>> Ack. 
>>>
>>> It could do with the update from Eric about how non-current CPU writelocks 
>>> only require preemp-disable around get_counters() (and then the 
>>> local_bh_disable() only around the current-CPU case).
>> Btw, regardless, that's an incremental improvement, and does not negate 
>> the "Ack" part.
> 
> I've applied this, thanks everyone!
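
For context, the incremental update mentioned above would restructure the
counter summation roughly as below. This is only a sketch of the idea, not
the code that was merged: it assumes the xt_info_wrlock()/xt_info_wrunlock()
per-cpu helpers from the revised locking patch and elides the actual walking
of the rule entries.

static void get_counters(const struct xt_table_info *t,
			 struct xt_counters counters[])
{
	unsigned int curcpu, cpu;

	/* Stay on one CPU so that "curcpu" remains the running CPU
	 * for the whole walk.
	 */
	preempt_disable();
	curcpu = smp_processor_id();

	/* Our own slot is only updated from packet processing on this
	 * same CPU, so keeping bottom halves disabled while we read it
	 * is enough; no lock is needed (and taking our own lock here
	 * could deadlock with a softirq trying to take it).
	 */
	local_bh_disable();
	/* ... walk t->entries[curcpu] and copy its counters ... */
	local_bh_enable();

	/* For every other CPU, grabbing that CPU's lock exclusively is
	 * enough to get a stable snapshot of its counters.
	 */
	for_each_possible_cpu(cpu) {
		if (cpu == curcpu)
			continue;
		xt_info_wrlock(cpu);
		/* ... walk t->entries[cpu] and add its counters ... */
		xt_info_wrunlock(cpu);
	}
	preempt_enable();
}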

A small follow-up on the applied patch, since the likely() hints were left out.
(I trimmed down the Cc list, which had grown out of hand.)

It makes a difference on my x86_32 machine with gcc 4.4.0.

Thank you

[PATCH] netfilter: use likely() in xt_info_rdlock_bh()

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1b2e435..c9efe03 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -472,7 +472,7 @@ static inline void xt_info_rdlock_bh(void)
 
 	local_bh_disable();
 	lock = &__get_cpu_var(xt_info_locks);
-	if (!lock->readers++)
+	if (likely(!lock->readers++))
 		spin_lock(&lock->lock);
 }
 
@@ -480,7 +480,7 @@ static inline void xt_info_rdunlock_bh(void)
 {
 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
 
-	if (!--lock->readers)
+	if (likely(!--lock->readers))
 		spin_unlock(&lock->lock);
 	local_bh_enable();
 }
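
For readers following the hunks above, the two helpers read as follows once
the patch is applied. The comments are added here to spell out what the
likely() hints assert; they are not part of the kernel source.

static inline void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	local_bh_disable();
	lock = &__get_cpu_var(xt_info_locks);
	/* Common case: this CPU is not already inside a table lookup,
	 * so the nesting count is zero and we take the per-cpu spinlock.
	 * likely() asks gcc to lay this out as the straight-line path.
	 */
	if (likely(!lock->readers++))
		spin_lock(&lock->lock);
}

static inline void xt_info_rdunlock_bh(void)
{
	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);

	/* Only the outermost user on this CPU releases the spinlock;
	 * nested users merely drop the nesting count.
	 */
	if (likely(!--lock->readers))
		spin_unlock(&lock->lock);
	local_bh_enable();
}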


^ permalink raw reply related	[flat|nested] 254+ messages in thread

* Re: [PATCH] netfilter: use likely() in xt_info_rdlock_bh()
  2009-05-01  8:38                                                                                                                                                     ` [PATCH] netfilter: use likely() in xt_info_rdlock_bh() Eric Dumazet
@ 2009-05-01 16:10                                                                                                                                                       ` David Miller
  0 siblings, 0 replies; 254+ messages in thread
From: David Miller @ 2009-05-01 16:10 UTC (permalink / raw)
  To: dada1; +Cc: shemminger, kaber, netfilter-devel, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 01 May 2009 10:38:20 +0200

> [PATCH] netfilter: use likely() in xt_info_rdlock_bh()
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Applied.

^ permalink raw reply	[flat|nested] 254+ messages in thread

end of thread, other threads:[~2009-05-01 16:10 UTC | newest]

Thread overview: 254+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-10  9:15 iptables very slow after commit784544739a25c30637397ace5489eeb6e15d7d49 Jeff Chua
2009-04-10 16:52 ` Stephen Hemminger
2009-04-11  1:07   ` Jeff Chua
2009-04-11  1:25   ` David Miller
2009-04-11  1:39     ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Linus Torvalds
2009-04-11  4:15       ` Paul E. McKenney
2009-04-11  5:14         ` Jan Engelhardt
2009-04-11  5:42           ` Paul E. McKenney
2009-04-11  6:00           ` David Miller
2009-04-11 18:12             ` Kyle Moffett
2009-04-11 18:12               ` Kyle Moffett
2009-04-11 18:32               ` Arkadiusz Miskiewicz
2009-04-11 18:32                 ` Arkadiusz Miskiewicz
2009-04-12  0:54               ` david
2009-04-12  5:05                 ` Kyle Moffett
2009-04-12  5:05                   ` Kyle Moffett
2009-04-12 12:30                 ` Harald Welte
2009-04-12 16:38             ` Jan Engelhardt
2009-04-11 15:07           ` Stephen Hemminger
2009-04-11 16:05             ` Jeff Chua
2009-04-11 16:05               ` Jeff Chua
2009-04-11 17:51           ` Linus Torvalds
2009-04-11  7:08         ` Ingo Molnar
2009-04-11 15:05           ` Stephen Hemminger
2009-04-11 17:48           ` Paul E. McKenney
2009-04-12 10:54             ` Ingo Molnar
2009-04-12 11:34             ` Paul Mackerras
2009-04-12 17:31               ` Paul E. McKenney
2009-04-13  1:13                 ` David Miller
2009-04-13  4:04                   ` Paul E. McKenney
2009-04-13 16:53                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
2009-04-13 17:40                       ` Eric Dumazet
2009-04-13 17:40                         ` Eric Dumazet
2009-04-13 18:11                         ` Stephen Hemminger
2009-04-13 19:06                       ` Martin Josefsson
2009-04-13 19:17                         ` Linus Torvalds
2009-04-13 22:24                       ` Andrew Morton
2009-04-13 23:20                         ` Stephen Hemminger
2009-04-13 23:26                           ` Andrew Morton
2009-04-13 23:37                             ` Linus Torvalds
2009-04-13 23:52                               ` Ingo Molnar
2009-04-14 12:27                       ` Patrick McHardy
2009-04-14 14:23                         ` Eric Dumazet
2009-04-14 14:45                           ` Stephen Hemminger
2009-04-14 15:49                             ` Eric Dumazet
2009-04-14 15:49                               ` Eric Dumazet
2009-04-14 16:51                               ` Jeff Chua
2009-04-14 18:17                                 ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v2) Stephen Hemminger
2009-04-14 19:28                                   ` Eric Dumazet
2009-04-14 21:11                                     ` Stephen Hemminger
2009-04-14 21:13                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Stephen Hemminger
2009-04-14 21:40                                       ` Eric Dumazet
2009-04-14 21:40                                         ` Eric Dumazet
2009-04-15 10:59                                         ` Patrick McHardy
2009-04-15 10:59                                           ` Patrick McHardy
2009-04-15 16:31                                           ` Stephen Hemminger
2009-04-15 16:31                                             ` Stephen Hemminger
2009-04-15 20:55                                           ` Stephen Hemminger
2009-04-15 21:07                                             ` Eric Dumazet
2009-04-15 21:55                                               ` Jan Engelhardt
2009-04-16 12:12                                                 ` Patrick McHardy
2009-04-16 12:24                                                   ` Jan Engelhardt
2009-04-16 12:24                                                     ` Jan Engelhardt
2009-04-16 12:31                                                     ` Patrick McHardy
2009-04-16 12:31                                                       ` Patrick McHardy
2009-04-15 21:57                                               ` [PATCH] netfilter: use per-cpu rwlock rather than RCU (v4) Stephen Hemminger
2009-04-15 23:48                                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) David Miller
2009-04-16  0:01                                                 ` Stephen Hemminger
2009-04-16  0:05                                                   ` David Miller
2009-04-16 12:28                                                     ` Patrick McHardy
2009-04-16  0:10                                                   ` Linus Torvalds
2009-04-16  0:45                                                     ` [PATCH] netfilter: use per-cpu spinlock and RCU (v5) Stephen Hemminger
2009-04-16  5:01                                                       ` Eric Dumazet
2009-04-16 13:53                                                         ` Patrick McHardy
2009-04-16 13:53                                                           ` Patrick McHardy
2009-04-16 14:47                                                           ` Paul E. McKenney
2009-04-16 14:47                                                             ` Paul E. McKenney
2009-04-16 16:10                                                             ` [PATCH] netfilter: use per-cpu recursive spinlock (v6) Eric Dumazet
2009-04-16 16:10                                                               ` Eric Dumazet
2009-04-16 16:20                                                               ` Eric Dumazet
2009-04-16 16:20                                                                 ` Eric Dumazet
2009-04-16 16:37                                                               ` Linus Torvalds
2009-04-16 16:59                                                                 ` Patrick McHardy
2009-04-16 17:58                                                               ` Paul E. McKenney
2009-04-16 17:58                                                                 ` Paul E. McKenney
2009-04-16 18:41                                                                 ` Eric Dumazet
2009-04-16 20:49                                                                   ` [PATCH[] netfilter: use per-cpu reader-writer lock (v0.7) Stephen Hemminger
2009-04-16 21:02                                                                     ` Linus Torvalds
2009-04-16 23:04                                                                       ` Ingo Molnar
2009-04-17  0:13                                                                   ` [PATCH] netfilter: use per-cpu recursive spinlock (v6) Paul E. McKenney
2009-04-17  0:13                                                                     ` Paul E. McKenney
2009-04-16 13:11                                                     ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Patrick McHardy
2009-04-16 22:33                                                       ` David Miller
2009-04-16 23:49                                                         ` Paul E. McKenney
2009-04-16 23:52                                                           ` [PATCH] netfilter: per-cpu spin-lock with recursion (v0.8) Stephen Hemminger
2009-04-17  0:15                                                             ` Jeff Chua
2009-04-17  5:55                                                             ` Peter Zijlstra
2009-04-17  6:03                                                             ` Eric Dumazet
2009-04-17  6:14                                                               ` Eric Dumazet
2009-04-17  6:14                                                                 ` Eric Dumazet
2009-04-17 17:08                                                                 ` Peter Zijlstra
2009-04-17 11:17                                                               ` Patrick McHardy
2009-04-17 11:17                                                                 ` Patrick McHardy
2009-04-17  1:28                                                           ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Paul E. McKenney
2009-04-17  2:19                                                             ` Mathieu Desnoyers
2009-04-17  5:05                                                               ` Paul E. McKenney
2009-04-17  5:44                                                                 ` Mathieu Desnoyers
2009-04-17 14:51                                                                   ` Paul E. McKenney
2009-04-17  4:50                                                             ` Stephen Hemminger
2009-04-17  5:08                                                               ` Paul E. McKenney
2009-04-17  5:16                                                               ` Eric Dumazet
2009-04-17  5:16                                                                 ` Eric Dumazet
2009-04-17  5:40                                                                 ` Paul E. McKenney
2009-04-17  5:40                                                                   ` Paul E. McKenney
2009-04-17  8:07                                                                   ` David Miller
2009-04-17 15:00                                                                     ` Paul E. McKenney
2009-04-17 17:22                                                                     ` Peter Zijlstra
2009-04-17 17:32                                                                       ` Linus Torvalds
2009-04-17  6:12                                                             ` Peter Zijlstra
2009-04-17 16:33                                                               ` Paul E. McKenney
2009-04-17 16:51                                                                 ` Peter Zijlstra
2009-04-17 21:29                                                                   ` Paul E. McKenney
2009-04-18  9:40                                                             ` Evgeniy Polyakov
2009-04-18 14:14                                                               ` Paul E. McKenney
2009-04-20 17:34                                                                 ` [PATCH] netfilter: use per-cpu recursive lock (v10) Stephen Hemminger
2009-04-20 18:21                                                                   ` Paul E. McKenney
2009-04-20 18:25                                                                   ` Eric Dumazet
2009-04-20 18:25                                                                     ` Eric Dumazet
2009-04-20 20:32                                                                     ` Stephen Hemminger
2009-04-20 20:42                                                                     ` Stephen Hemminger
2009-04-20 21:05                                                                       ` Paul E. McKenney
2009-04-20 21:05                                                                         ` Paul E. McKenney
2009-04-20 21:23                                                                     ` Paul Mackerras
2009-04-20 21:58                                                                       ` Paul E. McKenney
2009-04-20 22:41                                                                         ` Paul Mackerras
2009-04-20 23:01                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v11) Stephen Hemminger
2009-04-21  3:41                                                                             ` Lai Jiangshan
2009-04-21  3:56                                                                               ` Eric Dumazet
2009-04-21  4:15                                                                                 ` Stephen Hemminger
2009-04-21  5:22                                                                                 ` Lai Jiangshan
2009-04-21  5:45                                                                                   ` Stephen Hemminger
2009-04-21  5:45                                                                                     ` Stephen Hemminger
2009-04-21  6:52                                                                                     ` Lai Jiangshan
2009-04-21  8:16                                                                                       ` Evgeniy Polyakov
2009-04-21  8:42                                                                                         ` Lai Jiangshan
2009-04-21  8:49                                                                                           ` David Miller
2009-04-21  8:55                                                                                         ` Eric Dumazet
2009-04-21  9:22                                                                                           ` Evgeniy Polyakov
2009-04-21  9:34                                                                                           ` Lai Jiangshan
2009-04-21  9:34                                                                                             ` Lai Jiangshan
2009-04-21  5:34                                                                                 ` Lai Jiangshan
2009-04-21  5:34                                                                                   ` Lai Jiangshan
2009-04-21  4:59                                                                             ` Eric Dumazet
2009-04-21  4:59                                                                               ` Eric Dumazet
2009-04-21 16:37                                                                               ` Paul E. McKenney
2009-04-21  5:46                                                                             ` Lai Jiangshan
2009-04-21 16:13                                                                             ` Linus Torvalds
2009-04-21 16:43                                                                               ` Stephen Hemminger
2009-04-21 16:50                                                                                 ` Linus Torvalds
2009-04-21 18:02                                                                               ` Ingo Molnar
2009-04-21 18:15                                                                               ` Stephen Hemminger
2009-04-21 19:10                                                                                 ` Ingo Molnar
2009-04-21 19:46                                                                                   ` Eric Dumazet
2009-04-21 19:46                                                                                     ` Eric Dumazet
2009-04-22  7:35                                                                                     ` Ingo Molnar
2009-04-22  7:35                                                                                       ` Ingo Molnar
2009-04-22  8:53                                                                                       ` Eric Dumazet
2009-04-22 10:13                                                                                         ` Jarek Poplawski
2009-04-22 11:26                                                                                           ` Ingo Molnar
2009-04-22 11:39                                                                                             ` Jarek Poplawski
2009-04-22 11:18                                                                                         ` Ingo Molnar
2009-04-22 15:19                                                                                         ` Linus Torvalds
2009-04-22 16:57                                                                                           ` Eric Dumazet
2009-04-22 17:18                                                                                             ` Linus Torvalds
2009-04-22 20:46                                                                                               ` Jarek Poplawski
2009-04-22 17:48                                                                                         ` Ingo Molnar
2009-04-21 21:04                                                                                   ` Stephen Hemminger
2009-04-22  8:00                                                                                     ` Ingo Molnar
2009-04-21 19:39                                                                                 ` Ingo Molnar
2009-04-21 21:39                                                                                   ` [PATCH] netfilter: use per-cpu recursive lock (v13) Stephen Hemminger
2009-04-22  4:17                                                                                     ` Paul E. McKenney
2009-04-22 14:57                                                                                     ` Eric Dumazet
2009-04-22 15:32                                                                                     ` Linus Torvalds
2009-04-24  4:09                                                                                       ` [PATCH] netfilter: use per-CPU recursive lock {XIV} Stephen Hemminger
2009-04-24  4:58                                                                                         ` Eric Dumazet
2009-04-24 15:33                                                                                           ` Patrick McHardy
2009-04-24 15:33                                                                                             ` Patrick McHardy
2009-04-24 16:18                                                                                           ` Stephen Hemminger
2009-04-24 16:18                                                                                             ` Stephen Hemminger
2009-04-24 20:43                                                                                             ` Jarek Poplawski
2009-04-25 20:30                                                                                               ` [PATCH] netfilter: iptables no lockdep is needed Stephen Hemminger
2009-04-26  8:18                                                                                                 ` Jarek Poplawski
2009-04-26 18:24                                                                                                 ` [PATCH] netfilter: use per-CPU recursive lock {XV} Eric Dumazet
2009-04-26 18:56                                                                                                   ` Mathieu Desnoyers
2009-04-26 21:57                                                                                                     ` Stephen Hemminger
2009-04-26 22:32                                                                                                       ` Mathieu Desnoyers
2009-04-27 17:44                                                                                                       ` Peter Zijlstra
2009-04-27 18:30                                                                                                         ` [PATCH] netfilter: use per-CPU r**ursive " Stephen Hemminger
2009-04-27 18:54                                                                                                           ` Ingo Molnar
2009-04-27 19:06                                                                                                             ` Stephen Hemminger
2009-04-27 19:46                                                                                                               ` Linus Torvalds
2009-04-27 19:48                                                                                                                 ` Linus Torvalds
2009-04-27 20:36                                                                                                                 ` Evgeniy Polyakov
2009-04-27 20:58                                                                                                                   ` Linus Torvalds
2009-04-27 21:40                                                                                                                     ` Stephen Hemminger
2009-04-27 21:40                                                                                                                       ` Stephen Hemminger
2009-04-27 22:24                                                                                                                       ` Linus Torvalds
2009-04-27 23:01                                                                                                                         ` Linus Torvalds
2009-04-27 23:03                                                                                                                           ` Linus Torvalds
2009-04-28  6:58                                                                                                                             ` Eric Dumazet
2009-04-28  6:58                                                                                                                               ` Eric Dumazet
2009-04-28 11:53                                                                                                                               ` David Miller
2009-04-28 12:40                                                                                                                                 ` Ingo Molnar
2009-04-28 13:43                                                                                                                                   ` David Miller
2009-04-28 13:52                                                                                                                                     ` Mathieu Desnoyers
2009-04-28 14:37                                                                                                                                       ` David Miller
2009-04-28 14:49                                                                                                                                         ` Mathieu Desnoyers
2009-04-28 15:00                                                                                                                                           ` David Miller
2009-04-28 16:24                                                                                                                                             ` [PATCH] netfilter: revised locking for x_tables Stephen Hemminger
2009-04-28 16:50                                                                                                                                               ` Linus Torvalds
2009-04-28 16:55                                                                                                                                                 ` Linus Torvalds
2009-04-29  5:37                                                                                                                                                   ` David Miller
2009-04-30  3:26                                                                                                                                                     ` Jeff Chua
2009-04-30  3:26                                                                                                                                                       ` Jeff Chua
2009-04-30  3:31                                                                                                                                                       ` David Miller
2009-04-30  3:31                                                                                                                                                         ` David Miller
2009-05-01  8:38                                                                                                                                                     ` [PATCH] netfilter: use likely() in xt_info_rdlock_bh() Eric Dumazet
2009-05-01 16:10                                                                                                                                                       ` David Miller
2009-04-28 15:42                                                                                                                                     ` [PATCH] netfilter: use per-CPU r**ursive lock {XV} Paul E. McKenney
2009-04-28 17:35                                                                                                                                       ` Christoph Lameter
2009-04-28 15:09                                                                                                                               ` Linus Torvalds
2009-04-27 23:32                                                                                                                           ` Linus Torvalds
2009-04-28  7:41                                                                                                                             ` Peter Zijlstra
2009-04-28 14:22                                                                                                                               ` Paul E. McKenney
2009-04-28  7:42                                                                                                                 ` Jan Engelhardt
2009-04-26 19:31                                                                                                   ` [PATCH] netfilter: use per-CPU recursive " Mathieu Desnoyers
2009-04-26 20:55                                                                                                     ` Eric Dumazet
2009-04-26 20:55                                                                                                       ` Eric Dumazet
2009-04-26 21:39                                                                                                       ` Mathieu Desnoyers
2009-04-21 18:34                                                                               ` [PATCH] netfilter: use per-cpu recursive lock (v11) Paul E. McKenney
2009-04-21 20:14                                                                                 ` Linus Torvalds
2009-04-20 23:44                                                                           ` [PATCH] netfilter: use per-cpu recursive lock (v10) Paul E. McKenney
2009-04-16  0:02                                                 ` [PATCH] netfilter: use per-cpu spinlock rather than RCU (v3) Linus Torvalds
2009-04-16  6:26                                                 ` Eric Dumazet
2009-04-16 14:33                                                   ` Paul E. McKenney
2009-04-15  3:23                                       ` David Miller
2009-04-14 17:19                               ` [PATCH] netfilter: use per-cpu spinlock rather than RCU Stephen Hemminger
2009-04-11 15:50         ` iptables very slow after commit 784544739a25c30637397ace5489eeb6e15d7d49 Stephen Hemminger
2009-04-11 17:43           ` Paul E. McKenney
2009-04-11 18:57         ` Linus Torvalds
2009-04-12  0:34           ` Paul E. McKenney
2009-04-12  7:23             ` Evgeniy Polyakov
2009-04-12 16:06             ` Stephen Hemminger
2009-04-12 17:30               ` Paul E. McKenney
