linux-kernel.vger.kernel.org archive mirror
* [patch v3 0/3] percpu_counter: cleanup and fix
@ 2011-04-14  2:04 shaohua.li
  2011-04-14  2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: shaohua.li @ 2011-04-14  2:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet

Clean up the percpu_counter code and fix some bugs. The main purpose is to
convert percpu_counter to use atomic64, which helps workloads that suffer from
percpu_counter->lock contention. In a workload I tested, the atomic method is
7x faster (see patch 3 for details).

Note, patch 3 is against Christoph's 'percpu: preemptless __per_cpu_counter_add'
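
For reference, a minimal sketch of the percpu_counter API these patches touch
(illustrative only; the counter name, the calling context and the error
handling here are made up and not part of the patches):

	struct percpu_counter nr_items;

	/* allocate the per-cpu counters; may fail with -ENOMEM on SMP */
	if (percpu_counter_init(&nr_items, 0))
		return -ENOMEM;

	percpu_counter_add(&nr_items, 1);	/* fast path, mostly per-cpu */
	pr_info("approx: %lld\n", (long long)percpu_counter_read(&nr_items));
	pr_info("exact:  %lld\n", (long long)percpu_counter_sum(&nr_items));

	percpu_counter_destroy(&nr_items);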

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [patch v3 1/3] percpu_counter: change return value and add comments
  2011-04-14  2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
@ 2011-04-14  2:04 ` shaohua.li
  2011-04-14  2:32   ` Eric Dumazet
  2011-04-15  3:41   ` Tejun Heo
  2011-04-14  2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
  2011-04-14  2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
  2 siblings, 2 replies; 14+ messages in thread
From: shaohua.li @ 2011-04-14  2:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet, Shaohua Li

[-- Attachment #1: percpu-counter-positive.patch --]
[-- Type: text/plain, Size: 1164 bytes --]

The percpu_counter_*_positive() APIs in the UP case don't check whether the
return value is positive. Add comments to explain why we don't.
Also, if count < 0, return 0 instead of 1 from *read_positive().
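
To illustrate why 0 is the natural clamp, consider a hypothetical caller (the
structure and field names below are made up for illustration, not taken from
the kernel):

	s64 free = percpu_counter_read_positive(&sbi->free_blocks);

	if (free < blocks_wanted)
		return -ENOSPC;

A counter that has transiently gone negative should look empty (0) to such a
caller rather than pretending one block is still available (1).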

Signed-off-by: Shaohua Li <shaohua.li@intel.com>

---
 include/linux/percpu_counter.h |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h	2011-04-13 16:09:22.000000000 +0800
+++ linux/include/linux/percpu_counter.h	2011-04-14 08:50:45.000000000 +0800
@@ -75,7 +75,7 @@ static inline s64 percpu_counter_read_po
 	barrier();		/* Prevent reloads of fbc->count */
 	if (ret >= 0)
 		return ret;
-	return 1;
+	return 0;
 }
 
 static inline int percpu_counter_initialized(struct percpu_counter *fbc)
@@ -133,6 +133,10 @@ static inline s64 percpu_counter_read(st
 	return fbc->count;
 }
 
+/*
+ * percpu_counter is intended to track positive number. In UP case, the number
+ * should never be negative.
+ */
 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
 	return fbc->count;


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
  2011-04-14  2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
  2011-04-14  2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
@ 2011-04-14  2:04 ` shaohua.li
  2011-04-14  2:35   ` Eric Dumazet
  2011-04-14  2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
  2 siblings, 1 reply; 14+ messages in thread
From: shaohua.li @ 2011-04-14  2:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet, Shaohua Li

[-- Attachment #1: percpu-counter-32bits.patch --]
[-- Type: text/plain, Size: 2304 bytes --]

percpu_counter.count is an 's64'. Accessing it on a 32-bit system is racy;
we need some protection around it, otherwise a very wrong value could be
read.
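
As an illustration of the hazard (a user-space sketch, not kernel code): on a
32-bit machine a 64-bit store is performed as two 32-bit stores, so a reader
that preempts the writer between the two halves can observe a value that is
neither the old nor the new one:

	#include <stdint.h>

	/* illustrative only: simulate updating 0x00000000ffffffff
	 * to 0x0000000100000000 on a little-endian 32-bit machine */
	union split64 { int64_t whole; uint32_t half[2]; };

	void torn_update(union split64 *c)
	{
		c->half[0] = 0x00000000;	/* low word stored first */
		/* a reader preempting the writer here sees 0, which is
		 * neither 0xffffffff nor 0x100000000 */
		c->half[1] = 0x00000001;	/* high word stored second */
	}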

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 include/linux/percpu_counter.h |   37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h	2011-04-14 08:50:45.000000000 +0800
+++ linux/include/linux/percpu_counter.h	2011-04-14 09:09:14.000000000 +0800
@@ -89,9 +89,20 @@ struct percpu_counter {
 	s64 count;
 };
 
-static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
+#if BITS_PER_LONG == 32
+	preempt_disable();
 	fbc->count = amount;
+	preempt_enable();
+#else
+	fbc->count = amount;
+#endif
+}
+
+static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+{
+	percpu_counter_set(fbc, amount);
 	return 0;
 }
 
@@ -99,16 +110,25 @@ static inline void percpu_counter_destro
 {
 }
 
-static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
-	fbc->count = amount;
+#if BITS_PER_LONG == 32
+	s64 count;
+	preempt_disable();
+	count = fbc->count;
+	preempt_enable();
+	return count;
+#else
+	return fbc->count;
+#endif
 }
 
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 {
-	if (fbc->count > rhs)
+	s64 count = percpu_counter_read(fbc);
+	if (count > rhs)
 		return 1;
-	else if (fbc->count < rhs)
+	else if (count < rhs)
 		return -1;
 	else
 		return 0;
@@ -128,18 +148,13 @@ __percpu_counter_add(struct percpu_count
 	percpu_counter_add(fbc, amount);
 }
 
-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
-{
-	return fbc->count;
-}
-
 /*
  * percpu_counter is intended to track positive number. In UP case, the number
  * should never be negative.
  */
 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
-	return fbc->count;
+	return percpu_counter_read(fbc);
 }
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
  2011-04-14  2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
  2011-04-14  2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
  2011-04-14  2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
@ 2011-04-14  2:04 ` shaohua.li
  2011-04-15  4:15   ` Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: shaohua.li @ 2011-04-14  2:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet, Shaohua Li

[-- Attachment #1: percpu-counter-atomic.patch --]
[-- Type: text/plain, Size: 5804 bytes --]

Use atomic64 for percpu_counter, because it is cheaper than a spinlock. This
doesn't slow down the fast path (percpu_counter_read): atomic64_read is just a
plain read of fbc->count on 64-bit systems, and spin_lock-read-spin_unlock on
32-bit systems.

Note: originally percpu_counter_read on 32-bit systems did not take the
spinlock, which is buggy and can return a very wrong value.
This patch fixes the issue.

This can also improve workloads where percpu_counter->lock is heavily
contended; for example, vm_committed_as sometimes causes such contention.
We should tune the batch count, but if we can make percpu_counter better,
why not? On a 24-CPU system with 24 processes, each running:
while (1) {
	mmap(128M);
	munmap(128M);
}
we then measure how many loop iterations each process completes:
orig: 1226976
patched: 8210626
The atomic method is about 7x faster.
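
For reference, the loop above roughly corresponds to the following user-space
sketch (illustrative; the 60 second run time is arbitrary and this is not the
exact harness used for the numbers above):

	#include <stdio.h>
	#include <sys/mman.h>
	#include <time.h>

	#define SZ	(128UL << 20)	/* 128M */

	int main(void)
	{
		unsigned long loops = 0;
		time_t end = time(NULL) + 60;

		while (time(NULL) < end) {
			void *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
			if (p == MAP_FAILED)
				return 1;
			munmap(p, SZ);
			loops++;
		}
		printf("%lu loops\n", loops);
		return 0;
	}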

percpu_counter_set() and __percpu_counter_sum() are no longer protected by a
lock. This means we might get an imprecise count, but we had the same issue
even with the lock, because __percpu_counter_add doesn't take the lock when
updating the cpu-local count.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 include/linux/percpu_counter.h |   18 +++--------------
 lib/percpu_counter.c           |   42 +++++++++++++++++++++--------------------
 2 files changed, 26 insertions(+), 34 deletions(-)

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h	2011-04-14 09:50:36.000000000 +0800
+++ linux/include/linux/percpu_counter.h	2011-04-14 09:53:56.000000000 +0800
@@ -16,8 +16,7 @@
 #ifdef CONFIG_SMP
 
 struct percpu_counter {
-	spinlock_t lock;
-	s64 count;
+	atomic64_t count;
 #ifdef CONFIG_HOTPLUG_CPU
 	struct list_head list;	/* All percpu_counters are on a list */
 #endif
@@ -26,16 +25,7 @@ struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
-			  struct lock_class_key *key);
-
-#define percpu_counter_init(fbc, value)					\
-	({								\
-		static struct lock_class_key __key;			\
-									\
-		__percpu_counter_init(fbc, value, &__key);		\
-	})
-
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
@@ -60,7 +50,7 @@ static inline s64 percpu_counter_sum(str
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
-	return fbc->count;
+	return atomic64_read(&fbc->count);
 }
 
 /*
@@ -70,7 +60,7 @@ static inline s64 percpu_counter_read(st
  */
 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
-	s64 ret = fbc->count;
+	s64 ret = percpu_counter_read(fbc);
 
 	barrier();		/* Prevent reloads of fbc->count */
 	if (ret >= 0)
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c	2011-04-14 09:53:04.000000000 +0800
+++ linux/lib/percpu_counter.c	2011-04-14 10:01:29.000000000 +0800
@@ -59,13 +59,17 @@ void percpu_counter_set(struct percpu_co
 {
 	int cpu;
 
-	spin_lock(&fbc->lock);
+	/*
+	 * Don't really need to disable preempt here, just make sure there is
+	 * no big latency because of preemption
+	 */
+	preempt_disable();
 	for_each_possible_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		*pcount = 0;
 	}
-	fbc->count = amount;
-	spin_unlock(&fbc->lock);
+	atomic64_set(&fbc->count, amount);
+	preempt_enable();
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
@@ -79,11 +83,11 @@ void __percpu_counter_add(struct percpu_
 		new = count + amount;
 		/* In case of overflow fold it into the global counter instead */
 		if (new >= batch || new <= -batch) {
-			spin_lock(&fbc->lock);
+			preempt_disable();
 			count = __this_cpu_read(*fbc->counters);
-			fbc->count += count + amount;
+			atomic64_add(count + amount, &fbc->count);
 			__this_cpu_write(*fbc->counters, 0);
-			spin_unlock(&fbc->lock);
+			preempt_enable();
 			return;
 		}
 
@@ -97,26 +101,27 @@ EXPORT_SYMBOL(__percpu_counter_add);
  */
 s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
-	s64 ret;
+	s64 ret = 0;
 	int cpu;
 
-	spin_lock(&fbc->lock);
-	ret = fbc->count;
+	/*
+	 * Don't really need to disable preempt here, just make sure there is
+	 * no big latency because of preemption
+	 */
+	preempt_disable();
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
-	spin_unlock(&fbc->lock);
+	ret += atomic64_read(&fbc->count);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
-			  struct lock_class_key *key)
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
-	spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
+	atomic64_set(&fbc->count, amount);
 	fbc->counters = alloc_percpu(s32);
 	if (!fbc->counters)
 		return -ENOMEM;
@@ -131,7 +136,7 @@ int __percpu_counter_init(struct percpu_
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(percpu_counter_init);
 
 void percpu_counter_destroy(struct percpu_counter *fbc)
 {
@@ -175,13 +180,10 @@ static int __cpuinit percpu_counter_hotc
 	mutex_lock(&percpu_counters_lock);
 	list_for_each_entry(fbc, &percpu_counters, list) {
 		s32 *pcount;
-		unsigned long flags;
 
-		spin_lock_irqsave(&fbc->lock, flags);
 		pcount = per_cpu_ptr(fbc->counters, cpu);
-		fbc->count += *pcount;
+		atomic64_add(*pcount, &fbc->count);
 		*pcount = 0;
-		spin_unlock_irqrestore(&fbc->lock, flags);
 	}
 	mutex_unlock(&percpu_counters_lock);
 #endif


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 1/3] percpu_counter: change return value and add comments
  2011-04-14  2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
@ 2011-04-14  2:32   ` Eric Dumazet
  2011-04-15  3:41   ` Tejun Heo
  1 sibling, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2011-04-14  2:32 UTC (permalink / raw)
  To: shaohua.li; +Cc: linux-kernel, akpm, tj, cl

On Thursday, 14 April 2011 at 10:04 +0800, shaohua.li@intel.com wrote:
> plain text document attachment (percpu-counter-positive.patch)
> The percpu_counter_*_positive() APIs in the UP case don't check whether the
> return value is positive. Add comments to explain why we don't.
> Also, if count < 0, return 0 instead of 1 from *read_positive().
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> 
> ---
>  include/linux/percpu_counter.h |    6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> Index: linux/include/linux/percpu_counter.h
> ===================================================================
> --- linux.orig/include/linux/percpu_counter.h	2011-04-13 16:09:22.000000000 +0800
> +++ linux/include/linux/percpu_counter.h	2011-04-14 08:50:45.000000000 +0800
> @@ -75,7 +75,7 @@ static inline s64 percpu_counter_read_po
>  	barrier();		/* Prevent reloads of fbc->count */
>  	if (ret >= 0)
>  		return ret;
> -	return 1;
> +	return 0;
>  }
>  
>  static inline int percpu_counter_initialized(struct percpu_counter *fbc)
> @@ -133,6 +133,10 @@ static inline s64 percpu_counter_read(st
>  	return fbc->count;
>  }
>  
> +/*
> + * percpu_counter is intended to track positive number. In UP case, the number
> + * should never be negative.
> + */
>  static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
>  {
>  	return fbc->count;
> 


Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
  2011-04-14  2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
@ 2011-04-14  2:35   ` Eric Dumazet
  2011-04-14  2:44     ` Shaohua Li
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2011-04-14  2:35 UTC (permalink / raw)
  To: shaohua.li; +Cc: linux-kernel, akpm, tj, cl

On Thursday, 14 April 2011 at 10:04 +0800, shaohua.li@intel.com wrote:
> plain text document attachment (percpu-counter-32bits.patch)
> percpu_counter.count is an 's64'. Accessing it on a 32-bit system is racy;
> we need some protection around it, otherwise a very wrong value could be
> read.
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> ---
>  include/linux/percpu_counter.h |   37 ++++++++++++++++++++++++++-----------
>  1 file changed, 26 insertions(+), 11 deletions(-)
> 
> Index: linux/include/linux/percpu_counter.h
> ===================================================================
> --- linux.orig/include/linux/percpu_counter.h	2011-04-14 08:50:45.000000000 +0800
> +++ linux/include/linux/percpu_counter.h	2011-04-14 09:09:14.000000000 +0800
> @@ -89,9 +89,20 @@ struct percpu_counter {
>  	s64 count;
>  };
>  
> -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> +static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
>  {
> +#if BITS_PER_LONG == 32
> +	preempt_disable();
>  	fbc->count = amount;
> +	preempt_enable();
> +#else
> +	fbc->count = amount;
> +#endif
> +}
> +
> +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> +{
> +	percpu_counter_set(fbc, amount);
>  	return 0;
>  }
>  

Please don't do this and obfuscate the code.

percpu_counter_init() cannot be racy.

It's like saying spin_lock_init() could be racy.

No other thread/cpu is supposed to use the object if not yet
initialized.




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
  2011-04-14  2:35   ` Eric Dumazet
@ 2011-04-14  2:44     ` Shaohua Li
  2011-04-14  2:54       ` Eric Dumazet
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2011-04-14  2:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel, akpm, tj, cl

On Thu, 2011-04-14 at 10:35 +0800, Eric Dumazet wrote:
> On Thursday, 14 April 2011 at 10:04 +0800, shaohua.li@intel.com wrote:
> > plain text document attachment (percpu-counter-32bits.patch)
> > percpu_counter.count is an 's64'. Accessing it on a 32-bit system is racy;
> > we need some protection around it, otherwise a very wrong value could be
> > read.
> > 
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > ---
> >  include/linux/percpu_counter.h |   37 ++++++++++++++++++++++++++-----------
> >  1 file changed, 26 insertions(+), 11 deletions(-)
> > 
> > Index: linux/include/linux/percpu_counter.h
> > ===================================================================
> > --- linux.orig/include/linux/percpu_counter.h	2011-04-14 08:50:45.000000000 +0800
> > +++ linux/include/linux/percpu_counter.h	2011-04-14 09:09:14.000000000 +0800
> > @@ -89,9 +89,20 @@ struct percpu_counter {
> >  	s64 count;
> >  };
> >  
> > -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> > +static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
> >  {
> > +#if BITS_PER_LONG == 32
> > +	preempt_disable();
> >  	fbc->count = amount;
> > +	preempt_enable();
> > +#else
> > +	fbc->count = amount;
> > +#endif
> > +}
> > +
> > +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> > +{
> > +	percpu_counter_set(fbc, amount);
> >  	return 0;
> >  }
> >  
> 
> Please don't do this and obfuscate the code.
> 
> percpu_counter_init() cannot be racy.
> 
> It's like saying spin_lock_init() could be racy.
> 
> No other thread/cpu is supposed to use the object if not yet
> initialized.
That's true. I just want the code to look consistent, and this doesn't
really obfuscate the code to me.

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
  2011-04-14  2:44     ` Shaohua Li
@ 2011-04-14  2:54       ` Eric Dumazet
  2011-04-18  7:41         ` Shaohua Li
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2011-04-14  2:54 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-kernel, akpm, tj, cl

On Thursday, 14 April 2011 at 10:44 +0800, Shaohua Li wrote:
> That's true. I just want the code to look consistent, and this doesn't
> really obfuscate the code to me.

Doing so gives a false sense of security to the user.

percpu_counter_init() is unsafe by nature.




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 1/3] percpu_counter: change return value and add comments
  2011-04-14  2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
  2011-04-14  2:32   ` Eric Dumazet
@ 2011-04-15  3:41   ` Tejun Heo
  1 sibling, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2011-04-15  3:41 UTC (permalink / raw)
  To: shaohua.li; +Cc: linux-kernel, akpm, cl, eric.dumazet

On Thu, Apr 14, 2011 at 10:04:48AM +0800, shaohua.li@intel.com wrote:
> The percpu_counter_*_positive() APIs in the UP case don't check whether the
> return value is positive. Add comments to explain why we don't.
> Also, if count < 0, return 0 instead of 1 from *read_positive().
> 
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>

Applied to percpu#for-2.6.40 w/ Eric's Acked-by added.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
  2011-04-14  2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
@ 2011-04-15  4:15   ` Tejun Heo
  2011-04-18  7:43     ` Shaohua Li
  0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2011-04-15  4:15 UTC (permalink / raw)
  To: shaohua.li; +Cc: linux-kernel, akpm, cl, eric.dumazet

Hello,

On Thu, Apr 14, 2011 at 10:04:50AM +0800, shaohua.li@intel.com wrote:
> Index: linux/lib/percpu_counter.c
> ===================================================================
> --- linux.orig/lib/percpu_counter.c	2011-04-14 09:53:04.000000000 +0800
> +++ linux/lib/percpu_counter.c	2011-04-14 10:01:29.000000000 +0800
> @@ -59,13 +59,17 @@ void percpu_counter_set(struct percpu_co
>  {
>  	int cpu;
>  
> -	spin_lock(&fbc->lock);
> +	/*
> +	 * Don't really need to disable preempt here, just make sure there is
> +	 * no big latency because of preemption
> +	 */
> +	preempt_disable();
>  	for_each_possible_cpu(cpu) {
>  		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
>  		*pcount = 0;
>  	}
> -	fbc->count = amount;
> -	spin_unlock(&fbc->lock);
> +	atomic64_set(&fbc->count, amount);
> +	preempt_enable();

Disabling preemption here doesn't make any sense.
percpu_counter_set() inherently requires its users to guarantee that
no other user is modifying the percpu counter.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
  2011-04-14  2:54       ` Eric Dumazet
@ 2011-04-18  7:41         ` Shaohua Li
  0 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2011-04-18  7:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: linux-kernel, akpm, tj, cl

On Thu, 2011-04-14 at 10:54 +0800, Eric Dumazet wrote:
> On Thursday, 14 April 2011 at 10:44 +0800, Shaohua Li wrote:
> > That's true. I just want the code to look consistent, and this doesn't
> > really obfuscate the code to me.
> 
> Doing so gives a false sense of security to the user.
> 
> percpu_counter_init() is unsafe by nature.
I really don't think this matters, but since you insist, here is the
updated patch.

Subject: percpu_counter: fix code for 32bit systems for UP

percpu_counter.count is an 's64'. Accessing it on a 32-bit system is racy;
we need some protection around it, otherwise a very wrong value could be
read.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 include/linux/percpu_counter.h |   31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h	2011-04-18 15:36:31.000000000 +0800
+++ linux/include/linux/percpu_counter.h	2011-04-18 15:37:35.000000000 +0800
@@ -101,14 +101,34 @@ static inline void percpu_counter_destro
 
 static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 {
+#if BITS_PER_LONG == 32
+	preempt_disable();
+	fbc->count = amount;
+	preempt_enable();
+#else
 	fbc->count = amount;
+#endif
+}
+
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+{
+#if BITS_PER_LONG == 32
+	s64 count;
+	preempt_disable();
+	count = fbc->count;
+	preempt_enable();
+	return count;
+#else
+	return fbc->count;
+#endif
 }
 
 static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 {
-	if (fbc->count > rhs)
+	s64 count = percpu_counter_read(fbc);
+	if (count > rhs)
 		return 1;
-	else if (fbc->count < rhs)
+	else if (count < rhs)
 		return -1;
 	else
 		return 0;
@@ -128,18 +148,13 @@ __percpu_counter_add(struct percpu_count
 	percpu_counter_add(fbc, amount);
 }
 
-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
-{
-	return fbc->count;
-}
-
 /*
  * percpu_counter is intended to track positive number. In UP case, the number
  * should never be negative.
  */
 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
-	return fbc->count;
+	return percpu_counter_read(fbc);
 }
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
  2011-04-15  4:15   ` Tejun Heo
@ 2011-04-18  7:43     ` Shaohua Li
  2011-04-18 14:25       ` Christoph Lameter
  0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2011-04-18  7:43 UTC (permalink / raw)
  To: Tejun Heo; +Cc: linux-kernel, akpm, cl, eric.dumazet

On Fri, 2011-04-15 at 12:15 +0800, Tejun Heo wrote:
> Hello,
> 
> On Thu, Apr 14, 2011 at 10:04:50AM +0800, shaohua.li@intel.com wrote:
> > Index: linux/lib/percpu_counter.c
> > ===================================================================
> > --- linux.orig/lib/percpu_counter.c	2011-04-14 09:53:04.000000000 +0800
> > +++ linux/lib/percpu_counter.c	2011-04-14 10:01:29.000000000 +0800
> > @@ -59,13 +59,17 @@ void percpu_counter_set(struct percpu_co
> >  {
> >  	int cpu;
> >  
> > -	spin_lock(&fbc->lock);
> > +	/*
> > +	 * Don't really need to disable preempt here, just make sure there is
> > +	 * no big latency because of preemption
> > +	 */
> > +	preempt_disable();
> >  	for_each_possible_cpu(cpu) {
> >  		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> >  		*pcount = 0;
> >  	}
> > -	fbc->count = amount;
> > -	spin_unlock(&fbc->lock);
> > +	atomic64_set(&fbc->count, amount);
> > +	preempt_enable();
> 
> Disabling preemption here doesn't make any sense.
> percpu_counter_set() inherently requires its users to guarantee that
> no other user is modifying the percpu counter.
Ha, OK.
Should I still rebase the patch against Christoph's patch? It looks like
that one is still not settled down.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
  2011-04-18  7:43     ` Shaohua Li
@ 2011-04-18 14:25       ` Christoph Lameter
  2011-04-20  1:04         ` Shaohua Li
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2011-04-18 14:25 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Tejun Heo, linux-kernel, akpm, eric.dumazet

On Mon, 18 Apr 2011, Shaohua Li wrote:

> > Disabling preemption here doesn't make any sense.
> > percpu_counter_set() inherently requires its users to guarantee that
> > no other user is modifying the percpu counter.
> Ha, OK.
> Should I still rebase the patch against Christoph's patch? It looks like
> that one is still not settled down.

I am kind of confused about some of the arguments made there right now,
and having your patch in, which does the conversion to atomic64, would
simplify my patch (it removes the spin_lock/unlock sequence in overflow
handling).



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
  2011-04-18 14:25       ` Christoph Lameter
@ 2011-04-20  1:04         ` Shaohua Li
  0 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2011-04-20  1:04 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Tejun Heo, linux-kernel, akpm, eric.dumazet

On Mon, 2011-04-18 at 22:25 +0800, Christoph Lameter wrote:
> On Mon, 18 Apr 2011, Shaohua Li wrote:
> 
> > > Disabling preemption here doesn't make any sense.
> > > percpu_counter_set() inherently requires its users to guarantee that
> > > no other user is modifying the percpu counter.
> > Ha, OK.
> > Should I still rebase the patch against Christoph's patch? It looks like
> > that one is still not settled down.
> 
> I am kind of confused about some of the arguments made there right now,
> and having your patch in, which does the conversion to atomic64, would
> simplify my patch (it removes the spin_lock/unlock sequence in overflow
> handling).
below is the updated patch against Christoph's latest patch.


Subject: percpu_counter: use atomic64 for counter in SMP

Use atomic64 for percpu_counter, because it is cheaper than a spinlock. This
doesn't slow down the fast path (percpu_counter_read): atomic64_read is just a
plain read of fbc->count on 64-bit systems, and spin_lock-read-spin_unlock on
32-bit systems.

Note: originally percpu_counter_read on 32-bit systems did not take the
spinlock, which is buggy and can return a very wrong value.
This patch fixes the issue.

This can also improve workloads where percpu_counter->lock is heavily
contended; for example, vm_committed_as sometimes causes such contention.
We should tune the batch count, but if we can make percpu_counter better,
why not? On a 24-CPU system with 24 processes, each running:
while (1) {
	mmap(128M);
	munmap(128M);
}
we then measure how many loop iterations each process completes:
orig: 1226976
patched: 8210626
The atomic method is about 7x faster.

percpu_counter_set() and __percpu_counter_sum() are no longer protected by a
lock. This means we might get an imprecise count, but we had the same issue
even with the lock, because __percpu_counter_add doesn't take the lock when
updating the cpu-local count.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
 include/linux/percpu_counter.h |   18 ++++--------------
 lib/percpu_counter.c           |   37 +++++++++++++++----------------------
 2 files changed, 19 insertions(+), 36 deletions(-)

Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h	2011-04-19 08:52:14.000000000 +0800
+++ linux/include/linux/percpu_counter.h	2011-04-19 08:52:58.000000000 +0800
@@ -16,8 +16,7 @@
 #ifdef CONFIG_SMP
 
 struct percpu_counter {
-	spinlock_t lock;
-	s64 count;
+	atomic64_t count;
 #ifdef CONFIG_HOTPLUG_CPU
 	struct list_head list;	/* All percpu_counters are on a list */
 #endif
@@ -26,16 +25,7 @@ struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
-			  struct lock_class_key *key);
-
-#define percpu_counter_init(fbc, value)					\
-	({								\
-		static struct lock_class_key __key;			\
-									\
-		__percpu_counter_init(fbc, value, &__key);		\
-	})
-
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
@@ -60,7 +50,7 @@ static inline s64 percpu_counter_sum(str
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
 {
-	return fbc->count;
+	return atomic64_read(&fbc->count);
 }
 
 /*
@@ -70,7 +60,7 @@ static inline s64 percpu_counter_read(st
  */
 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
 {
-	s64 ret = fbc->count;
+	s64 ret = percpu_counter_read(fbc);
 
 	barrier();		/* Prevent reloads of fbc->count */
 	if (ret >= 0)
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c	2011-04-19 08:52:10.000000000 +0800
+++ linux/lib/percpu_counter.c	2011-04-19 08:53:33.000000000 +0800
@@ -59,13 +59,11 @@ void percpu_counter_set(struct percpu_co
 {
 	int cpu;
 
-	spin_lock(&fbc->lock);
 	for_each_possible_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		*pcount = 0;
 	}
-	fbc->count = amount;
-	spin_unlock(&fbc->lock);
+	atomic64_set(&fbc->count, amount);
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
@@ -85,11 +83,8 @@ void __percpu_counter_add(struct percpu_
 			overflow = 0;
 	} while (this_cpu_cmpxchg(*fbc->counters, count, new) != count);
 
-	if (unlikely(overflow)) {
-		spin_lock(&fbc->lock);
-		fbc->count += overflow;
-		spin_unlock(&fbc->lock);
-	}
+	if (unlikely(overflow))
+		atomic64_add(overflow, &fbc->count);
 }
 EXPORT_SYMBOL(__percpu_counter_add);
 
@@ -99,26 +94,27 @@ EXPORT_SYMBOL(__percpu_counter_add);
  */
 s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
-	s64 ret;
+	s64 ret = 0;
 	int cpu;
 
-	spin_lock(&fbc->lock);
-	ret = fbc->count;
+	/*
+	 * Don't really need to disable preempt here, just make sure there is
+	 * no big latency because of preemption
+	 */
+	preempt_disable();
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
-	spin_unlock(&fbc->lock);
+	ret += atomic64_read(&fbc->count);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
-			  struct lock_class_key *key)
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 {
-	spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
+	atomic64_set(&fbc->count, amount);
 	fbc->counters = alloc_percpu(s32);
 	if (!fbc->counters)
 		return -ENOMEM;
@@ -133,7 +129,7 @@ int __percpu_counter_init(struct percpu_
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(percpu_counter_init);
 
 void percpu_counter_destroy(struct percpu_counter *fbc)
 {
@@ -177,13 +173,10 @@ static int __cpuinit percpu_counter_hotc
 	mutex_lock(&percpu_counters_lock);
 	list_for_each_entry(fbc, &percpu_counters, list) {
 		s32 *pcount;
-		unsigned long flags;
 
-		spin_lock_irqsave(&fbc->lock, flags);
 		pcount = per_cpu_ptr(fbc->counters, cpu);
-		fbc->count += *pcount;
+		atomic64_add(*pcount, &fbc->count);
 		*pcount = 0;
-		spin_unlock_irqrestore(&fbc->lock, flags);
 	}
 	mutex_unlock(&percpu_counters_lock);
 #endif



^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread

Thread overview: 14+ messages
2011-04-14  2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
2011-04-14  2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
2011-04-14  2:32   ` Eric Dumazet
2011-04-15  3:41   ` Tejun Heo
2011-04-14  2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
2011-04-14  2:35   ` Eric Dumazet
2011-04-14  2:44     ` Shaohua Li
2011-04-14  2:54       ` Eric Dumazet
2011-04-18  7:41         ` Shaohua Li
2011-04-14  2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
2011-04-15  4:15   ` Tejun Heo
2011-04-18  7:43     ` Shaohua Li
2011-04-18 14:25       ` Christoph Lameter
2011-04-20  1:04         ` Shaohua Li
