* [patch v3 0/3] percpu_counter: cleanup and fix
@ 2011-04-14 2:04 shaohua.li
2011-04-14 2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
` (2 more replies)
0 siblings, 3 replies; 14+ messages in thread
From: shaohua.li @ 2011-04-14 2:04 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet
Cleanup percpu_counter code and fix some bugs. The main purpose is to convert
percpu_counter to use atomic64, which is useful for workloads which cause
percpu_counter->lock contented. In a workload I tested, the atomic method is
7x faster (please see patch 3 for detail).
Note, patch 3 is against Christoph's 'percpu: preemptless __per_cpu_counter_add'
^ permalink raw reply [flat|nested] 14+ messages in thread
* [patch v3 1/3] percpu_counter: change return value and add comments
2011-04-14 2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
@ 2011-04-14 2:04 ` shaohua.li
2011-04-14 2:32 ` Eric Dumazet
2011-04-15 3:41 ` Tejun Heo
2011-04-14 2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
2011-04-14 2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
2 siblings, 2 replies; 14+ messages in thread
From: shaohua.li @ 2011-04-14 2:04 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet, Shaohua Li
[-- Attachment #1: percpu-counter-positive.patch --]
[-- Type: text/plain, Size: 1164 bytes --]
the percpu_counter_*_positive() API in UP case doesn't check if return value is
positive. Add comments to explain why we don't.
Also if count < 0, returns 0 instead of 1 for *read_positive().
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
include/linux/percpu_counter.h | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-04-13 16:09:22.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-04-14 08:50:45.000000000 +0800
@@ -75,7 +75,7 @@ static inline s64 percpu_counter_read_po
barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
return ret;
- return 1;
+ return 0;
}
static inline int percpu_counter_initialized(struct percpu_counter *fbc)
@@ -133,6 +133,10 @@ static inline s64 percpu_counter_read(st
return fbc->count;
}
+/*
+ * percpu_counter is intended to track positive number. In UP case, the number
+ * should never be negative.
+ */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
return fbc->count;
^ permalink raw reply [flat|nested] 14+ messages in thread
* [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
2011-04-14 2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
2011-04-14 2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
@ 2011-04-14 2:04 ` shaohua.li
2011-04-14 2:35 ` Eric Dumazet
2011-04-14 2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
2 siblings, 1 reply; 14+ messages in thread
From: shaohua.li @ 2011-04-14 2:04 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet, Shaohua Li
[-- Attachment #1: percpu-counter-32bits.patch --]
[-- Type: text/plain, Size: 2304 bytes --]
percpu_counter.counter is a 's64'. Accessing it in 32-bit system is racing.
we need some locking to protect it otherwise some very wrong value could be
accessed.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
include/linux/percpu_counter.h | 37 ++++++++++++++++++++++++++-----------
1 file changed, 26 insertions(+), 11 deletions(-)
Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-04-14 08:50:45.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-04-14 09:09:14.000000000 +0800
@@ -89,9 +89,20 @@ struct percpu_counter {
s64 count;
};
-static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
+#if BITS_PER_LONG == 32
+ preempt_disable();
fbc->count = amount;
+ preempt_enable();
+#else
+ fbc->count = amount;
+#endif
+}
+
+static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_set(fbc, amount);
return 0;
}
@@ -99,16 +110,25 @@ static inline void percpu_counter_destro
{
}
-static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
- fbc->count = amount;
+#if BITS_PER_LONG == 32
+ s64 count;
+ preempt_disable();
+ count = fbc->count;
+ preempt_enable();
+ return count;
+#else
+ return fbc->count;
+#endif
}
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
- if (fbc->count > rhs)
+ s64 count = percpu_counter_read(fbc);
+ if (count > rhs)
return 1;
- else if (fbc->count < rhs)
+ else if (count < rhs)
return -1;
else
return 0;
@@ -128,18 +148,13 @@ __percpu_counter_add(struct percpu_count
percpu_counter_add(fbc, amount);
}
-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
-{
- return fbc->count;
-}
-
/*
* percpu_counter is intended to track positive number. In UP case, the number
* should never be negative.
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- return fbc->count;
+ return percpu_counter_read(fbc);
}
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
^ permalink raw reply [flat|nested] 14+ messages in thread
* [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
2011-04-14 2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
2011-04-14 2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
2011-04-14 2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
@ 2011-04-14 2:04 ` shaohua.li
2011-04-15 4:15 ` Tejun Heo
2 siblings, 1 reply; 14+ messages in thread
From: shaohua.li @ 2011-04-14 2:04 UTC (permalink / raw)
To: linux-kernel; +Cc: akpm, tj, cl, eric.dumazet, Shaohua Li
[-- Attachment #1: percpu-counter-atomic.patch --]
[-- Type: text/plain, Size: 5804 bytes --]
Uses atomic64 for percpu_counter, because it is cheaper than spinlock. This
doesn't slow fast path (percpu_counter_read). atomic64_read equals to fbc->count
for 64-bit system, or equals to spin_lock-read-spin_unlock for 32-bit system
Note, originally the percpu_counter_read for 32-bit system doesn't hold
spin_lock, but that is buggy and might cause very wrong value accessed.
This patch fixes the issue.
This can also improve some workloads with percpu_counter->lock heavily
contented. For example, vm_committed_as sometimes causes the contention.
We should tune the batch count, but if we can make percpu_counter better,
why not? In a 24 CPUs system and 24 processes, each runs:
while (1) {
mmap(128M);
munmap(128M);
}
we then measure how many loops each process can take:
orig: 1226976
patched: 8210626
The atomic method gives 7x faster.
In percpu_counter_set() and __percpu_counter_sum(), there will be no lock
protecting. This means we might get inprecise count, but we have the same issue
even with lock protecting, because __percpu_counter_add doesn't hold locking
to update cpu local count.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
include/linux/percpu_counter.h | 18 +++--------------
lib/percpu_counter.c | 42 +++++++++++++++++++++--------------------
2 files changed, 26 insertions(+), 34 deletions(-)
Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-04-14 09:50:36.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-04-14 09:53:56.000000000 +0800
@@ -16,8 +16,7 @@
#ifdef CONFIG_SMP
struct percpu_counter {
- spinlock_t lock;
- s64 count;
+ atomic64_t count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
#endif
@@ -26,16 +25,7 @@ struct percpu_counter {
extern int percpu_counter_batch;
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
- struct lock_class_key *key);
-
-#define percpu_counter_init(fbc, value) \
- ({ \
- static struct lock_class_key __key; \
- \
- __percpu_counter_init(fbc, value, &__key); \
- })
-
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount);
void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
@@ -60,7 +50,7 @@ static inline s64 percpu_counter_sum(str
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
- return fbc->count;
+ return atomic64_read(&fbc->count);
}
/*
@@ -70,7 +60,7 @@ static inline s64 percpu_counter_read(st
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ s64 ret = percpu_counter_read(fbc);
barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c 2011-04-14 09:53:04.000000000 +0800
+++ linux/lib/percpu_counter.c 2011-04-14 10:01:29.000000000 +0800
@@ -59,13 +59,17 @@ void percpu_counter_set(struct percpu_co
{
int cpu;
- spin_lock(&fbc->lock);
+ /*
+ * Don't really need to disable preempt here, just make sure there is
+ * no big latency because of preemption
+ */
+ preempt_disable();
for_each_possible_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
*pcount = 0;
}
- fbc->count = amount;
- spin_unlock(&fbc->lock);
+ atomic64_set(&fbc->count, amount);
+ preempt_enable();
}
EXPORT_SYMBOL(percpu_counter_set);
@@ -79,11 +83,11 @@ void __percpu_counter_add(struct percpu_
new = count + amount;
/* In case of overflow fold it into the global counter instead */
if (new >= batch || new <= -batch) {
- spin_lock(&fbc->lock);
+ preempt_disable();
count = __this_cpu_read(*fbc->counters);
- fbc->count += count + amount;
+ atomic64_add(count + amount, &fbc->count);
__this_cpu_write(*fbc->counters, 0);
- spin_unlock(&fbc->lock);
+ preempt_enable();
return;
}
@@ -97,26 +101,27 @@ EXPORT_SYMBOL(__percpu_counter_add);
*/
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
- s64 ret;
+ s64 ret = 0;
int cpu;
- spin_lock(&fbc->lock);
- ret = fbc->count;
+ /*
+ * Don't really need to disable preempt here, just make sure there is
+ * no big latency because of preemption
+ */
+ preempt_disable();
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
}
- spin_unlock(&fbc->lock);
+ ret += atomic64_read(&fbc->count);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
- struct lock_class_key *key)
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
- spin_lock_init(&fbc->lock);
- lockdep_set_class(&fbc->lock, key);
- fbc->count = amount;
+ atomic64_set(&fbc->count, amount);
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
return -ENOMEM;
@@ -131,7 +136,7 @@ int __percpu_counter_init(struct percpu_
#endif
return 0;
}
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(percpu_counter_init);
void percpu_counter_destroy(struct percpu_counter *fbc)
{
@@ -175,13 +180,10 @@ static int __cpuinit percpu_counter_hotc
mutex_lock(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
s32 *pcount;
- unsigned long flags;
- spin_lock_irqsave(&fbc->lock, flags);
pcount = per_cpu_ptr(fbc->counters, cpu);
- fbc->count += *pcount;
+ atomic64_add(*pcount, &fbc->count);
*pcount = 0;
- spin_unlock_irqrestore(&fbc->lock, flags);
}
mutex_unlock(&percpu_counters_lock);
#endif
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 1/3] percpu_counter: change return value and add comments
2011-04-14 2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
@ 2011-04-14 2:32 ` Eric Dumazet
2011-04-15 3:41 ` Tejun Heo
1 sibling, 0 replies; 14+ messages in thread
From: Eric Dumazet @ 2011-04-14 2:32 UTC (permalink / raw)
To: shaohua.li; +Cc: linux-kernel, akpm, tj, cl
Le jeudi 14 avril 2011 à 10:04 +0800, shaohua.li@intel.com a écrit :
> pièce jointe document texte brut (percpu-counter-positive.patch)
> the percpu_counter_*_positive() API in UP case doesn't check if return value is
> positive. Add comments to explain why we don't.
> Also if count < 0, returns 0 instead of 1 for *read_positive().
>
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
>
> ---
> include/linux/percpu_counter.h | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> Index: linux/include/linux/percpu_counter.h
> ===================================================================
> --- linux.orig/include/linux/percpu_counter.h 2011-04-13 16:09:22.000000000 +0800
> +++ linux/include/linux/percpu_counter.h 2011-04-14 08:50:45.000000000 +0800
> @@ -75,7 +75,7 @@ static inline s64 percpu_counter_read_po
> barrier(); /* Prevent reloads of fbc->count */
> if (ret >= 0)
> return ret;
> - return 1;
> + return 0;
> }
>
> static inline int percpu_counter_initialized(struct percpu_counter *fbc)
> @@ -133,6 +133,10 @@ static inline s64 percpu_counter_read(st
> return fbc->count;
> }
>
> +/*
> + * percpu_counter is intended to track positive number. In UP case, the number
> + * should never be negative.
> + */
> static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
> {
> return fbc->count;
>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
2011-04-14 2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
@ 2011-04-14 2:35 ` Eric Dumazet
2011-04-14 2:44 ` Shaohua Li
0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2011-04-14 2:35 UTC (permalink / raw)
To: shaohua.li; +Cc: linux-kernel, akpm, tj, cl
Le jeudi 14 avril 2011 à 10:04 +0800, shaohua.li@intel.com a écrit :
> pièce jointe document texte brut (percpu-counter-32bits.patch)
> percpu_counter.counter is a 's64'. Accessing it in 32-bit system is racing.
> we need some locking to protect it otherwise some very wrong value could be
> accessed.
>
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> ---
> include/linux/percpu_counter.h | 37 ++++++++++++++++++++++++++-----------
> 1 file changed, 26 insertions(+), 11 deletions(-)
>
> Index: linux/include/linux/percpu_counter.h
> ===================================================================
> --- linux.orig/include/linux/percpu_counter.h 2011-04-14 08:50:45.000000000 +0800
> +++ linux/include/linux/percpu_counter.h 2011-04-14 09:09:14.000000000 +0800
> @@ -89,9 +89,20 @@ struct percpu_counter {
> s64 count;
> };
>
> -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> +static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
> {
> +#if BITS_PER_LONG == 32
> + preempt_disable();
> fbc->count = amount;
> + preempt_enable();
> +#else
> + fbc->count = amount;
> +#endif
> +}
> +
> +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> +{
> + percpu_counter_set(fbc, amount);
> return 0;
> }
>
Please dont do this and obfuscate the code.
percpu_counter_init() cannot be racy.
Its like saying spin_lock_init() could be racy.
No other thread/cpu is supposed to use the object if not yet
initialized.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
2011-04-14 2:35 ` Eric Dumazet
@ 2011-04-14 2:44 ` Shaohua Li
2011-04-14 2:54 ` Eric Dumazet
0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2011-04-14 2:44 UTC (permalink / raw)
To: Eric Dumazet; +Cc: linux-kernel, akpm, tj, cl
On Thu, 2011-04-14 at 10:35 +0800, Eric Dumazet wrote:
> Le jeudi 14 avril 2011 à 10:04 +0800, shaohua.li@intel.com a écrit :
> > pièce jointe document texte brut (percpu-counter-32bits.patch)
> > percpu_counter.counter is a 's64'. Accessing it in 32-bit system is racing.
> > we need some locking to protect it otherwise some very wrong value could be
> > accessed.
> >
> > Signed-off-by: Shaohua Li <shaohua.li@intel.com>
> > ---
> > include/linux/percpu_counter.h | 37 ++++++++++++++++++++++++++-----------
> > 1 file changed, 26 insertions(+), 11 deletions(-)
> >
> > Index: linux/include/linux/percpu_counter.h
> > ===================================================================
> > --- linux.orig/include/linux/percpu_counter.h 2011-04-14 08:50:45.000000000 +0800
> > +++ linux/include/linux/percpu_counter.h 2011-04-14 09:09:14.000000000 +0800
> > @@ -89,9 +89,20 @@ struct percpu_counter {
> > s64 count;
> > };
> >
> > -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> > +static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
> > {
> > +#if BITS_PER_LONG == 32
> > + preempt_disable();
> > fbc->count = amount;
> > + preempt_enable();
> > +#else
> > + fbc->count = amount;
> > +#endif
> > +}
> > +
> > +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
> > +{
> > + percpu_counter_set(fbc, amount);
> > return 0;
> > }
> >
>
> Please dont do this and obfuscate the code.
>
> percpu_counter_init() cannot be racy.
>
> Its like saying spin_lock_init() could be racy.
>
> No other thread/cpu is supposed to use the object if not yet
> initialized.
that's true. I just want the code looks consistent and this doesn't
really obfuscate the code to me.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
2011-04-14 2:44 ` Shaohua Li
@ 2011-04-14 2:54 ` Eric Dumazet
2011-04-18 7:41 ` Shaohua Li
0 siblings, 1 reply; 14+ messages in thread
From: Eric Dumazet @ 2011-04-14 2:54 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, akpm, tj, cl
Le jeudi 14 avril 2011 à 10:44 +0800, Shaohua Li a écrit :
> that's true. I just want the code looks consistent and this doesn't
> really obfuscate the code to me.
Doing so gives a false sense of security to user.
percpu_counter_init() is unsafe by nature.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 1/3] percpu_counter: change return value and add comments
2011-04-14 2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
2011-04-14 2:32 ` Eric Dumazet
@ 2011-04-15 3:41 ` Tejun Heo
1 sibling, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2011-04-15 3:41 UTC (permalink / raw)
To: shaohua.li; +Cc: linux-kernel, akpm, cl, eric.dumazet
On Thu, Apr 14, 2011 at 10:04:48AM +0800, shaohua.li@intel.com wrote:
> the percpu_counter_*_positive() API in UP case doesn't check if return value is
> positive. Add comments to explain why we don't.
> Also if count < 0, returns 0 instead of 1 for *read_positive().
>
> Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Applied to percpu#for-2.6.40 w/ Eric's Acked-by added.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
2011-04-14 2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
@ 2011-04-15 4:15 ` Tejun Heo
2011-04-18 7:43 ` Shaohua Li
0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2011-04-15 4:15 UTC (permalink / raw)
To: shaohua.li; +Cc: linux-kernel, akpm, cl, eric.dumazet
Hello,
On Thu, Apr 14, 2011 at 10:04:50AM +0800, shaohua.li@intel.com wrote:
> Index: linux/lib/percpu_counter.c
> ===================================================================
> --- linux.orig/lib/percpu_counter.c 2011-04-14 09:53:04.000000000 +0800
> +++ linux/lib/percpu_counter.c 2011-04-14 10:01:29.000000000 +0800
> @@ -59,13 +59,17 @@ void percpu_counter_set(struct percpu_co
> {
> int cpu;
>
> - spin_lock(&fbc->lock);
> + /*
> + * Don't really need to disable preempt here, just make sure there is
> + * no big latency because of preemption
> + */
> + preempt_disable();
> for_each_possible_cpu(cpu) {
> s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> *pcount = 0;
> }
> - fbc->count = amount;
> - spin_unlock(&fbc->lock);
> + atomic64_set(&fbc->count, amount);
> + preempt_enable();
Disabling preemption here doesn't make any sense.
percpu_counter_set() inherently requires its users to guarantee that
no other user is modifying the percpu counter.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP
2011-04-14 2:54 ` Eric Dumazet
@ 2011-04-18 7:41 ` Shaohua Li
0 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2011-04-18 7:41 UTC (permalink / raw)
To: Eric Dumazet; +Cc: linux-kernel, akpm, tj, cl
On Thu, 2011-04-14 at 10:54 +0800, Eric Dumazet wrote:
> Le jeudi 14 avril 2011 à 10:44 +0800, Shaohua Li a écrit :
> > that's true. I just want the code looks consistent and this doesn't
> > really obfuscate the code to me.
>
> Doing so gives a false sense of security to user.
>
> percpu_counter_init() is unsafe by nature.
I really don't think this matters, but since you insist, here is the
updated the patch.
Subject: percpu_counter: fix code for 32bit systems for UP
percpu_counter.counter is a 's64'. Accessing it in 32-bit system is racing.
we need some locking to protect it otherwise some very wrong value could be
accessed.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
include/linux/percpu_counter.h | 31 +++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)
Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-04-18 15:36:31.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-04-18 15:37:35.000000000 +0800
@@ -101,14 +101,34 @@ static inline void percpu_counter_destro
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
+#if BITS_PER_LONG == 32
+ preempt_disable();
+ fbc->count = amount;
+ preempt_enable();
+#else
fbc->count = amount;
+#endif
+}
+
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+{
+#if BITS_PER_LONG == 32
+ s64 count;
+ preempt_disable();
+ count = fbc->count;
+ preempt_enable();
+ return count;
+#else
+ return fbc->count;
+#endif
}
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
- if (fbc->count > rhs)
+ s64 count = percpu_counter_read(fbc);
+ if (count > rhs)
return 1;
- else if (fbc->count < rhs)
+ else if (count < rhs)
return -1;
else
return 0;
@@ -128,18 +148,13 @@ __percpu_counter_add(struct percpu_count
percpu_counter_add(fbc, amount);
}
-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
-{
- return fbc->count;
-}
-
/*
* percpu_counter is intended to track positive number. In UP case, the number
* should never be negative.
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- return fbc->count;
+ return percpu_counter_read(fbc);
}
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
2011-04-15 4:15 ` Tejun Heo
@ 2011-04-18 7:43 ` Shaohua Li
2011-04-18 14:25 ` Christoph Lameter
0 siblings, 1 reply; 14+ messages in thread
From: Shaohua Li @ 2011-04-18 7:43 UTC (permalink / raw)
To: Tejun Heo; +Cc: linux-kernel, akpm, cl, eric.dumazet
On Fri, 2011-04-15 at 12:15 +0800, Tejun Heo wrote:
> Hello,
>
> On Thu, Apr 14, 2011 at 10:04:50AM +0800, shaohua.li@intel.com wrote:
> > Index: linux/lib/percpu_counter.c
> > ===================================================================
> > --- linux.orig/lib/percpu_counter.c 2011-04-14 09:53:04.000000000 +0800
> > +++ linux/lib/percpu_counter.c 2011-04-14 10:01:29.000000000 +0800
> > @@ -59,13 +59,17 @@ void percpu_counter_set(struct percpu_co
> > {
> > int cpu;
> >
> > - spin_lock(&fbc->lock);
> > + /*
> > + * Don't really need to disable preempt here, just make sure there is
> > + * no big latency because of preemption
> > + */
> > + preempt_disable();
> > for_each_possible_cpu(cpu) {
> > s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
> > *pcount = 0;
> > }
> > - fbc->count = amount;
> > - spin_unlock(&fbc->lock);
> > + atomic64_set(&fbc->count, amount);
> > + preempt_enable();
>
> Disabling preemption here doesn't make any sense.
> percpu_counter_set() inherently requires its users to guarantee that
> no other user is modifying the percpu counter.
ha, ok.
should I still rebase the patch against Christoph's patch? Looks that
one is still not settled down.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
2011-04-18 7:43 ` Shaohua Li
@ 2011-04-18 14:25 ` Christoph Lameter
2011-04-20 1:04 ` Shaohua Li
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2011-04-18 14:25 UTC (permalink / raw)
To: Shaohua Li; +Cc: Tejun Heo, linux-kernel, akpm, eric.dumazet
On Mon, 18 Apr 2011, Shaohua Li wrote:
> > Disabling preemption here doesn't make any sense.
> > percpu_counter_set() inherently requires its users to guarantee that
> > no other user is modifying the percpu counter.
> ha, ok.
> should I still rebase the patch against Christoph's patch? Looks that
> one is still not settled down.
I am a kind of confused about some of the arguments made there right now
and having your patch in that does the conversion to atomic would
simplify my patch (removes the spin_lock/unlock sequence in overflow
handling).
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP
2011-04-18 14:25 ` Christoph Lameter
@ 2011-04-20 1:04 ` Shaohua Li
0 siblings, 0 replies; 14+ messages in thread
From: Shaohua Li @ 2011-04-20 1:04 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Tejun Heo, linux-kernel, akpm, eric.dumazet
On Mon, 2011-04-18 at 22:25 +0800, Christoph Lameter wrote:
> On Mon, 18 Apr 2011, Shaohua Li wrote:
>
> > > Disabling preemption here doesn't make any sense.
> > > percpu_counter_set() inherently requires its users to guarantee that
> > > no other user is modifying the percpu counter.
> > ha, ok.
> > should I still rebase the patch against Christoph's patch? Looks that
> > one is still not settled down.
>
> I am a kind of confused about some of the arguments made there right now
> and having your patch in that does the conversion to atomic would
> simplify my patch (removes the spin_lock/unlock sequence in overflow
> handling).
below is the updated patch against Christoph's latest patch.
Subject: percpu_counter: use atomic64 for counter in SMP
Uses atomic64 for percpu_counter, because it is cheaper than spinlock. This
doesn't slow fast path (percpu_counter_read). atomic64_read equals to fbc->count
for 64-bit system, or equals to spin_lock-read-spin_unlock for 32-bit system
Note, originally the percpu_counter_read for 32-bit system doesn't hold
spin_lock, but that is buggy and might cause very wrong value accessed.
This patch fixes the issue.
This can also improve some workloads with percpu_counter->lock heavily
contented. For example, vm_committed_as sometimes causes the contention.
We should tune the batch count, but if we can make percpu_counter better,
why not? In a 24 CPUs system and 24 processes, each runs:
while (1) {
mmap(128M);
munmap(128M);
}
we then measure how many loops each process can take:
orig: 1226976
patched: 8210626
The atomic method gives 7x faster.
In percpu_counter_set() and __percpu_counter_sum(), there will be no lock
protecting. This means we might get inprecise count, but we have the same issue
even with lock protecting, because __percpu_counter_add doesn't hold locking
to update cpu local count.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
---
include/linux/percpu_counter.h | 18 ++++--------------
lib/percpu_counter.c | 37 +++++++++++++++----------------------
2 files changed, 19 insertions(+), 36 deletions(-)
Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h 2011-04-19 08:52:14.000000000 +0800
+++ linux/include/linux/percpu_counter.h 2011-04-19 08:52:58.000000000 +0800
@@ -16,8 +16,7 @@
#ifdef CONFIG_SMP
struct percpu_counter {
- spinlock_t lock;
- s64 count;
+ atomic64_t count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
#endif
@@ -26,16 +25,7 @@ struct percpu_counter {
extern int percpu_counter_batch;
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
- struct lock_class_key *key);
-
-#define percpu_counter_init(fbc, value) \
- ({ \
- static struct lock_class_key __key; \
- \
- __percpu_counter_init(fbc, value, &__key); \
- })
-
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount);
void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
@@ -60,7 +50,7 @@ static inline s64 percpu_counter_sum(str
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
- return fbc->count;
+ return atomic64_read(&fbc->count);
}
/*
@@ -70,7 +60,7 @@ static inline s64 percpu_counter_read(st
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ s64 ret = percpu_counter_read(fbc);
barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
Index: linux/lib/percpu_counter.c
===================================================================
--- linux.orig/lib/percpu_counter.c 2011-04-19 08:52:10.000000000 +0800
+++ linux/lib/percpu_counter.c 2011-04-19 08:53:33.000000000 +0800
@@ -59,13 +59,11 @@ void percpu_counter_set(struct percpu_co
{
int cpu;
- spin_lock(&fbc->lock);
for_each_possible_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
*pcount = 0;
}
- fbc->count = amount;
- spin_unlock(&fbc->lock);
+ atomic64_set(&fbc->count, amount);
}
EXPORT_SYMBOL(percpu_counter_set);
@@ -85,11 +83,8 @@ void __percpu_counter_add(struct percpu_
overflow = 0;
} while (this_cpu_cmpxchg(*fbc->counters, count, new) != count);
- if (unlikely(overflow)) {
- spin_lock(&fbc->lock);
- fbc->count += overflow;
- spin_unlock(&fbc->lock);
- }
+ if (unlikely(overflow))
+ atomic64_add(overflow, &fbc->count);
}
EXPORT_SYMBOL(__percpu_counter_add);
@@ -99,26 +94,27 @@ EXPORT_SYMBOL(__percpu_counter_add);
*/
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
- s64 ret;
+ s64 ret = 0;
int cpu;
- spin_lock(&fbc->lock);
- ret = fbc->count;
+ /*
+ * Don't really need to disable preempt here, just make sure there is
+ * no big latency because of preemption
+ */
+ preempt_disable();
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
}
- spin_unlock(&fbc->lock);
+ ret += atomic64_read(&fbc->count);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
- struct lock_class_key *key)
+int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
- spin_lock_init(&fbc->lock);
- lockdep_set_class(&fbc->lock, key);
- fbc->count = amount;
+ atomic64_set(&fbc->count, amount);
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
return -ENOMEM;
@@ -133,7 +129,7 @@ int __percpu_counter_init(struct percpu_
#endif
return 0;
}
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(percpu_counter_init);
void percpu_counter_destroy(struct percpu_counter *fbc)
{
@@ -177,13 +173,10 @@ static int __cpuinit percpu_counter_hotc
mutex_lock(&percpu_counters_lock);
list_for_each_entry(fbc, &percpu_counters, list) {
s32 *pcount;
- unsigned long flags;
- spin_lock_irqsave(&fbc->lock, flags);
pcount = per_cpu_ptr(fbc->counters, cpu);
- fbc->count += *pcount;
+ atomic64_add(*pcount, &fbc->count);
*pcount = 0;
- spin_unlock_irqrestore(&fbc->lock, flags);
}
mutex_unlock(&percpu_counters_lock);
#endif
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2011-04-20 1:04 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-04-14 2:04 [patch v3 0/3] percpu_counter: cleanup and fix shaohua.li
2011-04-14 2:04 ` [patch v3 1/3] percpu_counter: change return value and add comments shaohua.li
2011-04-14 2:32 ` Eric Dumazet
2011-04-15 3:41 ` Tejun Heo
2011-04-14 2:04 ` [patch v3 2/3] percpu_counter: fix code for 32bit systems for UP shaohua.li
2011-04-14 2:35 ` Eric Dumazet
2011-04-14 2:44 ` Shaohua Li
2011-04-14 2:54 ` Eric Dumazet
2011-04-18 7:41 ` Shaohua Li
2011-04-14 2:04 ` [patch v3 3/3] percpu_counter: use atomic64 for counter in SMP shaohua.li
2011-04-15 4:15 ` Tejun Heo
2011-04-18 7:43 ` Shaohua Li
2011-04-18 14:25 ` Christoph Lameter
2011-04-20 1:04 ` Shaohua Li
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).