linux-kernel.vger.kernel.org archive mirror
* [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
@ 2015-03-02 18:42 Jason Low
  2015-03-02 19:03 ` Linus Torvalds
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Jason Low @ 2015-03-02 18:42 UTC (permalink / raw)
  To: Peter Zijlstra, Ingo Molnar, Linus Torvalds
  Cc: Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Rik van Riel, Steven Rostedt, Scott Norton,
	Aswin Chandramouleeswaran, linux-kernel, Jason Low

v1->v2:
- Peter suggested that cputimer->running does not need to be atomic,
  so we can leave it as an integer.
- Address a race condition that could occur in update_gt_cputime().
- Add helper functions to avoid repeating code.

While running a database workload, we found a scalability issue
with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain a
thread_group_cputimer's spinlock to update the timers. On larger
systems (such as a 16-socket machine), more than 30% of total time
was spent trying to obtain this kernel lock to update these group
timer stats.

This patch converts the timers to 64-bit atomic variables and uses
atomic adds to update them without a lock. With this patch, the percentage
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Signed-off-by: Jason Low <jason.low2@hp.com>
---
 include/linux/init_task.h      |    7 +++--
 include/linux/sched.h          |   10 ++-----
 kernel/fork.c                  |    3 --
 kernel/sched/stats.h           |   12 ++------
 kernel/time/posix-cpu-timers.c |   55 +++++++++++++++++++++++----------------
 5 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3037fc0..c4cdec7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
 	.cputimer	= { 						\
-		.cputime = INIT_CPUTIME,				\
-		.running = 0,						\
-		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+		.utime = ATOMIC64_INIT(0),                              \
+		.stime = ATOMIC64_INIT(0),                              \
+		.sum_exec_runtime = ATOMIC64_INIT(0),                   \
+		.running = 0						\
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..d6b0f76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -588,9 +588,10 @@ struct task_cputime {
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
-	struct task_cputime cputime;
+	atomic64_t utime;
+	atomic64_t stime;
+	atomic64_t sum_exec_runtime;
 	int running;
-	raw_spinlock_t lock;
 };
 
 #include <linux/rwsem.h>
@@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	raw_spin_lock_init(&sig->cputimer.lock);
-}
-
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2dda..df9dfe9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
 	unsigned long cpu_limit;
 
-	/* Thread group counters. */
-	thread_group_cputime_init(sig);
-
 	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043..adda94e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->utime);
 }
 
 /**
@@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->stime);
 }
 
 /**
@@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->sum_exec_runtime);
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a16b678..ba93c23 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 	return error;
 }
 
+/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
+static inline void sample_group_cputimer(struct task_cputime *times,
+			                 struct thread_group_cputimer *cputimer)
+{
+        times->utime = atomic64_read(&cputimer->utime);
+        times->stime = atomic64_read(&cputimer->stime);
+        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
+}
 
 /*
  * Sample a per-thread clock for the given task.
@@ -196,23 +204,32 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
-
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+	u64 curr_cputime;
+	/*
+	 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+	 * to avoid race conditions with concurrent updates to cputime.
+	 */
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputimer->utime, sum->utime);
+	__update_gt_cputime(&cputimer->stime, sum->stime);
+	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
 	if (!cputimer->running) {
 		/*
@@ -222,13 +239,10 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 * it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(cputimer, &sum);
+		ACCESS_ONCE(cputimer->running) = 1;
+	}
+	sample_group_cputimer(times, cputimer);
 }
 
 /*
@@ -885,11 +899,8 @@ static void check_thread_timers(struct task_struct *tsk,
 static void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	ACCESS_ONCE(cputimer->running) = 0;
 }
 
 static u32 onecputick;
@@ -1114,9 +1125,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (sig->cputimer.running) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_group_cputimer(&group_sample, &sig->cputimer);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
-- 
1.7.2.5





* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 18:42 [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
@ 2015-03-02 19:03 ` Linus Torvalds
  2015-03-02 21:49   ` Jason Low
  2015-03-02 19:40 ` Oleg Nesterov
  2015-03-05 15:35 ` Frederic Weisbecker
  2 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2015-03-02 19:03 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Ingo Molnar, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List

On Mon, Mar 2, 2015 at 10:42 AM, Jason Low <jason.low2@hp.com> wrote:
>
> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.

NAK.

Not because I think this is wrong, but because somebody needs to look
at the effects on 32-bit architectures too.

In particular, check out lib/atomic64.c - which uses a hashed array of
16 spinlocks to do 64-bit atomics. That may or may not work ok in
practice, but it does mean that now sample_group_cputimer() and
update_gt_cputime() will take that (it ends up generally being the
same) spinlock three times for the three atomic64_read()'s.
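
Roughly, that fallback looks like this (a simplified sketch of the
lib/atomic64.c scheme, not the exact code):

#define NR_LOCKS 16

/* A small hashed array of spinlocks guards all atomic64_t variables
 * on GENERIC_ATOMIC64 targets (lock initialization omitted here). */
static raw_spinlock_t atomic64_locks[NR_LOCKS];

static inline raw_spinlock_t *lock_addr(const atomic64_t *v)
{
        /* hash the variable's address into the lock array */
        return &atomic64_locks[((unsigned long)v >> L1_CACHE_SHIFT) % NR_LOCKS];
}

long long atomic64_read(const atomic64_t *v)
{
        unsigned long flags;
        raw_spinlock_t *lock = lock_addr(v);
        long long val;

        raw_spin_lock_irqsave(lock, flags);
        val = v->counter;
        raw_spin_unlock_irqrestore(lock, flags);
        return val;
}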

Now, I think on x86, we end up using not lib/atomic64.c but our own
versions that use cmpxchg8b, which is probably fine from a performance
standpoint. But I see a lot of "select GENERIC_ATOMIC64" for other
architectures.

Anyway, it is *possible* that even on those 32-bit targets, the
atomic64's aren't any worse than the current spinlock in practice.  So
the "NAK" is in no way absolute - but I'd just like to hear that this
is all reasonably fine on 32-bit ARM and powerpc, for example.

Hmm?

                             Linus


* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 18:42 [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
  2015-03-02 19:03 ` Linus Torvalds
@ 2015-03-02 19:40 ` Oleg Nesterov
  2015-03-02 19:43   ` Oleg Nesterov
  2015-03-02 21:19   ` Jason Low
  2015-03-05 15:35 ` Frederic Weisbecker
  2 siblings, 2 replies; 19+ messages in thread
From: Oleg Nesterov @ 2015-03-02 19:40 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds, Paul E. McKenney,
	Andrew Morton, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

Well, I forgot everything about this code, but let me ask anyway ;)

On 03/02, Jason Low wrote:
>
> -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
>  {
> -	if (b->utime > a->utime)
> -		a->utime = b->utime;
> -
> -	if (b->stime > a->stime)
> -		a->stime = b->stime;
> +	u64 curr_cputime;
> +	/*
> +	 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
> +	 * to avoid race conditions with concurrent updates to cputime.
> +	 */
> +retry:
> +	curr_cputime = atomic64_read(cputime);
> +	if (sum_cputime > curr_cputime) {
> +		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
> +			goto retry;
> +	}
> +}
>  
> -	if (b->sum_exec_runtime > a->sum_exec_runtime)
> -		a->sum_exec_runtime = b->sum_exec_runtime;
> +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
> +{
> +	__update_gt_cputime(&cputimer->utime, sum->utime);
> +	__update_gt_cputime(&cputimer->stime, sum->stime);
> +	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
>  }

And this is called if !cputimer_running().

So who else can update these atomic64_t's ? The caller is called under ->siglock.
IOW, do we really need to cmpxchg/retry ?

Just curious, I am sure I missed something.

> @@ -222,13 +239,10 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
>  		 * it.
>  		 */
>  		thread_group_cputime(tsk, &sum);
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -		cputimer->running = 1;
> -		update_gt_cputime(&cputimer->cputime, &sum);
> -	} else
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -	*times = cputimer->cputime;
> -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +		update_gt_cputime(cputimer, &sum);
> +		ACCESS_ONCE(cputimer->running) = 1;

WRITE_ONCE() looks better... but it is not clear to me why do we need it
at all.

Oleg.



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 19:40 ` Oleg Nesterov
@ 2015-03-02 19:43   ` Oleg Nesterov
  2015-03-02 21:16     ` Jason Low
  2015-03-02 21:19   ` Jason Low
  1 sibling, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2015-03-02 19:43 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds, Paul E. McKenney,
	Andrew Morton, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

On 03/02, Oleg Nesterov wrote:
>
> Well, I forgot everything about this code, but let me ask anyway ;)
>
> On 03/02, Jason Low wrote:
> >
> > -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> > +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
> >  {
> > -	if (b->utime > a->utime)
> > -		a->utime = b->utime;
> > -
> > -	if (b->stime > a->stime)
> > -		a->stime = b->stime;
> > +	u64 curr_cputime;
> > +	/*
> > +	 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
> > +	 * to avoid race conditions with concurrent updates to cputime.
> > +	 */
> > +retry:
> > +	curr_cputime = atomic64_read(cputime);
> > +	if (sum_cputime > curr_cputime) {
> > +		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
> > +			goto retry;
> > +	}
> > +}
> >
> > -	if (b->sum_exec_runtime > a->sum_exec_runtime)
> > -		a->sum_exec_runtime = b->sum_exec_runtime;
> > +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
> > +{
> > +	__update_gt_cputime(&cputimer->utime, sum->utime);
> > +	__update_gt_cputime(&cputimer->stime, sum->stime);
> > +	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> >  }
>
> And this is called if !cputimer_running().
>
> So who else can update these atomic64_t's ? The caller is called under ->siglock.
> IOW, do we really need to cmpxchg/retry ?
>
> Just curious, I am sure I missed something.

Ah, sorry, I seem to understand.

We still can race with account_group_*_time() even if ->running == 0. Because
(say) account_group_exec_runtime() can race with a 1 -> 0 -> 1 transition.

Or is there another reason?

Oleg.



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 19:43   ` Oleg Nesterov
@ 2015-03-02 21:16     ` Jason Low
  2015-03-02 21:44       ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: Jason Low @ 2015-03-02 21:16 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds, Paul E. McKenney,
	Andrew Morton, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel, Jason Low

On Mon, 2015-03-02 at 20:43 +0100, Oleg Nesterov wrote:
> On 03/02, Oleg Nesterov wrote:
> >
> > Well, I forgot everything about this code, but let me ask anyway ;)
> >
> > On 03/02, Jason Low wrote:
> > >
> > > -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> > > +static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
> > >  {
> > > -	if (b->utime > a->utime)
> > > -		a->utime = b->utime;
> > > -
> > > -	if (b->stime > a->stime)
> > > -		a->stime = b->stime;
> > > +	u64 curr_cputime;
> > > +	/*
> > > +	 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
> > > +	 * to avoid race conditions with concurrent updates to cputime.
> > > +	 */
> > > +retry:
> > > +	curr_cputime = atomic64_read(cputime);
> > > +	if (sum_cputime > curr_cputime) {
> > > +		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
> > > +			goto retry;
> > > +	}
> > > +}
> > >
> > > -	if (b->sum_exec_runtime > a->sum_exec_runtime)
> > > -		a->sum_exec_runtime = b->sum_exec_runtime;
> > > +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
> > > +{
> > > +	__update_gt_cputime(&cputimer->utime, sum->utime);
> > > +	__update_gt_cputime(&cputimer->stime, sum->stime);
> > > +	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> > >  }
> >
> > And this is called if !cputimer_running().
> >
> > So who else can update these atomic64_t's ? The caller is called under ->siglock.
> > IOW, do we really need to cmpxchg/retry ?
> >
> > Just curious, I am sure I missed something.
> 
> Ah, sorry, I seem to understand.
> 
> We still can race with account_group_*time() even if ->running == 0. Because
> (say) account_group_exec_runtime() can race with 1 -> 0 -> 1 transition.
> 
> Or is there another reason?

Hi Oleg,

Yes, that 1 -> 0 -> 1 transition was the race that I had in mind. Thus,
I added the extra atomic logic in update_gt_cputime() just to be safe.

In original code, we set cputimer->running first so it is running while
we call update_gt_cputime(). Now in this patch, we swapped the 2 calls
such that we set running after calling update_gt_cputime(), so that
wouldn't be an issue anymore.
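
In other words, the cmpxchg loop is there for an interleaving roughly
like this (a simplified sketch, not code from the patch):

/*
 * CPU0: rearming the timer            CPU1: tick accounting
 * (sees cputimer->running == 0)       (read running == 1 just before
 *                                      the last 1 -> 0 transition)
 *
 * thread_group_cputime(tsk, &sum);
 * __update_gt_cputime(&utime, sum.utime):
 *   curr = atomic64_read(&utime);
 *                                      atomic64_add(delta, &utime);
 *   atomic64_cmpxchg(&utime, curr, sum.utime)
 *     -> fails because CPU1 changed the value in between,
 *        so we re-read and retry instead of overwriting
 *        and losing CPU1's update.
 */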



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 19:40 ` Oleg Nesterov
  2015-03-02 19:43   ` Oleg Nesterov
@ 2015-03-02 21:19   ` Jason Low
  1 sibling, 0 replies; 19+ messages in thread
From: Jason Low @ 2015-03-02 21:19 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds, Paul E. McKenney,
	Andrew Morton, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

On Mon, 2015-03-02 at 20:40 +0100, Oleg Nesterov wrote:
> Well, I forgot everything about this code, but let me ask anyway ;)
> 
> On 03/02, Jason Low wrote:

> > @@ -222,13 +239,10 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
> >  		 * it.
> >  		 */
> >  		thread_group_cputime(tsk, &sum);
> > -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> > -		cputimer->running = 1;
> > -		update_gt_cputime(&cputimer->cputime, &sum);
> > -	} else
> > -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> > -	*times = cputimer->cputime;
> > -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> > +		update_gt_cputime(cputimer, &sum);
> > +		ACCESS_ONCE(cputimer->running) = 1;
> 
> WRITE_ONCE() looks better... 

Okay, I can update that.

> but it is not clear to me why do we need it
> at all.

Peter suggested it here as we would now be updating the running field
without the lock:

https://lkml.org/lkml/2015/1/23/641



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 21:16     ` Jason Low
@ 2015-03-02 21:44       ` Linus Torvalds
  2015-03-02 22:43         ` Jason Low
  2015-03-05 15:20         ` Frederic Weisbecker
  0 siblings, 2 replies; 19+ messages in thread
From: Linus Torvalds @ 2015-03-02 21:44 UTC (permalink / raw)
  To: Jason Low
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Paul E. McKenney,
	Andrew Morton, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List

On Mon, Mar 2, 2015 at 1:16 PM, Jason Low <jason.low2@hp.com> wrote:
>
> In original code, we set cputimer->running first so it is running while
> we call update_gt_cputime(). Now in this patch, we swapped the 2 calls
> such that we set running after calling update_gt_cputime(), so that
> wouldn't be an issue anymore.

Hmm. If you actually care about ordering, and 'running' should be
written to after the other things, then it might be best if you use

   smp_store_release(&cputimer->running, 1);

which makes it clear that the store happens *after* what went before it.

Or at least have a "smp_wmb()" between the atomic64 updates and the
"WRITE_ONCE()".

I guess that since you use cmpxchg in update_gt_cputime, the accesses
end up being ordered anyway, but it might be better to make that thing
very explicit.

                   Linus


* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 19:03 ` Linus Torvalds
@ 2015-03-02 21:49   ` Jason Low
  2015-03-19 17:21     ` Jason Low
  0 siblings, 1 reply; 19+ messages in thread
From: Jason Low @ 2015-03-02 21:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Ingo Molnar, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List, jason.low2

On Mon, 2015-03-02 at 11:03 -0800, Linus Torvalds wrote:
> On Mon, Mar 2, 2015 at 10:42 AM, Jason Low <jason.low2@hp.com> wrote:
> >
> > This patch converts the timers to 64 bit atomic variables and use
> > atomic add to update them without a lock. With this patch, the percent
> > of total time spent updating thread group cputimer timers was reduced
> > from 30% down to less than 1%.
> 
> NAK.
> 
> Not because I think this is wrong, but because somebody needs to look
> at the effects on 32-bit architectures too.
> 
> In particular, check out lib/atomic64.c - which uses a hashed array of
> 16-bit spinlocks to do 64-bit atomics. That may or may well work ok in
> practice, but it does mean that now sample_group_cputimer() and
> update_gt_cputime() will take that (it ends up generally being the
> same) spinlock three times for the three atomic64_read()'s.

Okay, I will run some tests to see how this change affects the
performance of itimers on 32 bit systems.

While update_gt_cputime() shouldn't be an issue for performance
since it doesn't get called often, sample_group_cputimer() needing
to take a lock for each of its three atomic64_read()s is something
that could impact performance, so we should take a look at that.

Thanks,
Jason



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 21:44       ` Linus Torvalds
@ 2015-03-02 22:43         ` Jason Low
  2015-03-05 15:20         ` Frederic Weisbecker
  1 sibling, 0 replies; 19+ messages in thread
From: Jason Low @ 2015-03-02 22:43 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Oleg Nesterov, Peter Zijlstra, Ingo Molnar, Paul E. McKenney,
	Andrew Morton, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List, jason.low2

On Mon, 2015-03-02 at 13:44 -0800, Linus Torvalds wrote:
> On Mon, Mar 2, 2015 at 1:16 PM, Jason Low <jason.low2@hp.com> wrote:
> >
> > In original code, we set cputimer->running first so it is running while
> > we call update_gt_cputime(). Now in this patch, we swapped the 2 calls
> > such that we set running after calling update_gt_cputime(), so that
> > wouldn't be an issue anymore.
> 
> Hmm. If you actually care about ordering, and 'running' should be
> written to after the other things, then it might be best if you use
> 
>    smp_store_release(&cputimer->running, 1);
> 
> which makes it clear that the store happens *after* what went before it.
> 
> Or at least have a "smp_wmb()" between the atomic64 updates and the
> "WRITE_ONCE()".
> 
> I guess that since you use cmpxchg in update_gt_cputime, the accesses
> end up being ordered anyway, but it might be better to make that thing
> very explicit.

Yeah, I suppose the extra barrier (smp_mb() or smp_wmb()) might add more
overhead, but since this is not a common code path anyway, it would be
worth adding it to make things more clear.
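
Concretely, either of the forms mentioned above would look something
like this in thread_group_cputimer() (a sketch of the two options;
only one of them would actually be used):

        thread_group_cputime(tsk, &sum);
        update_gt_cputime(cputimer, &sum);

        /* Option 1: release store orders the updates above before 'running' */
        smp_store_release(&cputimer->running, 1);

        /* Option 2: explicit write barrier, then the plain store */
        smp_wmb();
        WRITE_ONCE(cputimer->running, 1);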



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 21:44       ` Linus Torvalds
  2015-03-02 22:43         ` Jason Low
@ 2015-03-05 15:20         ` Frederic Weisbecker
  2015-03-05 20:02           ` Jason Low
  1 sibling, 1 reply; 19+ messages in thread
From: Frederic Weisbecker @ 2015-03-05 15:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jason Low, Oleg Nesterov, Peter Zijlstra, Ingo Molnar,
	Paul E. McKenney, Andrew Morton, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List

On Mon, Mar 02, 2015 at 01:44:04PM -0800, Linus Torvalds wrote:
> On Mon, Mar 2, 2015 at 1:16 PM, Jason Low <jason.low2@hp.com> wrote:
> >
> > In original code, we set cputimer->running first so it is running while
> > we call update_gt_cputime(). Now in this patch, we swapped the 2 calls
> > such that we set running after calling update_gt_cputime(), so that
> > wouldn't be an issue anymore.
> 
> Hmm. If you actually care about ordering, and 'running' should be
> written to after the other things, then it might be best if you use
> 
>    smp_store_release(&cputimer->running, 1);
> 
> which makes it clear that the store happens *after* what went before it.
> 
> Or at least have a "smp_wmb()" between the atomic64 updates and the
> "WRITE_ONCE()".

FWIW, perhaps it can be reduced with an smp_mb__before_atomic() on the
account_group_*_time() side, paired with an smp_wmb() on the
thread_group_cputimer() side. Arming cputimer->running shouldn't be too
frequent, while cputime updates happen at least every tick...

Assuming smp_mb__before_atomic() is more lightweight than smp_load_acquire()
of course. 
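
Spelled out, that pairing would place the barriers roughly like this
(an illustrative sketch only, not proposed code):

        /* arming side: thread_group_cputimer() */
        thread_group_cputime(tsk, &sum);
        update_gt_cputime(cputimer, &sum);
        smp_wmb();                      /* order updates before 'running' */
        WRITE_ONCE(cputimer->running, 1);

        /* accounting side: e.g. account_group_user_time() */
        if (!cputimer_running(tsk))
                return;
        smp_mb__before_atomic();        /* pairs with the smp_wmb() above */
        atomic64_add(cputime, &cputimer->utime);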

> 
> I guess that since you use cmpxchg in update_gt_cputime, the accesses
> end up being ordered anyway, but it might be better to make that thing
> very explicit.
> 
>                    Linus


* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 18:42 [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
  2015-03-02 19:03 ` Linus Torvalds
  2015-03-02 19:40 ` Oleg Nesterov
@ 2015-03-05 15:35 ` Frederic Weisbecker
  2015-03-05 15:56   ` Paul E. McKenney
  2015-03-06  0:06   ` Jason Low
  2 siblings, 2 replies; 19+ messages in thread
From: Frederic Weisbecker @ 2015-03-05 15:35 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

On Mon, Mar 02, 2015 at 10:42:11AM -0800, Jason Low wrote:
> v1->v2:
> - Peter suggested that cputimer->running does not need to be atomic,
>   so we can leave it as an integer.
> - Address a race condition that could occur in update_gt_cputime().
> - Add helper functions to avoid repeating code.
> 
> While running a database workload, we found a scalability issue
> with itimers.
> 
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger
> systems (such as a 16 socket machine), this caused more than 30% of
> total time spent trying to obtain this kernel lock to update these
> group timer stats.
> 
> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%.
> 
> Signed-off-by: Jason Low <jason.low2@hp.com>
> ---
>  include/linux/init_task.h      |    7 +++--
>  include/linux/sched.h          |   10 ++-----
>  kernel/fork.c                  |    3 --
>  kernel/sched/stats.h           |   12 ++------
>  kernel/time/posix-cpu-timers.c |   55 +++++++++++++++++++++++----------------
>  5 files changed, 42 insertions(+), 45 deletions(-)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 3037fc0..c4cdec7 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
>  	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
>  	.rlim		= INIT_RLIMITS,					\
>  	.cputimer	= { 						\
> -		.cputime = INIT_CPUTIME,				\
> -		.running = 0,						\
> -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> +		.utime = ATOMIC64_INIT(0),                              \
> +		.stime = ATOMIC64_INIT(0),                              \
> +		.sum_exec_runtime = ATOMIC64_INIT(0),                   \
> +		.running = 0						\
>  	},								\
>  	.cred_guard_mutex =						\
>  		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8db31ef..d6b0f76 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -588,9 +588,10 @@ struct task_cputime {
>   * used for thread group CPU timer calculations.
>   */
>  struct thread_group_cputimer {
> -	struct task_cputime cputime;
> +	atomic64_t utime;
> +	atomic64_t stime;
> +	atomic64_t sum_exec_runtime;
>  	int running;
> -	raw_spinlock_t lock;
>  };
>  
>  #include <linux/rwsem.h>
> @@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
>  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>  
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> -	raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
>  /*
>   * Reevaluate whether the task has signals pending delivery.
>   * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 4dc2dda..df9dfe9 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
>  {
>  	unsigned long cpu_limit;
>  
> -	/* Thread group counters. */
> -	thread_group_cputime_init(sig);
> -
>  	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
>  	if (cpu_limit != RLIM_INFINITY) {
>  		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..adda94e 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.utime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->utime);
>  }
>  
>  /**
> @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.stime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->stime);
>  }
>  
>  /**
> @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.sum_exec_runtime += ns;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(ns, &cputimer->sum_exec_runtime);
>  }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index a16b678..ba93c23 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
>  	return error;
>  }
>  
> +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> +			                 struct thread_group_cputimer *cputimer)
> +{
> +        times->utime = atomic64_read(&cputimer->utime);
> +        times->stime = atomic64_read(&cputimer->stime);
> +        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);

So, in the case we are calling that right after setting cputimer->running, I guess we are fine
because we just updated cputimer with the freshest values.

But if we are reading this a while after, say several ticks further, there is a chance that
we read stale values since we don't lock anymore.

I don't know if it matters or not, I guess it depends how stale it can be and how much precision
we expect from posix cpu timers. It probably doesn't matter.

But just in case, atomic64_read_return(&cputimer->utime, 0) would make sure we get the freshest
value because it performs a full barrier, at the cost of more overhead of course.
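
As a sketch, the sampling helper with fully ordered reads would look
something like this (using atomic64_add_return(0, ...), which returns
the current value and implies a full barrier):

static inline void sample_group_cputimer(struct task_cputime *times,
                                         struct thread_group_cputimer *cputimer)
{
        /* add 0 purely for the returned value and the implied full barrier */
        times->utime = atomic64_add_return(0, &cputimer->utime);
        times->stime = atomic64_add_return(0, &cputimer->stime);
        times->sum_exec_runtime = atomic64_add_return(0, &cputimer->sum_exec_runtime);
}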


* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-05 15:35 ` Frederic Weisbecker
@ 2015-03-05 15:56   ` Paul E. McKenney
  2015-03-05 16:00     ` Frederic Weisbecker
  2015-03-06  0:06   ` Jason Low
  1 sibling, 1 reply; 19+ messages in thread
From: Paul E. McKenney @ 2015-03-05 15:56 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jason Low, Peter Zijlstra, Ingo Molnar, Linus Torvalds,
	Andrew Morton, Oleg Nesterov, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

On Thu, Mar 05, 2015 at 04:35:09PM +0100, Frederic Weisbecker wrote:
> On Mon, Mar 02, 2015 at 10:42:11AM -0800, Jason Low wrote:
> > v1->v2:
> > - Peter suggested that cputimer->running does not need to be atomic,
> >   so we can leave it as an integer.
> > - Address a race condition that could occur in update_gt_cputime().
> > - Add helper functions to avoid repeating code.
> > 
> > While running a database workload, we found a scalability issue
> > with itimers.
> > 
> > Much of the problem was caused by the thread_group_cputimer spinlock.
> > Each time we account for group system/user time, we need to obtain a
> > thread_group_cputimer's spinlock to update the timers. On larger
> > systems (such as a 16 socket machine), this caused more than 30% of
> > total time spent trying to obtain this kernel lock to update these
> > group timer stats.
> > 
> > This patch converts the timers to 64 bit atomic variables and use
> > atomic add to update them without a lock. With this patch, the percent
> > of total time spent updating thread group cputimer timers was reduced
> > from 30% down to less than 1%.
> > 
> > Signed-off-by: Jason Low <jason.low2@hp.com>
> > ---
> >  include/linux/init_task.h      |    7 +++--
> >  include/linux/sched.h          |   10 ++-----
> >  kernel/fork.c                  |    3 --
> >  kernel/sched/stats.h           |   12 ++------
> >  kernel/time/posix-cpu-timers.c |   55 +++++++++++++++++++++++----------------
> >  5 files changed, 42 insertions(+), 45 deletions(-)
> > 
> > diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> > index 3037fc0..c4cdec7 100644
> > --- a/include/linux/init_task.h
> > +++ b/include/linux/init_task.h
> > @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
> >  	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
> >  	.rlim		= INIT_RLIMITS,					\
> >  	.cputimer	= { 						\
> > -		.cputime = INIT_CPUTIME,				\
> > -		.running = 0,						\
> > -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> > +		.utime = ATOMIC64_INIT(0),                              \
> > +		.stime = ATOMIC64_INIT(0),                              \
> > +		.sum_exec_runtime = ATOMIC64_INIT(0),                   \
> > +		.running = 0						\
> >  	},								\
> >  	.cred_guard_mutex =						\
> >  		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 8db31ef..d6b0f76 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -588,9 +588,10 @@ struct task_cputime {
> >   * used for thread group CPU timer calculations.
> >   */
> >  struct thread_group_cputimer {
> > -	struct task_cputime cputime;
> > +	atomic64_t utime;
> > +	atomic64_t stime;
> > +	atomic64_t sum_exec_runtime;
> >  	int running;
> > -	raw_spinlock_t lock;
> >  };
> >  
> >  #include <linux/rwsem.h>
> > @@ -2942,11 +2943,6 @@ static __always_inline bool need_resched(void)
> >  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
> >  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
> >  
> > -static inline void thread_group_cputime_init(struct signal_struct *sig)
> > -{
> > -	raw_spin_lock_init(&sig->cputimer.lock);
> > -}
> > -
> >  /*
> >   * Reevaluate whether the task has signals pending delivery.
> >   * Wake the task if so.
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 4dc2dda..df9dfe9 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1037,9 +1037,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
> >  {
> >  	unsigned long cpu_limit;
> >  
> > -	/* Thread group counters. */
> > -	thread_group_cputime_init(sig);
> > -
> >  	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
> >  	if (cpu_limit != RLIM_INFINITY) {
> >  		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> > diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> > index 4ab7043..adda94e 100644
> > --- a/kernel/sched/stats.h
> > +++ b/kernel/sched/stats.h
> > @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
> >  	if (!cputimer_running(tsk))
> >  		return;
> >  
> > -	raw_spin_lock(&cputimer->lock);
> > -	cputimer->cputime.utime += cputime;
> > -	raw_spin_unlock(&cputimer->lock);
> > +	atomic64_add(cputime, &cputimer->utime);
> >  }
> >  
> >  /**
> > @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
> >  	if (!cputimer_running(tsk))
> >  		return;
> >  
> > -	raw_spin_lock(&cputimer->lock);
> > -	cputimer->cputime.stime += cputime;
> > -	raw_spin_unlock(&cputimer->lock);
> > +	atomic64_add(cputime, &cputimer->stime);
> >  }
> >  
> >  /**
> > @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
> >  	if (!cputimer_running(tsk))
> >  		return;
> >  
> > -	raw_spin_lock(&cputimer->lock);
> > -	cputimer->cputime.sum_exec_runtime += ns;
> > -	raw_spin_unlock(&cputimer->lock);
> > +	atomic64_add(ns, &cputimer->sum_exec_runtime);
> >  }
> > diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> > index a16b678..ba93c23 100644
> > --- a/kernel/time/posix-cpu-timers.c
> > +++ b/kernel/time/posix-cpu-timers.c
> > @@ -173,6 +173,14 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
> >  	return error;
> >  }
> >  
> > +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> > +static inline void sample_group_cputimer(struct task_cputime *times,
> > +			                 struct thread_group_cputimer *cputimer)
> > +{
> > +        times->utime = atomic64_read(&cputimer->utime);
> > +        times->stime = atomic64_read(&cputimer->stime);
> > +        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
> 
> So, in the case we are calling that right after setting cputimer->running, I guess we are fine
> because we just updated cputimer with the freshest values.
> 
> But if we are reading this a while after, say several ticks further, there is a chance that
> we read stale values since we don't lock anymore.
> 
> I don't know if it matters or not, I guess it depends how stale it can be and how much precision
> we expect from posix cpu timers. It probably doesn't matter.
> 
> But just in case, atomic64_read_return(&cputimer->utime, 0) would make sure we get the freshest
> value because it performs a full barrier, at the cost of more overhead of course.

Well, if we are running within a guest OS, we might be delayed at any point
for quite some time.  Even with interrupts disabled.

							Thanx, Paul



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-05 15:56   ` Paul E. McKenney
@ 2015-03-05 16:00     ` Frederic Weisbecker
  2015-03-05 16:16       ` Paul E. McKenney
  0 siblings, 1 reply; 19+ messages in thread
From: Frederic Weisbecker @ 2015-03-05 16:00 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Jason Low, Peter Zijlstra, Ingo Molnar, Linus Torvalds,
	Andrew Morton, Oleg Nesterov, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

On Thu, Mar 05, 2015 at 07:56:59AM -0800, Paul E. McKenney wrote:
> On Thu, Mar 05, 2015 at 04:35:09PM +0100, Frederic Weisbecker wrote:
> > So, in the case we are calling that right after setting cputimer->running, I guess we are fine
> > because we just updated cputimer with the freshest values.
> > 
> > But if we are reading this a while after, say several ticks further, there is a chance that
> > we read stale values since we don't lock anymore.
> > 
> > I don't know if it matters or not, I guess it depends how stale it can be and how much precision
> > we expect from posix cpu timers. It probably doesn't matter.
> > 
> > But just in case, atomic64_read_return(&cputimer->utime, 0) would make sure we get the freshest
> > value because it performs a full barrier, at the cost of more overhead of course.
> 
> Well, if we are running within a guest OS, we might be delayed at any point
> for quite some time.  Even with interrupts disabled.

You mean delayed because of the overhead of atomic_add_return() or the stale value
of cputimer-> fields?


* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-05 16:00     ` Frederic Weisbecker
@ 2015-03-05 16:16       ` Paul E. McKenney
  0 siblings, 0 replies; 19+ messages in thread
From: Paul E. McKenney @ 2015-03-05 16:16 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jason Low, Peter Zijlstra, Ingo Molnar, Linus Torvalds,
	Andrew Morton, Oleg Nesterov, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel

On Thu, Mar 05, 2015 at 05:00:05PM +0100, Frederic Weisbecker wrote:
> On Thu, Mar 05, 2015 at 07:56:59AM -0800, Paul E. McKenney wrote:
> > On Thu, Mar 05, 2015 at 04:35:09PM +0100, Frederic Weisbecker wrote:
> > > So, in the case we are calling that right after setting cputimer->running, I guess we are fine
> > > because we just updated cputimer with the freshest values.
> > > 
> > > But if we are reading this a while after, say several ticks further, there is a chance that
> > > we read stale values since we don't lock anymore.
> > > 
> > > I don't know if it matters or not, I guess it depends how stale it can be and how much precision
> > > we expect from posix cpu timers. It probably doesn't matter.
> > > 
> > > But just in case, atomic64_read_return(&cputimer->utime, 0) would make sure we get the freshest
> > > value because it performs a full barrier, at the cost of more overhead of course.
> > 
> > Well, if we are running within a guest OS, we might be delayed at any point
> > for quite some time.  Even with interrupts disabled.
> 
> You mean delayed because of the overhead of atomic_add_return() or the stale value
> of cptimer-> fields? 

Because of preemption of the guest OS's VCPUs by the host OS.

								Thanx, Paul



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-05 15:20         ` Frederic Weisbecker
@ 2015-03-05 20:02           ` Jason Low
  0 siblings, 0 replies; 19+ messages in thread
From: Jason Low @ 2015-03-05 20:02 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Linus Torvalds, Oleg Nesterov, Peter Zijlstra, Ingo Molnar,
	Paul E. McKenney, Andrew Morton, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List, jason.low2

On Thu, 2015-03-05 at 16:20 +0100, Frederic Weisbecker wrote:
> On Mon, Mar 02, 2015 at 01:44:04PM -0800, Linus Torvalds wrote:
> > On Mon, Mar 2, 2015 at 1:16 PM, Jason Low <jason.low2@hp.com> wrote:
> > >
> > > In original code, we set cputimer->running first so it is running while
> > > we call update_gt_cputime(). Now in this patch, we swapped the 2 calls
> > > such that we set running after calling update_gt_cputime(), so that
> > > wouldn't be an issue anymore.
> > 
> > Hmm. If you actually care about ordering, and 'running' should be
> > written to after the other things, then it might be best if you use
> > 
> >    smp_store_release(&cputimer->running, 1);
> > 
> > which makes it clear that the store happens *after* what went before it.
> > 
> > Or at least have a "smp_wmb()" between the atomic64 updates and the
> > "WRITE_ONCE()".
> 
> FWIW, perhaps it can be reduced with an smp_mb__before_atomic() on the
> account_group_*_time() side,

Hi Frederic,

I think Linus might be referring to the updates in update_gt_cputime()?
Otherwise, if the atomic updates in account_group_*_time() are already
enough for correctness, then we might not want to be adding barriers in
the hot paths if they aren't necessary.

I was thinking about adding smp_store_release(&cputimer->running, 1)
to document that we want to write to the running field after the
operations in update_gt_cputime(). The overhead here won't be much since
it doesn't get called frequently as you mentioned.

>  paired with smp_wmb() from the thread_group_cputimer()
> side. Arming cputime->running shouldn't be too frequent while update cputime
> happens at least every tick...
> 
> Assuming smp_mb__before_atomic() is more lightweight than smp_load_acquire()
> of course. 
> 
> > 
> > I guess that since you use cmpxchg in update_gt_cputime, the accesses
> > end up being ordered anyway, but it might be better to make that thing
> > very explicit.
> > 
> >                    Linus




* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-05 15:35 ` Frederic Weisbecker
  2015-03-05 15:56   ` Paul E. McKenney
@ 2015-03-06  0:06   ` Jason Low
  1 sibling, 0 replies; 19+ messages in thread
From: Jason Low @ 2015-03-06  0:06 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Peter Zijlstra, Ingo Molnar, Linus Torvalds, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	linux-kernel, jason.low2

On Thu, 2015-03-05 at 16:35 +0100, Frederic Weisbecker wrote:
> On Mon, Mar 02, 2015 at 10:42:11AM -0800, Jason Low wrote:

> > +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> > +static inline void sample_group_cputimer(struct task_cputime *times,
> > +			                 struct thread_group_cputimer *cputimer)
> > +{
> > +        times->utime = atomic64_read(&cputimer->utime);
> > +        times->stime = atomic64_read(&cputimer->stime);
> > +        times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
> 
> So, in the case we are calling that right after setting cputimer->running, I guess we are fine
> because we just updated cputimer with the freshest values.
> 
> But if we are reading this a while after, say several ticks further, there is a chance that
> we read stale values since we don't lock anymore.
> 
> I don't know if it matters or not, I guess it depends how stale it can be and how much precision
> we expect from posix cpu timers. It probably doesn't matter.
> 
> But just in case, atomic64_read_return(&cputimer->utime, 0) would make sure we get the freshest
> value because it performs a full barrier, at the cost of more overhead of course.

(Assuming that is atomic64_add_return :))

Yeah, there aren't any guarantees that we read the freshest value, but
since the lock isn't used to serialize subsequent accesses of
times->utime, etc., the values can potentially become stale by the
time they get used anyway, even when we have the locking.

So I'm not sure if atomic64_add_return(&time, 0) for the reads would
really provide much of a benefit when we factor in the extra overhead.



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-02 21:49   ` Jason Low
@ 2015-03-19 17:21     ` Jason Low
  2015-03-19 17:59       ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: Jason Low @ 2015-03-19 17:21 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Ingo Molnar, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List, jason.low2

On Mon, 2015-03-02 at 13:49 -0800, Jason Low wrote:
> On Mon, 2015-03-02 at 11:03 -0800, Linus Torvalds wrote:
> > On Mon, Mar 2, 2015 at 10:42 AM, Jason Low <jason.low2@hp.com> wrote:
> > >
> > > This patch converts the timers to 64 bit atomic variables and use
> > > atomic add to update them without a lock. With this patch, the percent
> > > of total time spent updating thread group cputimer timers was reduced
> > > from 30% down to less than 1%.
> > 
> > NAK.
> > 
> > Not because I think this is wrong, but because somebody needs to look
> > at the effects on 32-bit architectures too.
> 
> Okay, I will run some tests to see how this change affects the
> performance of itimers on 32 bit systems.

Hi Linus,

I tested this patch on a 32 bit ARM system with 4 cores. Using the
generic 64 bit atomics, I did not see any performance change with this
patch, and the relevant functions (account_group_*_time(), etc.) don't
show up in perf reports.

One factor might be that locking/cacheline contention isn't as
apparent on smaller systems to begin with, and lib/atomic64.c also
mentions that "this is expected to used on systems with small numbers of
CPUs (<= 4 or so)".



* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-19 17:21     ` Jason Low
@ 2015-03-19 17:59       ` Linus Torvalds
  2015-03-19 20:14         ` Jason Low
  0 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2015-03-19 17:59 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Ingo Molnar, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List

On Thu, Mar 19, 2015 at 10:21 AM, Jason Low <jason.low2@hp.com> wrote:
>
> I tested this patch on a 32 bit ARM system with 4 cores. Using the
> generic 64 bit atomics, I did not see any performance change with this
> patch, and the relevant functions (account_group_*_time(), ect...) don't
> show up in perf reports.

Ok.

> One factor might be because locking/cacheline contention isn't as
> apparent on smaller systems to begin with, and lib/atomic64.c also
> mentions that "this is expected to used on systems with small numbers of
> CPUs (<= 4 or so)".

Yeah, that's probably a valid argument anyway - 32-bit systems aren't
really going to be multi-node big systems any more.

So I'm ok with the patch,

                      Linus


* Re: [PATCH v2] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-03-19 17:59       ` Linus Torvalds
@ 2015-03-19 20:14         ` Jason Low
  0 siblings, 0 replies; 19+ messages in thread
From: Jason Low @ 2015-03-19 20:14 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Ingo Molnar, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Rik van Riel,
	Steven Rostedt, Scott Norton, Aswin Chandramouleeswaran,
	Linux Kernel Mailing List, jason.low2

On Thu, 2015-03-19 at 10:59 -0700, Linus Torvalds wrote:
> On Thu, Mar 19, 2015 at 10:21 AM, Jason Low <jason.low2@hp.com> wrote:
> >
> > I tested this patch on a 32 bit ARM system with 4 cores. Using the
> > generic 64 bit atomics, I did not see any performance change with this
> > patch, and the relevant functions (account_group_*_time(), ect...) don't
> > show up in perf reports.
> 
> Ok.
> 
> > One factor might be because locking/cacheline contention isn't as
> > apparent on smaller systems to begin with, and lib/atomic64.c also
> > mentions that "this is expected to used on systems with small numbers of
> > CPUs (<= 4 or so)".
> 
> Yeah, that's probably a valid argument anyway - 32-bit systems aren't
> really going to be multi-node big systems any more.
> 
> So I'm ok with the patch,

Okay, I will be sending out a v3 patch which will also mention a bit
about its effect on 32-bit systems in the changelog, in addition to the
changes that were discussed in this thread (using WRITE_ONCE(),
etc.).

Thanks,
Jason


