* [PATCH 0/3] sched, timer: Improve scalability of itimers
@ 2015-04-14 23:09 Jason Low
  2015-04-14 23:09 ` [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler Jason Low
                   ` (3 more replies)
  0 siblings, 4 replies; 34+ messages in thread
From: Jason Low @ 2015-04-14 23:09 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, Jason Low

This patchset improves the scalability of itimers and thread_group_cputimer,
and addresses a performance issue we found while running a database
workload, where more than 30% of total time was spent in the kernel
trying to acquire the thread_group_cputimer spinlock.

While we're modifying sched and timer, patch 1 also replaces all existing
usages of ACCESS_ONCE in those areas with the new READ_ONCE and WRITE_ONCE
APIs.



* [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-14 23:09 [PATCH 0/3] sched, timer: Improve scalability of itimers Jason Low
@ 2015-04-14 23:09 ` Jason Low
  2015-04-14 23:59   ` Steven Rostedt
  2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 34+ messages in thread
From: Jason Low @ 2015-04-14 23:09 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, Jason Low

ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes
the remaining usages of ACCESS_ONCE in the scheduler and replaces them with
the new READ_ONCE and WRITE_ONCE APIs.
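
As a minimal before/after illustration of the conversion pattern (a sketch
on a hypothetical counter, not taken from the diff below):

	/*
	 * Sketch only.  ACCESS_ONCE() casts the object to a volatile
	 * lvalue -- (*(volatile typeof(x) *)&(x)) -- which some GCC
	 * versions silently drop for non-scalar types.  READ_ONCE() and
	 * WRITE_ONCE() use size-specific volatile accesses instead, and
	 * also make the direction of each access explicit.
	 */
	static unsigned long example_seq;	/* hypothetical variable */

	static inline void example_bump_seq(void)
	{
		/* was: ACCESS_ONCE(example_seq)++; */
		WRITE_ONCE(example_seq, READ_ONCE(example_seq) + 1);
	}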

Signed-off-by: Jason Low <jason.low2@hp.com>
---
 include/linux/sched.h          |    4 ++--
 kernel/fork.c                  |    2 +-
 kernel/sched/auto_group.c      |    2 +-
 kernel/sched/auto_group.h      |    2 +-
 kernel/sched/core.c            |    4 ++--
 kernel/sched/cputime.c         |    2 +-
 kernel/sched/deadline.c        |    2 +-
 kernel/sched/fair.c            |   14 +++++++-------
 kernel/sched/proc.c            |    4 ++--
 kernel/sched/rt.c              |    2 +-
 kernel/sched/sched.h           |    2 +-
 kernel/sched/wait.c            |    4 ++--
 kernel/time/posix-cpu-timers.c |    8 ++++----
 13 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d77432..379fb3b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3070,13 +3070,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 static inline unsigned long task_rlimit(const struct task_struct *tsk,
 		unsigned int limit)
 {
-	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
 }
 
 static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
 		unsigned int limit)
 {
-	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
 }
 
 static inline unsigned long rlimit(unsigned int limit)
diff --git a/kernel/fork.c b/kernel/fork.c
index cf65139..d96a0ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1045,7 +1045,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	/* Thread group counters. */
 	thread_group_cputime_init(sig);
 
-	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 		sig->cputimer.running = 1;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index eae160d..a8653c2 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -141,7 +141,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+	if (!READ_ONCE(sysctl_sched_autogroup_enabled))
 		goto out;
 
 	for_each_thread(p, t)
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd0471..890c95f 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
-	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+	int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
 
 	if (enabled && task_wants_autogroup(p, tg))
 		return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e..4131c24 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -508,7 +508,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
 	struct thread_info *ti = task_thread_info(p);
-	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 
 	for (;;) {
 		if (!(val & _TIF_POLLING_NRFLAG))
@@ -2505,7 +2505,7 @@ void scheduler_tick(void)
 u64 scheduler_tick_max_deferment(void)
 {
 	struct rq *rq = this_rq();
-	unsigned long next, now = ACCESS_ONCE(jiffies);
+	unsigned long next, now = READ_ONCE(jiffies);
 
 	next = rq->last_sched_tick + HZ;
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1e..f5a64ff 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
 {
 	cputime_t old;
 
-	while (new > (old = ACCESS_ONCE(*counter)))
+	while (new > (old = READ_ONCE(*counter)))
 		cmpxchg_cputime(counter, old, new);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3fa8fa6..fa43e3f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -932,7 +932,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3..6acfda0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -832,7 +832,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 	unsigned int scan, floor;
 	unsigned int windows = 1;
 
@@ -1777,7 +1777,7 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
-	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
@@ -1921,7 +1921,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	}
 
 	rcu_read_lock();
-	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
@@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }
 
@@ -4301,7 +4301,7 @@ static unsigned long capacity_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -5946,8 +5946,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
-	age_stamp = ACCESS_ONCE(rq->age_stamp);
-	avg = ACCESS_ONCE(rq->rt_avg);
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
 	delta = __rq_clock_broken(rq) - age_stamp;
 
 	if (unlikely(delta < 0))
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 8ecd552..361e6f1 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -526,7 +526,7 @@ static inline unsigned long get_rq_runnable_load(struct rq *rq)
  */
 void update_idle_cpu_load(struct rq *this_rq)
 {
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
 	unsigned long load = get_rq_runnable_load(this_rq);
 	unsigned long pending_updates;
 
@@ -548,7 +548,7 @@ void update_idle_cpu_load(struct rq *this_rq)
 void update_cpu_load_nohz(void)
 {
 	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
 	unsigned long pending_updates;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f4d4b07..7b00333 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1311,7 +1311,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dc0f435..949d46f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -688,7 +688,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static inline u64 __rq_clock_broken(struct rq *rq)
 {
-	return ACCESS_ONCE(rq->clock);
+	return READ_ONCE(rq->clock);
 }
 
 static inline u64 rq_clock(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a..2ccec98 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
 
 __sched int bit_wait_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
 
 __sched int bit_wait_io_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da7..e072d98 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -852,10 +852,10 @@ static void check_thread_timers(struct task_struct *tsk,
 	/*
 	 * Check for the special case thread timers.
 	 */
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
 		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -958,11 +958,11 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGPROF);
 	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
 			 SIGVTALRM);
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
 		cputime_t x;
 		if (psecs >= hard) {
 			/*
-- 
1.7.2.5



* [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-14 23:09 [PATCH 0/3] sched, timer: Improve scalability of itimers Jason Low
  2015-04-14 23:09 ` [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler Jason Low
@ 2015-04-14 23:09 ` Jason Low
  2015-04-15  7:33   ` Ingo Molnar
                     ` (3 more replies)
  2015-04-14 23:09 ` [PATCH 3/3] sched, timer: Use cmpxchg to do updates in update_gt_cputime() Jason Low
  2015-04-14 23:53 ` [PATCH 0/3] sched, timer: Improve scalability of itimers Linus Torvalds
  3 siblings, 4 replies; 34+ messages in thread
From: Jason Low @ 2015-04-14 23:09 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, Jason Low

While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain a
thread_group_cputimer's spinlock to update the timers. On larger systems
(such as a 16 socket machine), this resulted in more than 30% of total
time being spent trying to acquire this kernel lock to update these group
timer stats.

This patch converts the timers to 64 bit atomic variables and uses
atomic adds to update them without a lock. With this patch, the percent
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Note: On 32 bit systems using the generic 64 bit atomics, this causes
sample_group_cputimer() to take locks 3 times instead of just 1 time.
However, we tested this patch on a 32 bit ARM system using the
generic atomics and did not find the overhead to be much of an issue.
One explanation for why this isn't an issue is that 32 bit systems usually
have small numbers of CPUs, and the cacheline contention from the extra
spinlocks taken periodically is not really noticeable on such systems.

Signed-off-by: Jason Low <jason.low2@hp.com>
---
 include/linux/init_task.h      |    7 +++--
 include/linux/sched.h          |   10 ++-----
 kernel/fork.c                  |    3 --
 kernel/sched/stats.h           |   12 ++-------
 kernel/time/posix-cpu-timers.c |   48 ++++++++++++++++++++--------------------
 5 files changed, 34 insertions(+), 46 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d223..7b9d8b5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
 	.cputimer	= { 						\
-		.cputime = INIT_CPUTIME,				\
-		.running = 0,						\
-		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+		.utime 		  = ATOMIC64_INIT(0),			\
+		.stime		  = ATOMIC64_INIT(0),			\
+		.sum_exec_runtime = ATOMIC64_INIT(0),			\
+		.running 	  = 0					\
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 379fb3b..a5bb23b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -592,9 +592,10 @@ struct task_cputime {
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
-	struct task_cputime cputime;
+	atomic64_t utime;
+	atomic64_t stime;
+	atomic64_t sum_exec_runtime;
 	int running;
-	raw_spinlock_t lock;
 };
 
 #include <linux/rwsem.h>
@@ -2952,11 +2953,6 @@ static __always_inline bool need_resched(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	raw_spin_lock_init(&sig->cputimer.lock);
-}
-
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/kernel/fork.c b/kernel/fork.c
index d96a0ca..da8e6dd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1042,9 +1042,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
 	unsigned long cpu_limit;
 
-	/* Thread group counters. */
-	thread_group_cputime_init(sig);
-
 	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab7043..adda94e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->utime);
 }
 
 /**
@@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->stime);
 }
 
 /**
@@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->sum_exec_runtime);
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d98..7e96082 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,44 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	if (sum->utime > atomic64_read(&cputimer->utime))
+		atomic64_set(&cputimer->utime, sum->utime);
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+	if (sum->stime > atomic64_read(&cputimer->stime))
+		atomic64_set(&cputimer->stime, sum->stime);
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+	if (sum->sum_exec_runtime > atomic64_read(&cputimer->sum_exec_runtime))
+		atomic64_set(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+}
+
+/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
+static inline void sample_group_cputimer(struct task_cputime *times,
+					  struct thread_group_cputimer *cputimer)
+{
+	times->utime = atomic64_read(&cputimer->utime);
+	times->stime = atomic64_read(&cputimer->stime);
+	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
 	if (!cputimer->running) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(cputimer, &sum);
+		/* Start 'running' after update_gt_cputime() */
+		smp_store_release(&cputimer->running, 1);
+	}
+	sample_group_cputimer(times, cputimer);
 }
 
 /*
@@ -885,11 +890,8 @@ static void check_thread_timers(struct task_struct *tsk,
 static void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -1114,9 +1116,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (sig->cputimer.running) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_group_cputimer(&group_sample, &sig->cputimer);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
-- 
1.7.2.5



* [PATCH 3/3] sched, timer: Use cmpxchg to do updates in update_gt_cputime()
  2015-04-14 23:09 [PATCH 0/3] sched, timer: Improve scalability of itimers Jason Low
  2015-04-14 23:09 ` [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler Jason Low
  2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
@ 2015-04-14 23:09 ` Jason Low
  2015-04-14 23:53 ` [PATCH 0/3] sched, timer: Improve scalability of itimers Linus Torvalds
  3 siblings, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-14 23:09 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, Jason Low

Note: The race which this patch addresses seems very unlikely to occur,
especially after the change in patch 2, which sets the running field only
after calling update_gt_cputime().

However, I am including this patch in case we want to be completely safe
from concurrent updates.

-----------------------------------------------------------------------------

Since we're now updating thread group cputimer values without a lock,
there is a potential race in update_gt_cputime(): the cputimers can be
concurrently updated in account_group_*_time() while update_gt_cputime()
is running.

This can occur when the ->running field transitions from 1 -> 0 -> 1.
Suppose cputimer->running is set while thread 1 runs
run_posix_cpu_timers(); thread 2 then turns off cputimer->running before
thread 1 enters thread_group_cputimer(); and thread 3 enables it again
after thread 1 has checked !cputimer->running in thread_group_cputimer().
In that case update_gt_cputime() ends up updating the cputimers while the
cputimer is running.
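
Roughly, the interleaving looks like this (illustrative only):

	thread 1                           thread 2 / thread 3
	--------                           -------------------
	run_posix_cpu_timers() starts;
	cputimer->running is still 1
	                                   thread 2 clears cputimer->running
	enters thread_group_cputimer(),
	sees !cputimer->running
	                                   thread 3 sets cputimer->running;
	                                   account_group_*_time() now does
	                                   lockless atomic64_add() updates
	update_gt_cputime() runs and can
	race with those concurrent updates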

This patch uses cmpxchg and retry logic to ensure that update_gt_cputime()
is making its updates atomically.

Signed-off-by: Jason Low <jason.low2@hp.com>
---
 kernel/time/posix-cpu-timers.c |   26 ++++++++++++++++++--------
 1 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 7e96082..130d717 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,16 +196,26 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (sum->utime > atomic64_read(&cputimer->utime))
-		atomic64_set(&cputimer->utime, sum->utime);
-
-	if (sum->stime > atomic64_read(&cputimer->stime))
-		atomic64_set(&cputimer->stime, sum->stime);
+	u64 curr_cputime;
+	/*
+	 * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+	 * to avoid race conditions with concurrent updates to cputime.
+	 */
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (sum->sum_exec_runtime > atomic64_read(&cputimer->sum_exec_runtime))
-		atomic64_set(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputimer->utime, sum->utime);
+	__update_gt_cputime(&cputimer->stime, sum->stime);
+	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
 }
 
 /* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
-- 
1.7.2.5



* Re: [PATCH 0/3] sched, timer: Improve scalability of itimers
  2015-04-14 23:09 [PATCH 0/3] sched, timer: Improve scalability of itimers Jason Low
                   ` (2 preceding siblings ...)
  2015-04-14 23:09 ` [PATCH 3/3] sched, timer: Use cmpxchg to do updates in update_gt_cputime() Jason Low
@ 2015-04-14 23:53 ` Linus Torvalds
  2015-04-15  7:24   ` Ingo Molnar
  3 siblings, 1 reply; 34+ messages in thread
From: Linus Torvalds @ 2015-04-14 23:53 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	Linux Kernel Mailing List, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Mel Gorman,
	Steven Rostedt, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton

On Tue, Apr 14, 2015 at 4:09 PM, Jason Low <jason.low2@hp.com> wrote:
> This patchset improves the scalability of itimers, thread_group_cputimer
> and addresses a performance issue we found while running a database
> workload where more than 30% of total time is spent in the kernel
> trying to acquire the thread_group_cputimer spinlock.

I'm ok with this series, but I'm going to assume that I'll get it
through the scheduler tree. Ingo?

                        Linus


* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-14 23:09 ` [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler Jason Low
@ 2015-04-14 23:59   ` Steven Rostedt
  2015-04-15  2:12     ` Jason Low
  0 siblings, 1 reply; 34+ messages in thread
From: Steven Rostedt @ 2015-04-14 23:59 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton

On Tue, 14 Apr 2015 16:09:44 -0700
Jason Low <jason.low2@hp.com> wrote:


> @@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>  
>  static void reset_ptenuma_scan(struct task_struct *p)
>  {
> -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
> +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);

Is the READ_ONCE() inside the WRITE_ONCE() really necessary?

>  	p->mm->numa_scan_offset = 0;
>  }
>  

-- Steve


* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-14 23:59   ` Steven Rostedt
@ 2015-04-15  2:12     ` Jason Low
  2015-04-15  2:40       ` Steven Rostedt
  0 siblings, 1 reply; 34+ messages in thread
From: Jason Low @ 2015-04-15  2:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton,
	jason.low2

Hi Steven,

On Tue, 2015-04-14 at 19:59 -0400, Steven Rostedt wrote:
> On Tue, 14 Apr 2015 16:09:44 -0700
> Jason Low <jason.low2@hp.com> wrote:
> 
> 
> > @@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> >  
> >  static void reset_ptenuma_scan(struct task_struct *p)
> >  {
> > -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
> > +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> 
> Is the READ_ONCE() inside the WRITE_ONCE() really necessary?

Yeah, I think so, to be safe; otherwise, the access of
p->mm->numa_scan_seq in the 2nd parameter doesn't get the volatile
cast.
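
To make that concrete (illustrative only, not a further change to the patch):

	/* Both the load and the store are "once" accesses: */
	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);

	/*
	 * Without the inner READ_ONCE(), only the store is protected; the
	 * load of p->mm->numa_scan_seq is a plain access which the compiler
	 * is free to refetch or combine with other accesses:
	 */
	WRITE_ONCE(p->mm->numa_scan_seq, p->mm->numa_scan_seq + 1);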



* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15  2:12     ` Jason Low
@ 2015-04-15  2:40       ` Steven Rostedt
  2015-04-15  7:46         ` Ingo Molnar
  2015-04-16  2:29         ` Jason Low
  0 siblings, 2 replies; 34+ messages in thread
From: Steven Rostedt @ 2015-04-15  2:40 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton

On Tue, 14 Apr 2015 19:12:33 -0700
Jason Low <jason.low2@hp.com> wrote:

> Hi Steven,
> 
> On Tue, 2015-04-14 at 19:59 -0400, Steven Rostedt wrote:
> > On Tue, 14 Apr 2015 16:09:44 -0700
> > Jason Low <jason.low2@hp.com> wrote:
> > 
> > 
> > > @@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> > >  
> > >  static void reset_ptenuma_scan(struct task_struct *p)
> > >  {
> > > -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
> > > +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> > 
> > Is the READ_ONCE() inside the WRITE_ONCE() really necessary?
> 
> Yeah, I think so to be safe, otherwise, the access of
> p->mm->numa_scan_seq in the 2nd parameter doesn't have the volatile
> cast.

You are correct. Now I'm thinking that the WRITE_ONCE() is not needed,
and just a:

	p->mm->numa_scan_seq = READ_ONCE(p->mm->numa_scan_seq) + 1;

can be done. But I'm still trying to wrap my head around why this is
needed here. Comments would have been really helpful. We should make
all READ_ONCE(), WRITE_ONCE() and obsolete ACCESS_ONCE() usages have
mandatory comments, just like we do with memory barriers.

-- Steve



* Re: [PATCH 0/3] sched, timer: Improve scalability of itimers
  2015-04-14 23:53 ` [PATCH 0/3] sched, timer: Improve scalability of itimers Linus Torvalds
@ 2015-04-15  7:24   ` Ingo Molnar
  0 siblings, 0 replies; 34+ messages in thread
From: Ingo Molnar @ 2015-04-15  7:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jason Low, Peter Zijlstra, Thomas Gleixner,
	Linux Kernel Mailing List, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Mel Gorman,
	Steven Rostedt, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Tue, Apr 14, 2015 at 4:09 PM, Jason Low <jason.low2@hp.com> wrote:
> > This patchset improves the scalability of itimers, thread_group_cputimer
> > and addresses a performance issue we found while running a database
> > workload where more than 30% of total time is spent in the kernel
> > trying to acquire the thread_group_cputimer spinlock.
> 
> I'm ok with this series, but I'm going to assume that I'll get it
> through the scheduler tree. Ingo?

Yeah, will pick them up!

Thanks,

	Ingo


* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
@ 2015-04-15  7:33   ` Ingo Molnar
  2015-04-15  7:35     ` Ingo Molnar
  2015-04-15 10:37   ` Preeti U Murthy
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2015-04-15  7:33 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Thomas Gleixner, linux-kernel,
	Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Steven Rostedt, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton


* Jason Low <jason.low2@hp.com> wrote:

> While running a database workload, we found a scalability issue with itimers.
> 
> Much of the problem was caused by the thread_group_cputimer spinlock.

So I'm fine with the basic principle, but in the hope that maybe 
posix-cpu-timers will grow similar optimizations in the future, it 
would help to have the new data type factored out better, not 
open-coded:

>  struct thread_group_cputimer {
> -	struct task_cputime cputime;
> +	atomic64_t utime;
> +	atomic64_t stime;
> +	atomic64_t sum_exec_runtime;
>  	int running;
> -	raw_spinlock_t lock;
>  };

So after your changes we still have a separate:

struct task_cputime {
        cputime_t utime;
        cputime_t stime;
        unsigned long long sum_exec_runtime;
};

Which then weirdly overlaps with a different structure on a different 
abstraction level:

 struct thread_group_cputimer {
	atomic64_t utime;
	atomic64_t stime;
	atomic64_t sum_exec_runtime;
 	int running;
 };

So I think it would be more obvious what's going on if we introduced 
an atomic task_cputime structure:

 struct task_cputime_atomic {
	atomic64_t utime;
	atomic64_t stime;
	atomic64_t sum_exec_runtime;
 };

and put that into 'struct thread_group_cputimer':

 struct thread_group_cputimer {
	struct task_cputime_atomic cputime_atomic;
  	int running;
 };

Maybe even factor out the main update and reading methods into 
expressively named helper inlines?
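
For example (rough sketch only, the name is just a suggestion):

 static inline void sample_cputime_atomic(struct task_cputime *times,
					  struct task_cputime_atomic *atomic_times)
 {
	/* Each field is read atomically; the trio is not one atomic snapshot: */
	times->utime = atomic64_read(&atomic_times->utime);
	times->stime = atomic64_read(&atomic_times->stime);
	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }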

Thanks,

	Ingo


* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-15  7:33   ` Ingo Molnar
@ 2015-04-15  7:35     ` Ingo Molnar
  2015-04-15 17:14       ` Jason Low
  0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2015-04-15  7:35 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Thomas Gleixner, linux-kernel,
	Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Steven Rostedt, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton


* Ingo Molnar <mingo@kernel.org> wrote:

> So after your changes we still have a separate:
> 
> struct task_cputime {
>         cputime_t utime;
>         cputime_t stime;
>         unsigned long long sum_exec_runtime;
> };
> 
> Which then weirdly overlaps with a different structure on a different 
> abstraction level:
> 
>  struct thread_group_cputimer {
> 	atomic64_t utime;
> 	atomic64_t stime;
> 	atomic64_t sum_exec_runtime;
>  	int running;
>  };
> 
> So I think it would be more obvious what's going on if we introduced 
> an atomic task_cputime structure:
> 
>  struct task_cputime_atomic {
> 	atomic64_t utime;
> 	atomic64_t stime;
> 	atomic64_t sum_exec_runtime;
>  };
> 
> and put that into 'struct thread_group_cputimer':
> 
>  struct thread_group_cputimer {
> 	struct task_cputime_atomic cputime_atomic;
>   	int running;
>  };
> 
> Maybe even factor out the main update and reading methods into 
> expressively named helper inlines?

Btw., feel free to preserve your original series and turn this 
factoring out into 1-2 extra patches on top of it: so that we preserve 
your testing on the original series, and see the structure (and cost) 
of the factoring out of the new data type.

Thanks,

	Ingo


* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15  2:40       ` Steven Rostedt
@ 2015-04-15  7:46         ` Ingo Molnar
  2015-04-15 18:49           ` Jason Low
                             ` (2 more replies)
  2015-04-16  2:29         ` Jason Low
  1 sibling, 3 replies; 34+ messages in thread
From: Ingo Molnar @ 2015-04-15  7:46 UTC (permalink / raw)
  To: Steven Rostedt, Peter Zijlstra, Mel Gorman, Rik van Riel
  Cc: Jason Low, Linus Torvalds, Peter Zijlstra, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 14 Apr 2015 19:12:33 -0700
> Jason Low <jason.low2@hp.com> wrote:
> 
> > Hi Steven,
> > 
> > On Tue, 2015-04-14 at 19:59 -0400, Steven Rostedt wrote:
> > > On Tue, 14 Apr 2015 16:09:44 -0700
> > > Jason Low <jason.low2@hp.com> wrote:
> > > 
> > > 
> > > > @@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
> > > >  
> > > >  static void reset_ptenuma_scan(struct task_struct *p)
> > > >  {
> > > > -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
> > > > +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
> > > 
> > > Is the READ_ONCE() inside the WRITE_ONCE() really necessary?
> > 
> > Yeah, I think so to be safe, otherwise, the access of
> > p->mm->numa_scan_seq in the 2nd parameter doesn't have the volatile
> > cast.
> 
> You are correct. Now I'm thinking that the WRITE_ONCE() is not needed,
> and just a:
> 
> 	p->mm->numa_scan_seq = READ_ONCE(p->mm->numa_scan_seq) + 1;
> 
> Can be done. But I'm still trying to wrap my head around why this is
> needed here. Comments would have been really helpful. We should make
> all READ_ONCE() WRITE_ONCE and obsolete ACCESS_ONCE() have mandatory
> comments just like we do with memory barriers.

So the original ACCESS_ONCE() barriers were misguided to begin with: I 
think they tried to handle races with the scheduler balancing softirq 
and tried to avoid having to use atomics for the sequence counter 
(which would be overkill), but things like ACCESS_ONCE(x)++ never 
guaranteed atomicity (or even coherency) of the update.

But since in reality this is only statistical sampling code, all these 
compiler barriers can be removed I think. Peter, Mel, Rik, do you 
agree?

Thanks,

	Ingo


* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
  2015-04-15  7:33   ` Ingo Molnar
@ 2015-04-15 10:37   ` Preeti U Murthy
  2015-04-15 19:09     ` Jason Low
  2015-04-15 13:25   ` Frederic Weisbecker
  2015-04-15 14:23   ` Davidlohr Bueso
  3 siblings, 1 reply; 34+ messages in thread
From: Preeti U Murthy @ 2015-04-15 10:37 UTC (permalink / raw)
  To: Jason Low, Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner
  Cc: linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton

On 04/15/2015 04:39 AM, Jason Low wrote:
>  /*
> @@ -885,11 +890,8 @@ static void check_thread_timers(struct task_struct *tsk,
>  static void stop_process_timers(struct signal_struct *sig)
>  {
>  	struct thread_group_cputimer *cputimer = &sig->cputimer;
> -	unsigned long flags;
> 
> -	raw_spin_lock_irqsave(&cputimer->lock, flags);
> -	cputimer->running = 0;
> -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +	WRITE_ONCE(cputimer->running, 0);

Why do a WRITE_ONCE() here? Maybe you should explicitly mention this
through a comment, like Steven pointed out about all
WRITE/READ/ACCESS_ONCE() usage.

Regards
Preeti U Murthy
>  }
> 
>  static u32 onecputick;
> @@ -1114,9 +1116,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
>  	if (sig->cputimer.running) {
>  		struct task_cputime group_sample;
> 
> -		raw_spin_lock(&sig->cputimer.lock);
> -		group_sample = sig->cputimer.cputime;
> -		raw_spin_unlock(&sig->cputimer.lock);
> +		sample_group_cputimer(&group_sample, &sig->cputimer);
> 
>  		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
>  			return 1;
> 



* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
  2015-04-15  7:33   ` Ingo Molnar
  2015-04-15 10:37   ` Preeti U Murthy
@ 2015-04-15 13:25   ` Frederic Weisbecker
  2015-04-15 13:32     ` Peter Zijlstra
  2015-04-15 14:23   ` Davidlohr Bueso
  3 siblings, 1 reply; 34+ messages in thread
From: Frederic Weisbecker @ 2015-04-15 13:25 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Mel Gorman, Steven Rostedt, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton

On Tue, Apr 14, 2015 at 04:09:45PM -0700, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
> 
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), this caused more than 30% of total time
> spent trying to obtain this kernel lock to update these group timer stats.
> 
> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%. 
> 
> Note: On 32 bit systems using the generic 64 bit atomics, this causes 
> sample_group_cputimer() to take locks 3 times instead of just 1 time.
> However, we tested this patch on a 32 bit system ARM system using the
> generic atomics and did not find the overhead to be much of an issue.
> An explanation for why this isn't an issue is that 32 bit systems usually
> have small numbers of CPUs, and cacheline contention from extra spinlocks
> called periodically is not really apparent on smaller systems.
> 
> Signed-off-by: Jason Low <jason.low2@hp.com>
> ---
>  include/linux/init_task.h      |    7 +++--
>  include/linux/sched.h          |   10 ++-----
>  kernel/fork.c                  |    3 --
>  kernel/sched/stats.h           |   12 ++-------
>  kernel/time/posix-cpu-timers.c |   48 ++++++++++++++++++++--------------------
>  5 files changed, 34 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 696d223..7b9d8b5 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
>  	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
>  	.rlim		= INIT_RLIMITS,					\
>  	.cputimer	= { 						\
> -		.cputime = INIT_CPUTIME,				\
> -		.running = 0,						\
> -		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
> +		.utime 		  = ATOMIC64_INIT(0),			\
> +		.stime		  = ATOMIC64_INIT(0),			\
> +		.sum_exec_runtime = ATOMIC64_INIT(0),			\
> +		.running 	  = 0					\
>  	},								\
>  	.cred_guard_mutex =						\
>  		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 379fb3b..a5bb23b 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -592,9 +592,10 @@ struct task_cputime {
>   * used for thread group CPU timer calculations.
>   */
>  struct thread_group_cputimer {
> -	struct task_cputime cputime;
> +	atomic64_t utime;
> +	atomic64_t stime;
> +	atomic64_t sum_exec_runtime;
>  	int running;
> -	raw_spinlock_t lock;
>  };
>  
>  #include <linux/rwsem.h>
> @@ -2952,11 +2953,6 @@ static __always_inline bool need_resched(void)
>  void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
>  
> -static inline void thread_group_cputime_init(struct signal_struct *sig)
> -{
> -	raw_spin_lock_init(&sig->cputimer.lock);
> -}
> -
>  /*
>   * Reevaluate whether the task has signals pending delivery.
>   * Wake the task if so.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d96a0ca..da8e6dd 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1042,9 +1042,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
>  {
>  	unsigned long cpu_limit;
>  
> -	/* Thread group counters. */
> -	thread_group_cputime_init(sig);
> -
>  	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
>  	if (cpu_limit != RLIM_INFINITY) {
>  		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 4ab7043..adda94e 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -215,9 +215,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.utime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->utime);
>  }
>  
>  /**
> @@ -238,9 +236,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.stime += cputime;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(cputime, &cputimer->stime);
>  }
>  
>  /**
> @@ -261,7 +257,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
>  	if (!cputimer_running(tsk))
>  		return;
>  
> -	raw_spin_lock(&cputimer->lock);
> -	cputimer->cputime.sum_exec_runtime += ns;
> -	raw_spin_unlock(&cputimer->lock);
> +	atomic64_add(ns, &cputimer->sum_exec_runtime);
>  }
> diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
> index e072d98..7e96082 100644
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -196,39 +196,44 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
>  	return 0;
>  }
>  
> -static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
> +static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
>  {
> -	if (b->utime > a->utime)
> -		a->utime = b->utime;
> +	if (sum->utime > atomic64_read(&cputimer->utime))
> +		atomic64_set(&cputimer->utime, sum->utime);
>  
> -	if (b->stime > a->stime)
> -		a->stime = b->stime;
> +	if (sum->stime > atomic64_read(&cputimer->stime))
> +		atomic64_set(&cputimer->stime, sum->stime);
>  
> -	if (b->sum_exec_runtime > a->sum_exec_runtime)
> -		a->sum_exec_runtime = b->sum_exec_runtime;
> +	if (sum->sum_exec_runtime > atomic64_read(&cputimer->sum_exec_runtime))
> +		atomic64_set(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
> +}
> +
> +/* Sample thread_group_cputimer values in "cputimer", copy results to "times" */
> +static inline void sample_group_cputimer(struct task_cputime *times,
> +					  struct thread_group_cputimer *cputimer)
> +{
> +	times->utime = atomic64_read(&cputimer->utime);
> +	times->stime = atomic64_read(&cputimer->stime);
> +	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
>  }
>  
>  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
>  {
>  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
>  	struct task_cputime sum;
> -	unsigned long flags;
>  
>  	if (!cputimer->running) {
>  		/*
>  		 * The POSIX timer interface allows for absolute time expiry
>  		 * values through the TIMER_ABSTIME flag, therefore we have
> -		 * to synchronize the timer to the clock every time we start
> -		 * it.
> +		 * to synchronize the timer to the clock every time we start it.
>  		 */
>  		thread_group_cputime(tsk, &sum);
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -		cputimer->running = 1;
> -		update_gt_cputime(&cputimer->cputime, &sum);
> -	} else
> -		raw_spin_lock_irqsave(&cputimer->lock, flags);
> -	*times = cputimer->cputime;
> -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +		update_gt_cputime(cputimer, &sum);
> +		/* Start 'running' after update_gt_cputime() */
> +		smp_store_release(&cputimer->running, 1);

This barrier should be mirrored somewhere but I can't see where in this patch.
Maybe in another one in the series. Or maybe there is already a barrier in the
existing code that I'm missing. I would expect to see it in account_group_*_time().
In any case, there should be a comment about what it mirrors.

> +	}
> +	sample_group_cputimer(times, cputimer);
>  }
>  
>  /*
> @@ -885,11 +890,8 @@ static void check_thread_timers(struct task_struct *tsk,
>  static void stop_process_timers(struct signal_struct *sig)
>  {
>  	struct thread_group_cputimer *cputimer = &sig->cputimer;
> -	unsigned long flags;
>  
> -	raw_spin_lock_irqsave(&cputimer->lock, flags);
> -	cputimer->running = 0;
> -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> +	WRITE_ONCE(cputimer->running, 0);
>  }
>  
>  static u32 onecputick;
> @@ -1114,9 +1116,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
>  	if (sig->cputimer.running) {
>  		struct task_cputime group_sample;
>  
> -		raw_spin_lock(&sig->cputimer.lock);
> -		group_sample = sig->cputimer.cputime;
> -		raw_spin_unlock(&sig->cputimer.lock);
> +		sample_group_cputimer(&group_sample, &sig->cputimer);
>  
>  		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
>  			return 1;
> -- 
> 1.7.2.5
> 


* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-15 13:25   ` Frederic Weisbecker
@ 2015-04-15 13:32     ` Peter Zijlstra
  2015-04-15 20:04       ` Jason Low
  0 siblings, 1 reply; 34+ messages in thread
From: Peter Zijlstra @ 2015-04-15 13:32 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jason Low, Linus Torvalds, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Mel Gorman, Steven Rostedt, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton

On Wed, Apr 15, 2015 at 03:25:36PM +0200, Frederic Weisbecker wrote:
> On Tue, Apr 14, 2015 at 04:09:45PM -0700, Jason Low wrote:
> >  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
> >  {
> >  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
> >  	struct task_cputime sum;
> >  
> >  	if (!cputimer->running) {
> >  		/*
> >  		 * The POSIX timer interface allows for absolute time expiry
> >  		 * values through the TIMER_ABSTIME flag, therefore we have
> > +		 * to synchronize the timer to the clock every time we start it.
> >  		 */
> >  		thread_group_cputime(tsk, &sum);
> > +		update_gt_cputime(cputimer, &sum);
> > +		/* Start 'running' after update_gt_cputime() */
> > +		smp_store_release(&cputimer->running, 1);
> 
> This barrier should be mirrored somewhere but I can't see where in this patch.
> Maybe in another one in the series. Or maybe there is already a barrier in the
> existing code that I'm missing. I would expect to see it in account_group_*_time().
> In any case, there should be comment about what it mirrors.

I think it should be in cputimer_running(), which should use
smp_load_acquire() to read cputimer->running.

That way you guarantee that everything observing 'running' will indeed
observe the initialized state.
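
Something like this (sketch only, helper name made up):

 static inline int group_cputimer_running(struct thread_group_cputimer *cputimer)
 {
	/*
	 * Pairs with the smp_store_release(&cputimer->running, 1) in
	 * thread_group_cputimer(): anybody who observes running == 1 also
	 * observes the values written by update_gt_cputime().
	 */
	return smp_load_acquire(&cputimer->running);
 }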




* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
                     ` (2 preceding siblings ...)
  2015-04-15 13:25   ` Frederic Weisbecker
@ 2015-04-15 14:23   ` Davidlohr Bueso
  2015-04-15 21:15     ` Jason Low
  3 siblings, 1 reply; 34+ messages in thread
From: Davidlohr Bueso @ 2015-04-15 14:23 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton

On Tue, 2015-04-14 at 16:09 -0700, Jason Low wrote:
> While running a database workload, we found a scalability issue with itimers.
> 
> Much of the problem was caused by the thread_group_cputimer spinlock.
> Each time we account for group system/user time, we need to obtain a
> thread_group_cputimer's spinlock to update the timers. On larger systems
> (such as a 16 socket machine), this caused more than 30% of total time
> spent trying to obtain this kernel lock to update these group timer stats.
> 
> This patch converts the timers to 64 bit atomic variables and use
> atomic add to update them without a lock. With this patch, the percent
> of total time spent updating thread group cputimer timers was reduced
> from 30% down to less than 1%. 

What does 30% less time spent dealing with the thread_group_cputimer's
spinlock buy us? iow, does this help DB benchmark throughput or such?

Thanks,
Davidlohr



* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-15  7:35     ` Ingo Molnar
@ 2015-04-15 17:14       ` Jason Low
  0 siblings, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-15 17:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Peter Zijlstra, Thomas Gleixner, linux-kernel,
	Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Steven Rostedt, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton,
	jason.low2

On Wed, 2015-04-15 at 09:35 +0200, Ingo Molnar wrote:
> * Ingo Molnar <mingo@kernel.org> wrote:
> 
> > So after your changes we still have a separate:
> > 
> > struct task_cputime {
> >         cputime_t utime;
> >         cputime_t stime;
> >         unsigned long long sum_exec_runtime;
> > };
> > 
> > Which then weirdly overlaps with a different structure on a different 
> > abstraction level:
> > 
> >  struct thread_group_cputimer {
> > 	atomic64_t utime;
> > 	atomic64_t stime;
> > 	atomic64_t sum_exec_runtime;
> >  	int running;
> >  };
> > 
> > So I think it would be more obvious what's going on if we introduced 
> > an atomic task_cputime structure:
> > 
> >  struct task_cputime_atomic {
> > 	atomic64_t utime;
> > 	atomic64_t stime;
> > 	atomic64_t sum_exec_runtime;
> >  };
> > 
> > and put that into 'struct thread_group_cputimer':
> > 
> >  struct thread_group_cputimer {
> > 	struct task_cputime_atomic cputime_atomic;
> >   	int running;
> >  };
> > 
> > Maybe even factor out the main update and reading methods into 
> > expressively named helper inlines?
> 
> Btw., feel free to preserve your original series and turn this 
> factoring out into 1-2 extra patches on top of it: so that we preserve 
> your testing on the original series, and see the structure (and cost) 
> of the factoring out of the new data type.

Okay, I'll add a task_cputime_atomic.

That will convert things like:

    void sample_group_cputimer(struct task_cputime *times,
                               struct thread_group_cputimer *cputimer)

to

    void sample_atomic_cputimes(struct task_cputime *times
                                struct task_cputime_atomic *atomic_cputimes)

which makes more sense, and the new "task_cputime_atomic" can
potentially be used in other places.

Thanks,
Jason



* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15  7:46         ` Ingo Molnar
@ 2015-04-15 18:49           ` Jason Low
  2015-04-15 19:16             ` Steven Rostedt
  2015-04-16  2:46           ` Jason Low
  2015-04-16 16:52           ` Peter Zijlstra
  2 siblings, 1 reply; 34+ messages in thread
From: Jason Low @ 2015-04-15 18:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Peter Zijlstra, Mel Gorman, Rik van Riel,
	Linus Torvalds, Peter Zijlstra, Thomas Gleixner, linux-kernel,
	Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton, jason.low2

On Wed, 2015-04-15 at 09:46 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:

> > You are correct. Now I'm thinking that the WRITE_ONCE() is not needed,
> > and just a:
> > 
> > 	p->mm->numa_scan_seq = READ_ONCE(p->numa_scan_seq) + 1;
> > 
> > Can be done. But I'm still trying to wrap my head around why this is
> > needed here. Comments would have been really helpful. We should make
> > all READ_ONCE() WRITE_ONCE and obsolete ACCESS_ONCE() have mandatory
> > comments just like we do with memory barriers.
> 
> So the original ACCESS_ONCE() barriers were misguided to begin with: I 
> think they tried to handle races with the scheduler balancing softirq 
> and tried to avoid having to use atomics for the sequence counter 
> (which would be overkill), but things like ACCESS_ONCE(x)++ never 
> guaranteed atomicity (or even coherency) of the update.
> 
> But since in reality this is only statistical sampling code, all these 
> compiler barriers can be removed I think. Peter, Mel, Rik, do you 
> agree?

So I'll keep the READ_ONCE nested inside WRITE_ONCE for the purpose of
this patch, since it is a conversion from ACCESS_ONCE, but yes,
if the original purpose of ACCESS_ONCE was to do an atomic increment,
then the ACCESS_ONCE doesn't help with that.
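
(For reference, the conversion in question from patch 1 is the
numa_scan_seq increment:)

    -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
    +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);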


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-15 10:37   ` Preeti U Murthy
@ 2015-04-15 19:09     ` Jason Low
  0 siblings, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-15 19:09 UTC (permalink / raw)
  To: Preeti U Murthy
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton,
	jason.low2

On Wed, 2015-04-15 at 16:07 +0530, Preeti U Murthy wrote:
> On 04/15/2015 04:39 AM, Jason Low wrote:
> >  /*
> > @@ -885,11 +890,8 @@ static void check_thread_timers(struct task_struct *tsk,
> >  static void stop_process_timers(struct signal_struct *sig)
> >  {
> >  	struct thread_group_cputimer *cputimer = &sig->cputimer;
> > -	unsigned long flags;
> > 
> > -	raw_spin_lock_irqsave(&cputimer->lock, flags);
> > -	cputimer->running = 0;
> > -	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
> > +	WRITE_ONCE(cputimer->running, 0);
> 
> Why do a WRITE_ONCE() here ?

Perhaps Peter can confirm/elaborate, but since we're now updating the
running field without the lock, we use WRITE_ONCE to guarantee that the
store doesn't get torn or optimized away. This can also serve as "documentation"
that we're writing to a shared variable without a lock.

>  Maybe you should explicitly mention this
> through a comment like Steven pointed out about all
> WRITE/READ/ACCESS_ONCE() usage.

Yeah, we should add a comment here.
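
Something along these lines, perhaps (comment wording is only a
suggestion):

    static void stop_process_timers(struct signal_struct *sig)
    {
            struct thread_group_cputimer *cputimer = &sig->cputimer;

            /*
             * 'running' is now cleared without holding a lock; WRITE_ONCE
             * documents the lockless store and keeps the compiler from
             * tearing or eliding it.
             */
            WRITE_ONCE(cputimer->running, 0);
    }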

Thanks,
Jason


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15 18:49           ` Jason Low
@ 2015-04-15 19:16             ` Steven Rostedt
  0 siblings, 0 replies; 34+ messages in thread
From: Steven Rostedt @ 2015-04-15 19:16 UTC (permalink / raw)
  To: Jason Low
  Cc: Ingo Molnar, Peter Zijlstra, Mel Gorman, Rik van Riel,
	Linus Torvalds, Peter Zijlstra, Thomas Gleixner, linux-kernel,
	Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton

On Wed, 15 Apr 2015 11:49:09 -0700
Jason Low <jason.low2@hp.com> wrote:

> So I'll keep the READ_ONCE nested inside WRITE_ONCE for the purpose of
> this patch, since it is a conversion from ACCESS_ONCE, but yes,
> if the original purpose of ACCESS_ONCE was to do an atomic increment,
> then the ACCESS_ONCE doesn't help with that.

For the purpose of this patch, I think it's fine, as being more
paranoid is better than not being paranoid enough.

But this has shined light onto whether it is needed or not, and we
should figure that out in the not so far future.

-- Steve

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-15 13:32     ` Peter Zijlstra
@ 2015-04-15 20:04       ` Jason Low
  0 siblings, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-15 20:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Linus Torvalds, Ingo Molnar,
	Thomas Gleixner, linux-kernel, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, jason.low2

On Wed, 2015-04-15 at 15:32 +0200, Peter Zijlstra wrote:
> On Wed, Apr 15, 2015 at 03:25:36PM +0200, Frederic Weisbecker wrote:
> > On Tue, Apr 14, 2015 at 04:09:45PM -0700, Jason Low wrote:
> > >  void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
> > >  {
> > >  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
> > >  	struct task_cputime sum;
> > >  
> > >  	if (!cputimer->running) {
> > >  		/*
> > >  		 * The POSIX timer interface allows for absolute time expiry
> > >  		 * values through the TIMER_ABSTIME flag, therefore we have
> > > +		 * to synchronize the timer to the clock every time we start it.
> > >  		 */
> > >  		thread_group_cputime(tsk, &sum);
> > > +		update_gt_cputime(cputimer, &sum);
> > > +		/* Start 'running' after update_gt_cputime() */
> > > +		smp_store_release(&cputimer->running, 1);
> > 
> > This barrier should be mirrored somewhere but I can't see where in this patch.
> > Maybe in another one in the series. Or maybe there is already a barrier in the
> > existing code that I'm missing. I would expect to see it in account_group_*_time().
> > In any case, there should be comment about what it mirrors.
> 
> I think it should be in cputimer_running(), which should use
> smp_load_acquire() to read cputimer->running.
> 
> That way you guarantee that everything observing 'running' will indeed
> observe the initialized state.

So I intended the smp_store_release() here to be mainly for
documentation purposes, to say that we would like to set running after
the update.

With patch 3/3, even if running happens to get set earlier, the worst
case scenario is that update_gt_cputime may have to go through some
retry logic. This isn't much of a performance issue in practice
(especially compared to adding smp_load_acquire() to hot paths), since
we only enter this path when we need to enable the timers.

In that case, I'm wondering if we should just convert this back to
WRITE_ONCE(cputimer->running, 1) and avoid adding barriers to the hot
paths?
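
For comparison, the fully paired variant would look roughly like this
(sketch only; the concern is the extra acquire on the hot accounting
path):

    /* Slow path: enabling the timer. */
    thread_group_cputime(tsk, &sum);
    update_gt_cputime(cputimer, &sum);
    /* Pairs with the smp_load_acquire() below. */
    smp_store_release(&cputimer->running, 1);

    /* Hot path, e.g. in cputimer_running(): */
    if (!smp_load_acquire(&cputimer->running))
            return 0;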

Thanks,
Jason


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability
  2015-04-15 14:23   ` Davidlohr Bueso
@ 2015-04-15 21:15     ` Jason Low
  0 siblings, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-15 21:15 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Steven Rostedt,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, jason.low2

On Wed, 2015-04-15 at 07:23 -0700, Davidlohr Bueso wrote:
> On Tue, 2015-04-14 at 16:09 -0700, Jason Low wrote:
> > While running a database workload, we found a scalability issue with itimers.
> > 
> > Much of the problem was caused by the thread_group_cputimer spinlock.
> > Each time we account for group system/user time, we need to obtain a
> > thread_group_cputimer's spinlock to update the timers. On larger systems
> > (such as a 16 socket machine), this caused more than 30% of total time
> > spent trying to obtain this kernel lock to update these group timer stats.
> > 
> > This patch converts the timers to 64-bit atomic variables and uses
> > atomic adds to update them without a lock. With this patch, the percent
> > of total time spent updating thread group cputimer timers was reduced
> > from 30% down to less than 1%. 
> 
> What does 30% less time spent dealing with the thread_group_cputimer's
> spinlock buy us? iow, does this help DB benchmark throughput or such?

Yes, this also helps increase throughput since the scalability issue
causes a performance degradation in the database workload. I currently
don't have the exact numbers, though.
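
(For context, the accounting fast path essentially goes from a lock
round-trip to a single atomic add; roughly, and simplified:)

    /* Before: every accounting call bounces the shared spinlock. */
    raw_spin_lock(&cputimer->lock);
    cputimer->cputime.utime += cputime;
    raw_spin_unlock(&cputimer->lock);

    /* After: lockless update of the 64-bit atomic counter. */
    atomic64_add(cputime, &cputimer->utime);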


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15  2:40       ` Steven Rostedt
  2015-04-15  7:46         ` Ingo Molnar
@ 2015-04-16  2:29         ` Jason Low
  2015-04-16  2:37           ` Steven Rostedt
  1 sibling, 1 reply; 34+ messages in thread
From: Jason Low @ 2015-04-16  2:29 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton,
	jason.low2

On Tue, 2015-04-14 at 22:40 -0400, Steven Rostedt wrote:
> You are correct. Now I'm thinking that the WRITE_ONCE() is not needed,
> and just a:
> 
> 	p->mm->numa_scan_seq = READ_ONCE(p->numa_scan_seq) + 1;

Just to confirm, is this a typo? Because there really is a numa_scan_seq
in the task_struct itself too :)

p->mm->numa_scan_seq is read in task_numa_placement() with
ACCESS_ONCE(), so the benefit I do see is that doing the updates with
ACCESS_ONCE too keeps things consistent (for documentation purposes).

If that's really the case:

    WRITE_ONCE(p->mm->numa_scan_seq, p->mm->numa_scan_seq + 1)

should be enough for that.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16  2:29         ` Jason Low
@ 2015-04-16  2:37           ` Steven Rostedt
  0 siblings, 0 replies; 34+ messages in thread
From: Steven Rostedt @ 2015-04-16  2:37 UTC (permalink / raw)
  To: Jason Low
  Cc: Linus Torvalds, Peter Zijlstra, Ingo Molnar, Thomas Gleixner,
	linux-kernel, Paul E. McKenney, Andrew Morton, Oleg Nesterov,
	Mike Galbraith, Frederic Weisbecker, Mel Gorman, Preeti U Murthy,
	hideaki.kimura, Aswin Chandramouleeswaran, Scott J Norton

On Wed, 15 Apr 2015 19:29:01 -0700
Jason Low <jason.low2@hp.com> wrote:

> On Tue, 2015-04-14 at 22:40 -0400, Steven Rostedt wrote:
> > You are correct. Now I'm thinking that the WRITE_ONCE() is not needed,
> > and just a:
> > 
> > 	p->mm->numa_scan_seq = READ_ONCE(p->numa_scan_seq) + 1;
> 
> Just to confirm, is this a typo? Because there really is a numa_scan_seq
> in the task_struct itself too :)
> 

Oops, yeah that was a typo.

-- Steve

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15  7:46         ` Ingo Molnar
  2015-04-15 18:49           ` Jason Low
@ 2015-04-16  2:46           ` Jason Low
  2015-04-16 16:52           ` Peter Zijlstra
  2 siblings, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-16  2:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Peter Zijlstra, Mel Gorman, Rik van Riel,
	Linus Torvalds, Peter Zijlstra, Thomas Gleixner, linux-kernel,
	Paul E. McKenney, Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton, jason.low2

Hi Ingo,

On Wed, 2015-04-15 at 09:46 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> > You are correct. Now I'm thinking that the WRITE_ONCE() is not needed,
> > and just a:
> > 
> > 	p->mm->numa_scan_seq = READ_ONCE(p->numa_scan_seq) + 1;
> > 
> > Can be done. But I'm still trying to wrap my head around why this is
> > needed here. Comments would have been really helpful. We should make
> > all READ_ONCE() WRITE_ONCE and obsolete ACCESS_ONCE() have mandatory
> > comments just like we do with memory barriers.
> 
> So the original ACCESS_ONCE() barriers were misguided to begin with: I 
> think they tried to handle races with the scheduler balancing softirq 
> and tried to avoid having to use atomics for the sequence counter 
> (which would be overkill), but things like ACCESS_ONCE(x)++ never 
> guaranteed atomicity (or even coherency) of the update.
> 
> But since in reality this is only statistical sampling code, all these 
> compiler barriers can be removed I think. Peter, Mel, Rik, do you 
> agree?

Though on the read side, for accesses such as numa_scan_seq, we still
want to keep the annotation in order to guarantee that numa_scan_seq is
only loaded once, right?

static void task_numa_placement(struct task_struct *p)
{
    ...

    seq = ACCESS_ONCE(p->mm->numa_scan_seq);
    if (p->numa_scan_seq == seq)
        return;
    p->numa_scan_seq = seq;

    ...
}


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-15  7:46         ` Ingo Molnar
  2015-04-15 18:49           ` Jason Low
  2015-04-16  2:46           ` Jason Low
@ 2015-04-16 16:52           ` Peter Zijlstra
  2015-04-16 18:02             ` Ingo Molnar
  2 siblings, 1 reply; 34+ messages in thread
From: Peter Zijlstra @ 2015-04-16 16:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Mel Gorman, Rik van Riel, Jason Low,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton

On Wed, Apr 15, 2015 at 09:46:01AM +0200, Ingo Molnar wrote:

 > @@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 >  
 >  static void reset_ptenuma_scan(struct task_struct *p)
 >  {
 > -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
 > +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 
vs

	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;


> So the original ACCESS_ONCE() barriers were misguided to begin with: I 
> think they tried to handle races with the scheduler balancing softirq 
> and tried to avoid having to use atomics for the sequence counter 
> (which would be overkill), but things like ACCESS_ONCE(x)++ never 
> guaranteed atomicity (or even coherency) of the update.
> 
> But since in reality this is only statistical sampling code, all these 
> compiler barriers can be removed I think. Peter, Mel, Rik, do you 
> agree?

ACCESS_ONCE() is not a compiler barrier

The 'read' side uses ACCESS_ONCE() for two purposes:
 - to load the value once, we don't want the seq number to change under
   us for obvious reasons
 - to avoid load tearing and observe weird seq numbers

The update side uses ACCESS_ONCE() to avoid write tearing, and strictly
speaking it should also worry about read-tearing since its not hard
serialized, although its very unlikely to actually have concurrency
(IIRC).
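
Put differently, without the annotation nothing stops the compiler from
re-reading the sequence number, e.g. (illustrative):

    /* What a re-load can effectively turn the read side into: */
    if (p->numa_scan_seq == p->mm->numa_scan_seq)	/* load #1 */
            return;
    p->numa_scan_seq = p->mm->numa_scan_seq;	/* load #2, may differ */

    /* With the annotation the value is loaded exactly once: */
    seq = READ_ONCE(p->mm->numa_scan_seq);
    if (p->numa_scan_seq == seq)
            return;
    p->numa_scan_seq = seq;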

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 16:52           ` Peter Zijlstra
@ 2015-04-16 18:02             ` Ingo Molnar
  2015-04-16 18:15               ` Peter Zijlstra
  0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2015-04-16 18:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, Mel Gorman, Rik van Riel, Jason Low,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Wed, Apr 15, 2015 at 09:46:01AM +0200, Ingo Molnar wrote:
> 
>  > @@ -2088,7 +2088,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>  >  
>  >  static void reset_ptenuma_scan(struct task_struct *p)
>  >  {
>  > -	ACCESS_ONCE(p->mm->numa_scan_seq)++;
>  > +	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
>  
> vs
> 
> 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
> 	if (p->numa_scan_seq == seq)
> 		return;
> 	p->numa_scan_seq = seq;
> 
> 
> > So the original ACCESS_ONCE() barriers were misguided to begin with: I 
> > think they tried to handle races with the scheduler balancing softirq 
> > and tried to avoid having to use atomics for the sequence counter 
> > (which would be overkill), but things like ACCESS_ONCE(x)++ never 
> > guaranteed atomicity (or even coherency) of the update.
> > 
> > But since in reality this is only statistical sampling code, all these 
> > compiler barriers can be removed I think. Peter, Mel, Rik, do you 
> > agree?
> 
> ACCESS_ONCE() is not a compiler barrier

It's not a general compiler barrier (and I didn't claim so) but it is 
still a compiler barrier: it's documented as a weak, variable specific 
barrier in Documentation/memory-barriers.txt:

  COMPILER BARRIER
  ----------------

  The Linux kernel has an explicit compiler barrier function that prevents the
  compiler from moving the memory accesses either side of it to the other side:

        barrier();

  This is a general barrier -- there are no read-read or write-write variants
  of barrier().  However, ACCESS_ONCE() can be thought of as a weak form
  for barrier() that affects only the specific accesses flagged by the
  ACCESS_ONCE().

 [...]

> The 'read' side uses ACCESS_ONCE() for two purposes:
>  - to load the value once, we don't want the seq number to change under
>    us for obvious reasons
>  - to avoid load tearing and observe weird seq numbers
> 
> The update side uses ACCESS_ONCE() to avoid write tearing, and 
> strictly speaking it should also worry about read-tearing since its 
> not hard serialized, although its very unlikely to actually have 
> concurrency (IIRC).

So what bad effects can there be from the very unlikely read and write 
tearing?

AFAICS nothing particularly bad. On the read side:

        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;

If p->mm->numa_scan_seq gets loaded twice (very unlikely) and the two 
loads return different values, then we might get a 'double' NUMA 
placement run - i.e. statistical noise.

On the update side:

        ACCESS_ONCE(p->mm->numa_scan_seq)++;
        p->mm->numa_scan_offset = 0;

If the compiler tears that up we might skip an update - again 
statistical noise at worst.

Nor is compiler tearing the only theoretical worry here: in theory, 
with long cache coherency latencies we might get two updates 'mixed 
up', resulting in a (single) missed update.

Only atomics would solve all the races, but I think that would be 
overdoing it.

This is what I meant by that there's no harm from this race.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 18:02             ` Ingo Molnar
@ 2015-04-16 18:15               ` Peter Zijlstra
  2015-04-16 18:24                 ` Ingo Molnar
  2015-04-16 21:00                 ` Jason Low
  0 siblings, 2 replies; 34+ messages in thread
From: Peter Zijlstra @ 2015-04-16 18:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Mel Gorman, Rik van Riel, Jason Low,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton

On Thu, Apr 16, 2015 at 08:02:27PM +0200, Ingo Molnar wrote:
> > ACCESS_ONCE() is not a compiler barrier
> 
> It's not a general compiler barrier (and I didn't claim so) but it is 
> still a compiler barrier: it's documented as a weak, variable specific 
> barrier in Documentation/memory-barriers.txt:

Ok, fair enough. I just don't generally think of them as 'barriers'.

> > The 'read' side uses ACCESS_ONCE() for two purposes:
> >  - to load the value once, we don't want the seq number to change under
> >    us for obvious reasons
> >  - to avoid load tearing and observe weird seq numbers
> > 
> > The update side uses ACCESS_ONCE() to avoid write tearing, and 
> > strictly speaking it should also worry about read-tearing since its 
> > not hard serialized, although its very unlikely to actually have 
> > concurrency (IIRC).

> This is what I meant by that there's no harm from this race.

Ok, but I would still like to preserve the READ one on the usage site
and the WRITE one on the update side, if only as documentation that
there's something 'special' happening.

And while the effects here might end up being statistical noise, I have
conceptual problems with re-reading seq counts, that's not proper.

And its not like they really cost anything.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 18:15               ` Peter Zijlstra
@ 2015-04-16 18:24                 ` Ingo Molnar
  2015-04-16 19:02                   ` Peter Zijlstra
  2015-04-17  3:25                   ` Jason Low
  2015-04-16 21:00                 ` Jason Low
  1 sibling, 2 replies; 34+ messages in thread
From: Ingo Molnar @ 2015-04-16 18:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Steven Rostedt, Mel Gorman, Rik van Riel, Jason Low,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, Apr 16, 2015 at 08:02:27PM +0200, Ingo Molnar wrote:
> > > ACCESS_ONCE() is not a compiler barrier
> > 
> > It's not a general compiler barrier (and I didn't claim so) but it is 
> > still a compiler barrier: it's documented as a weak, variable specific 
> > barrier in Documentation/memory-barriers.txt:
> 
> Ok, fair enough. I just don't generally think of them as 'barriers'.
> 
> > > The 'read' side uses ACCESS_ONCE() for two purposes:
> > >  - to load the value once, we don't want the seq number to change under
> > >    us for obvious reasons
> > >  - to avoid load tearing and observe weird seq numbers
> > > 
> > > The update side uses ACCESS_ONCE() to avoid write tearing, and 
> > > strictly speaking it should also worry about read-tearing since its 
> > > not hard serialized, although its very unlikely to actually have 
> > > concurrency (IIRC).
> 
> > This is what I meant by that there's no harm from this race.
> 
> Ok, but I would still like to preserve the READ one on the usage 
> site and the WRITE one on the update side, if only as documentation 
> that there's something 'special' happening.
> 
> And while the effects here might end up being statistical noise, I 
> have conceptual problems with re-reading seq counts, that's not 
> proper.

Yes ... but that still leaves this weird feeling that it's really 
still a bit wrong because it's not proper parallel code, we just 
reduced the probability of the remaining races radically. And it's not 
like GCC (or any compiler) does load tearing or even store tearing 
under normal -O2 for such code patterns, right?

> 
> And its not like they really cost anything.

That's true.

Would it make sense to add a few comments to the seq field definition 
site(s), about how it's supposed to be accessed - or to the 
READ_ONCE()/WRITE_ONCE() sites, to keep people from wondering?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 18:24                 ` Ingo Molnar
@ 2015-04-16 19:02                   ` Peter Zijlstra
  2015-04-16 19:41                     ` Paul E. McKenney
  2015-04-17  3:25                   ` Jason Low
  1 sibling, 1 reply; 34+ messages in thread
From: Peter Zijlstra @ 2015-04-16 19:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Mel Gorman, Rik van Riel, Jason Low,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton

On Thu, Apr 16, 2015 at 08:24:27PM +0200, Ingo Molnar wrote:
> Yes ... but that still leaves this weird feeling that it's really 
> still a bit wrong because it's not proper parallel code, we just 
> reduced the probability of the remaining races radically. And it's not 
> like GCC (or any compiler) does load tearing or even store tearing 
> under normal -O2 for such code patterns, right?

I think Paul once caught GCC doing something silly, but typically no.
The re-loads however have been frequently observed.

> > And its not like they really cost anything.
> 
> That's true.
> 
> Would it make sense to add a few comments to the seq field definition 
> site(s), about how it's supposed to be accessed - or to the 
> READ_ONCE()/WRITE_ONCE() sites, to keep people from wondering?

For sure, can do a comment no problem.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 19:02                   ` Peter Zijlstra
@ 2015-04-16 19:41                     ` Paul E. McKenney
  0 siblings, 0 replies; 34+ messages in thread
From: Paul E. McKenney @ 2015-04-16 19:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Steven Rostedt, Mel Gorman, Rik van Riel, Jason Low,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Mel Gorman,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton

On Thu, Apr 16, 2015 at 09:02:08PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 16, 2015 at 08:24:27PM +0200, Ingo Molnar wrote:
> > Yes ... but that still leaves this weird feeling that it's really 
> > still a bit wrong because it's not proper parallel code, we just 
> > reduced the probability of the remaining races radically. And it's not 
> > like GCC (or any compiler) does load tearing or even store tearing 
> > under normal -O2 for such code patterns, right?
> 
> I think Paul once caught GCC doing something silly, but typically no.
> The re-loads however have been frequently observed.

Too true!

Some architectures do split stores of constants.  For example, given
an architecture with a store-immediate instruction with (say) a four-bit 
immediate field, gcc can compile this:

	x = 0x00020008;

to something like:

	st $2, (x+2)
	st $8, (x)

And gcc was doing this even though the store to x had volatile semantics,
a bug which has thankfully since been fixed.

But then again, I am paranoid.  So I would not put it past gcc to think
to itself "Hmmm...  I just loaded x a few instructions back, and only
clobbered the low-order byte.  So I will just reload that byte into
low-order byte of the register containing the remnants of the previous
load."

No, I have never seen gcc do that, but a C compiler could do that and
still claim to be complying with the standard.  :-/

							Thanx, Paul

> > > And its not like they really cost anything.
> > 
> > That's true.
> > 
> > Would it make sense to add a few comments to the seq field definition 
> > site(s), about how it's supposed to be accessed - or to the 
> > READ_ONCE()/WRITE_ONCE() sites, to keep people from wondering?
> 
> For sure, can do a comment no problem.
> 


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 18:15               ` Peter Zijlstra
  2015-04-16 18:24                 ` Ingo Molnar
@ 2015-04-16 21:00                 ` Jason Low
  1 sibling, 0 replies; 34+ messages in thread
From: Jason Low @ 2015-04-16 21:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Steven Rostedt, Mel Gorman, Rik van Riel,
	Linus Torvalds, Thomas Gleixner, linux-kernel, Paul E. McKenney,
	Andrew Morton, Oleg Nesterov, Mike Galbraith,
	Frederic Weisbecker, Mel Gorman, Preeti U Murthy, hideaki.kimura,
	Aswin Chandramouleeswaran, Scott J Norton, jason.low2

On Thu, 2015-04-16 at 20:15 +0200, Peter Zijlstra wrote:
> On Thu, Apr 16, 2015 at 08:02:27PM +0200, Ingo Molnar wrote:
> > > ACCESS_ONCE() is not a compiler barrier
> > 
> > It's not a general compiler barrier (and I didn't claim so) but it is 
> > still a compiler barrier: it's documented as a weak, variable specific 
> > barrier in Documentation/memory-barriers.txt:
> 
> Ok, fair enough. I just don't generally think of them as 'barriers'.
> 
> > > The 'read' side uses ACCESS_ONCE() for two purposes:
> > >  - to load the value once, we don't want the seq number to change under
> > >    us for obvious reasons
> > >  - to avoid load tearing and observe weird seq numbers
> > > 
> > > The update side uses ACCESS_ONCE() to avoid write tearing, and 
> > > strictly speaking it should also worry about read-tearing since its 
> > > not hard serialized, although its very unlikely to actually have 
> > > concurrency (IIRC).
> 
> > This is what I meant by that there's no harm from this race.
> 
> Ok, but I would still like to preserve the READ one on the usage site
> and the WRITE one on the update side, if only as documentation that
> there's something 'special' happening.

In that case, in our patch 2, I suppose we also want to use READ_ONCE()
when accessing the running field, which also helps document that we're
reading from and writing to a non-atomic value without holding a lock.
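
Sketch of what that would document on both sides:

    WRITE_ONCE(cputimer->running, 0);	/* update side, stop_process_timers() */

    if (!READ_ONCE(cputimer->running))	/* read side, e.g. cputimer_running() */
            return 0;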

> And while the effects here might end up being statistical noise, I have
> conceptual problems with re-reading seq counts, that's not proper.
> 
> And its not like they really cost anything.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-16 18:24                 ` Ingo Molnar
  2015-04-16 19:02                   ` Peter Zijlstra
@ 2015-04-17  3:25                   ` Jason Low
  2015-04-17  8:19                     ` Ingo Molnar
  1 sibling, 1 reply; 34+ messages in thread
From: Jason Low @ 2015-04-17  3:25 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Steven Rostedt, Rik van Riel, Linus Torvalds,
	Thomas Gleixner, linux-kernel, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Mel Gorman,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton, jason.low2

On Thu, 2015-04-16 at 20:24 +0200, Ingo Molnar wrote:
> Would it make sense to add a few comments to the seq field definition 
> site(s), about how it's supposed to be accessed - or to the 
> READ_ONCE()/WRITE_ONCE() sites, to keep people from wondering?

How about this:

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5a44371..63fa87f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
+	/*
+	 * The p->mm->numa_scan_seq gets updated without
+	 * exclusive access. Use READ_ONCE() here to ensure
+	 * that the field is read in a single access.
+	 */
 	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
@@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
+	/*
+	 * We only did a read acquisition of the mmap sem, so
+	 * p->mm->numa_scan_seq is written to without exclusive access.
+	 * That's not much of an issue though, since this is just used
+	 * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
+	 * are not expensive, to avoid load/store tearing.
+	 */
 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }



^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler
  2015-04-17  3:25                   ` Jason Low
@ 2015-04-17  8:19                     ` Ingo Molnar
  0 siblings, 0 replies; 34+ messages in thread
From: Ingo Molnar @ 2015-04-17  8:19 UTC (permalink / raw)
  To: Jason Low
  Cc: Peter Zijlstra, Steven Rostedt, Rik van Riel, Linus Torvalds,
	Thomas Gleixner, linux-kernel, Paul E. McKenney, Andrew Morton,
	Oleg Nesterov, Mike Galbraith, Frederic Weisbecker, Mel Gorman,
	Preeti U Murthy, hideaki.kimura, Aswin Chandramouleeswaran,
	Scott J Norton


* Jason Low <jason.low2@hp.com> wrote:

> On Thu, 2015-04-16 at 20:24 +0200, Ingo Molnar wrote:
> > Would it make sense to add a few comments to the seq field definition 
> > site(s), about how it's supposed to be accessed - or to the 
> > READ_ONCE()/WRITE_ONCE() sites, to keep people from wondering?
> 
> How about this:
> 
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5a44371..63fa87f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1794,6 +1794,11 @@ static void task_numa_placement(struct task_struct *p)
>  	u64 runtime, period;
>  	spinlock_t *group_lock = NULL;
>  
> +	/*
> +	 * The p->mm->numa_scan_seq gets updated without
> +	 * exclusive access. Use READ_ONCE() here to ensure
> +	 * that the field is read in a single access.
> +	 */
>  	seq = READ_ONCE(p->mm->numa_scan_seq);
>  	if (p->numa_scan_seq == seq)
>  		return;
> @@ -2107,6 +2112,13 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
>  
>  static void reset_ptenuma_scan(struct task_struct *p)
>  {
> +	/*
> +	 * We only did a read acquisition of the mmap sem, so
> +	 * p->mm->numa_scan_seq is written to without exclusive access.
> +	 * That's not much of an issue though, since this is just used
> +	 * for statistical sampling. Use WRITE_ONCE and READ_ONCE, which
> +	 * are not expensive, to avoid load/store tearing.
> +	 */
>  	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
>  	p->mm->numa_scan_offset = 0;
>  }

Perfect! It just needs a changelog and a SOB.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2015-04-17  8:19 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-14 23:09 [PATCH 0/3] sched, timer: Improve scalability of itimers Jason Low
2015-04-14 23:09 ` [PATCH 1/3] sched, timer: Remove usages of ACCESS_ONCE in the scheduler Jason Low
2015-04-14 23:59   ` Steven Rostedt
2015-04-15  2:12     ` Jason Low
2015-04-15  2:40       ` Steven Rostedt
2015-04-15  7:46         ` Ingo Molnar
2015-04-15 18:49           ` Jason Low
2015-04-15 19:16             ` Steven Rostedt
2015-04-16  2:46           ` Jason Low
2015-04-16 16:52           ` Peter Zijlstra
2015-04-16 18:02             ` Ingo Molnar
2015-04-16 18:15               ` Peter Zijlstra
2015-04-16 18:24                 ` Ingo Molnar
2015-04-16 19:02                   ` Peter Zijlstra
2015-04-16 19:41                     ` Paul E. McKenney
2015-04-17  3:25                   ` Jason Low
2015-04-17  8:19                     ` Ingo Molnar
2015-04-16 21:00                 ` Jason Low
2015-04-16  2:29         ` Jason Low
2015-04-16  2:37           ` Steven Rostedt
2015-04-14 23:09 ` [PATCH 2/3] sched, timer: Use atomics for thread_group_cputimer to improve scalability Jason Low
2015-04-15  7:33   ` Ingo Molnar
2015-04-15  7:35     ` Ingo Molnar
2015-04-15 17:14       ` Jason Low
2015-04-15 10:37   ` Preeti U Murthy
2015-04-15 19:09     ` Jason Low
2015-04-15 13:25   ` Frederic Weisbecker
2015-04-15 13:32     ` Peter Zijlstra
2015-04-15 20:04       ` Jason Low
2015-04-15 14:23   ` Davidlohr Bueso
2015-04-15 21:15     ` Jason Low
2015-04-14 23:09 ` [PATCH 3/3] sched, timer: Use cmpxchg to do updates in update_gt_cputime() Jason Low
2015-04-14 23:53 ` [PATCH 0/3] sched, timer: Improve scalability of itimers Linus Torvalds
2015-04-15  7:24   ` Ingo Molnar
