* [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
@ 2016-08-09  9:51 Peter Zijlstra
  2016-08-09 23:47 ` John Stultz
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Peter Zijlstra @ 2016-08-09  9:51 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: John Stultz, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos


Currently the percpu-rwsem switches to (global) atomic ops while a
writer is waiting; which could be quite a while and slows down
releasing the readers.

This patch cures this problem by ordering the reader-state vs
reader-count (see the comments in __percpu_down_read() and
percpu_down_write()). This changes a global atomic op into a full
memory barrier, which doesn't have the global cacheline contention.

This also enables using the percpu-rwsem with rcu_sync disabled in order
to bias the implementation differently, reducing the writer latency by
adding some cost to readers.
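
For illustration only (this sketch is not part of the patch and the foo_*
names are hypothetical; only the percpu-rwsem calls themselves come from
this series), typical usage looks like this. Readers pay only a per-CPU
counter update while no writer is around; once a writer is pending they
take a full memory barrier instead of a global atomic op, and the writer
pays the rcu_sync grace period and then waits for the per-CPU read counts
to drain:

  #include <linux/percpu-rwsem.h>
  #include <linux/init.h>

  /* Hypothetical state protected by a percpu-rwsem. */
  static struct percpu_rw_semaphore foo_rwsem;
  static int foo_config;

  static int __init foo_setup(void)
  {
  	/* Allocates ->read_count and initializes the rcu_sync state. */
  	return percpu_init_rwsem(&foo_rwsem);
  }
  core_initcall(foo_setup);

  /*
   * Hot path: many concurrent readers, no global atomics or contended
   * cachelines while no writer is pending (rcu_sync idle).
   */
  static int foo_read(void)
  {
  	int val;

  	percpu_down_read(&foo_rwsem);	/* per-CPU increment on the fast path */
  	val = foo_config;
  	percpu_up_read(&foo_rwsem);	/* per-CPU decrement on the same CPU */

  	return val;
  }

  /*
   * Cold path: waits for an RCU-sched grace period, then for all active
   * readers to drain (readers_active_check()).
   */
  static void foo_update(int val)
  {
  	percpu_down_write(&foo_rwsem);
  	foo_config = val;
  	percpu_up_write(&foo_rwsem);
  }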

Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h  |   84 +++++++++++++--
 kernel/locking/percpu-rwsem.c |  228 ++++++++++++++++++++++++------------------
 2 files changed, 206 insertions(+), 106 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -10,30 +10,96 @@
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
-	unsigned int __percpu	*fast_read_ctr;
+	unsigned int __percpu	*read_count;
 	struct rw_semaphore	rw_sem;
-	atomic_t		slow_read_ctr;
-	wait_queue_head_t	write_waitq;
+	wait_queue_head_t	writer;
+	int			readers_block;
 };
 
-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+	might_sleep();
+
+	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+	preempt_disable();
+	/*
+	 * We are in an RCU-sched read-side critical section, so the writer
+	 * cannot both leave the rcu_sync idle state (forcing us onto the slow
+	 * path) and start checking counters while we are here. So if we see
+	 * rcu_sync_is_idle(), we know that the writer won't be checking
+	 * counters until we're past the preempt_enable() and that once the
+	 * synchronize_sched() is done, the writer will see anything we did
+	 * within this RCU-sched read-side critical section.
+	 */
+	__this_cpu_inc(*sem->read_count);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		__percpu_down_read(sem, false); /* Unconditional memory barrier */
+	preempt_enable();
+	/*
+	 * The barrier() from preempt_enable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+}
+
+static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+	int ret = 1;
+
+	preempt_disable();
+	/*
+	 * Same as in percpu_down_read().
+	 */
+	__this_cpu_inc(*sem->read_count);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
+	preempt_enable();
+	/*
+	 * The barrier() from preempt_enable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+
+	if (ret)
+		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	/*
+	 * The barrier() in preempt_disable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+	preempt_disable();
+	/*
+	 * Same as in percpu_down_read().
+	 */
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_dec(*sem->read_count);
+	else
+		__percpu_up_read(sem); /* Unconditional memory barrier */
+	preempt_enable();
+
+	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
 				const char *, struct lock_class_key *);
+
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
-#define percpu_init_rwsem(brw)	\
+#define percpu_init_rwsem(sem)					\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
-
 #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,152 +8,186 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *rwsem_key)
 {
-	brw->fast_read_ctr = alloc_percpu(int);
-	if (unlikely(!brw->fast_read_ctr))
+	sem->read_count = alloc_percpu(int);
+	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
-	__init_rwsem(&brw->rw_sem, name, rwsem_key);
-	rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
-	atomic_set(&brw->slow_read_ctr, 0);
-	init_waitqueue_head(&brw->write_waitq);
+	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	__init_rwsem(&sem->rw_sem, name, rwsem_key);
+	init_waitqueue_head(&sem->writer);
+	sem->readers_block = 0;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
 
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 {
 	/*
 	 * XXX: temporary kludge. The error path in alloc_super()
 	 * assumes that percpu_free_rwsem() is safe after kzalloc().
 	 */
-	if (!brw->fast_read_ctr)
+	if (!sem->read_count)
 		return;
 
-	rcu_sync_dtor(&brw->rss);
-	free_percpu(brw->fast_read_ctr);
-	brw->fast_read_ctr = NULL; /* catch use after free bugs */
+	rcu_sync_dtor(&sem->rss);
+	free_percpu(sem->read_count);
+	sem->read_count = NULL; /* catch use after free bugs */
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 {
-	bool success;
+	/*
+	 * Due to having preemption disabled the decrement happens on
+	 * the same CPU as the increment, avoiding the
+	 * increment-on-one-CPU-and-decrement-on-another problem.
+	 *
+	 * If the reader misses the writer's assignment of readers_block, then
+	 * the writer is guaranteed to see the reader's increment.
+	 *
+	 * Conversely, any readers that increment their sem->read_count after
+	 * the writer looks are guaranteed to see the readers_block value,
+	 * which in turn means that they are guaranteed to immediately
+	 * decrement their sem->read_count, so that it doesn't matter that the
+	 * writer missed them.
+	 */
 
-	preempt_disable();
-	success = rcu_sync_is_idle(&brw->rss);
-	if (likely(success))
-		__this_cpu_add(*brw->fast_read_ctr, val);
-	preempt_enable();
+	smp_mb(); /* A matches D */
 
-	return success;
-}
+	/*
+	 * If !readers_block the critical section starts here, matched by the
+	 * release in percpu_up_write().
+	 */
+	if (likely(!smp_load_acquire(&sem->readers_block)))
+		return 1;
 
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
-	might_sleep();
-	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+	/*
+	 * Per the above comment; we still have preemption disabled and
+	 * will thus decrement on the same CPU as we incremented.
+	 */
+	__percpu_up_read(sem);
 
-	if (likely(update_fast_ctr(brw, +1)))
-		return;
+	if (try)
+		return 0;
 
-	/* Avoid rwsem_acquire_read() and rwsem_release() */
-	__down_read(&brw->rw_sem);
-	atomic_inc(&brw->slow_read_ctr);
-	__up_read(&brw->rw_sem);
-}
-EXPORT_SYMBOL_GPL(percpu_down_read);
-
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
-{
-	if (unlikely(!update_fast_ctr(brw, +1))) {
-		if (!__down_read_trylock(&brw->rw_sem))
-			return 0;
-		atomic_inc(&brw->slow_read_ctr);
-		__up_read(&brw->rw_sem);
-	}
+	/*
+	 * We either call schedule() in the wait, or we'll fall through
+	 * and reschedule on the preempt_enable() in percpu_down_read().
+	 */
+	preempt_enable_no_resched();
 
-	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+	/*
+	 * Avoid lockdep for the down/up_read(); we already have them.
+	 */
+	__down_read(&sem->rw_sem);
+	this_cpu_inc(*sem->read_count);
+	__up_read(&sem->rw_sem);
+
+	preempt_disable();
 	return 1;
 }
+EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
-	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
-	if (likely(update_fast_ctr(brw, -1)))
-		return;
+	smp_mb(); /* B matches C */
+	/*
+	 * In other words, if they see our decrement (presumably to aggregate
+	 * zero, as that is the only time it matters) they will also see our
+	 * critical section.
+	 */
+	__this_cpu_dec(*sem->read_count);
 
-	/* false-positive is possible but harmless */
-	if (atomic_dec_and_test(&brw->slow_read_ctr))
-		wake_up_all(&brw->write_waitq);
+	/* Prod writer to recheck readers_active */
+	wake_up(&sem->writer);
 }
-EXPORT_SYMBOL_GPL(percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_up_read);
 
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+#define per_cpu_sum(var)						\
+({									\
+	typeof(var) __sum = 0;						\
+	int cpu;							\
+	compiletime_assert_atomic_type(__sum);				\
+	for_each_possible_cpu(cpu)					\
+		__sum += per_cpu(var, cpu);				\
+	__sum;								\
+})
+
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero.  If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
-	unsigned int sum = 0;
-	int cpu;
+	if (per_cpu_sum(*sem->read_count) != 0)
+		return false;
 
-	for_each_possible_cpu(cpu) {
-		sum += per_cpu(*brw->fast_read_ctr, cpu);
-		per_cpu(*brw->fast_read_ctr, cpu) = 0;
-	}
+	/*
+	 * If we observed the decrement; ensure we see the entire critical
+	 * section.
+	 */
+
+	smp_mb(); /* C matches B */
 
-	return sum;
+	return true;
 }
 
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	/* Notify readers to take the slow path. */
+	rcu_sync_enter(&sem->rss);
+
+	down_write(&sem->rw_sem);
+
 	/*
-	 * Make rcu_sync_is_idle() == F and thus disable the fast-path in
-	 * percpu_down_read() and percpu_up_read(), and wait for gp pass.
-	 *
-	 * The latter synchronises us with the preceding readers which used
-	 * the fast-past, so we can not miss the result of __this_cpu_add()
-	 * or anything else inside their criticial sections.
+	 * Notify new readers to block; up until now, and thus throughout the
+	 * longish rcu_sync_enter() above, new readers could still come in.
 	 */
-	rcu_sync_enter(&brw->rss);
+	WRITE_ONCE(sem->readers_block, 1);
 
-	/* exclude other writers, and block the new readers completely */
-	down_write(&brw->rw_sem);
+	smp_mb(); /* D matches A */
 
-	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
-	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+	/*
+	 * If they don't see our write of readers_block, then we are
+	 * guaranteed to see their sem->read_count increment, and therefore
+	 * will wait for them.
+	 */
 
-	/* wait for all readers to complete their percpu_up_read() */
-	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+	/* Wait for all now active readers to complete. */
+	wait_event(sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
-	/* release the lock, but the readers can't use the fast-path */
-	up_write(&brw->rw_sem);
 	/*
-	 * Enable the fast-path in percpu_down_read() and percpu_up_read()
-	 * but only after another gp pass; this adds the necessary barrier
-	 * to ensure the reader can't miss the changes done by us.
+	 * Signal the writer is done, no fast path yet.
+	 *
+	 * One reason that we cannot just immediately re-enable the reader
+	 * fast path is that new readers might fail to see the results of
+	 * this writer's critical section.
+	 *
+	 * Therefore we force it through the slow path which guarantees an
+	 * acquire and thereby guarantees the critical section's consistency.
+	 */
+	smp_store_release(&sem->readers_block, 0);
+
+	/*
+	 * Release the write lock, this will allow readers back in the game.
+	 */
+	up_write(&sem->rw_sem);
+
+	/*
+	 * Once this completes (at least one RCU-sched grace period hence) the
+	 * reader fast path will be available again. Safe to use outside the
+	 * exclusive write lock because rcu_sync_exit() is counting.
 	 */
-	rcu_sync_exit(&brw->rss);
+	rcu_sync_exit(&sem->rss);
 }
 EXPORT_SYMBOL_GPL(percpu_up_write);

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-09  9:51 [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2016-08-09 23:47 ` John Stultz
  2016-08-10  7:56   ` Peter Zijlstra
  2016-08-11 16:54 ` [PATCH] cgroup: avoid synchronize_sched() in __cgroup_procs_write() Oleg Nesterov
  2016-08-31  5:21 ` [v2] locking/percpu-rwsem: Optimize readers and reduce global impact Guenter Roeck
  2 siblings, 1 reply; 19+ messages in thread
From: John Stultz @ 2016-08-09 23:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On Tue, Aug 9, 2016 at 2:51 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> Currently the percpu-rwsem switches to (global) atomic ops while a
> writer is waiting; which could be quite a while and slows down
> releasing the readers.
>
> This patch cures this problem by ordering the reader-state vs
> reader-count (see the comments in __percpu_down_read() and
> percpu_down_write()). This changes a global atomic op into a full
> memory barrier, which doesn't have the global cacheline contention.
>
> This also enables using the percpu-rwsem with rcu_sync disabled in order
> to bias the implementation differently, reducing the writer latency by
> adding some cost to readers.

So this by itself doesn't help us much, but including the following
from Oleg does help quite a bit:

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index db27804..9e9200b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5394,6 +5394,8 @@ int __init cgroup_init(void)
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));

+       rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+
        mutex_lock(&cgroup_mutex);

        /* Add init_css_set to the hash table */


thanks
-john

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-09 23:47 ` John Stultz
@ 2016-08-10  7:56   ` Peter Zijlstra
  2016-08-10 17:09     ` Oleg Nesterov
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2016-08-10  7:56 UTC (permalink / raw)
  To: John Stultz
  Cc: Oleg Nesterov, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On Tue, Aug 09, 2016 at 04:47:38PM -0700, John Stultz wrote:
> On Tue, Aug 9, 2016 at 2:51 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Currently the percpu-rwsem switches to (global) atomic ops while a
> > writer is waiting; which could be quite a while and slows down
> > releasing the readers.
> >
> > This patch cures this problem by ordering the reader-state vs
> > reader-count (see the comments in __percpu_down_read() and
> > percpu_down_write()). This changes a global atomic op into a full
> > memory barrier, which doesn't have the global cacheline contention.
> >
> > This also enables using the percpu-rwsem with rcu_sync disabled in order
> > to bias the implementation differently, reducing the writer latency by
> > adding some cost to readers.
> 
> So this by itself doesn't help us much, but including the following
> from Oleg does help quite a bit:

Correct, Oleg was going to send his rcu_sync rework on top of this. But
since it's holiday season, things might be a tad delayed.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-10  7:56   ` Peter Zijlstra
@ 2016-08-10 17:09     ` Oleg Nesterov
       [not found]       ` <CAH7ZN-zXrenrbbcmvZ+biNozYe21jw6fULopG=g9-xRwWHE6nw@mail.gmail.com>
  0 siblings, 1 reply; 19+ messages in thread
From: Oleg Nesterov @ 2016-08-10 17:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: John Stultz, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On 08/10, Peter Zijlstra wrote:
>
> On Tue, Aug 09, 2016 at 04:47:38PM -0700, John Stultz wrote:
> > On Tue, Aug 9, 2016 at 2:51 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Currently the percpu-rwsem switches to (global) atomic ops while a
> > > writer is waiting; which could be quite a while and slows down
> > > releasing the readers.
> > >
> > > This patch cures this problem by ordering the reader-state vs
> > > reader-count (see the comments in __percpu_down_read() and
> > > percpu_down_write()). This changes a global atomic op into a full
> > > memory barrier, which doesn't have the global cacheline contention.
> > >
> > > This also enables using the percpu-rwsem with rcu_sync disabled in order
> > > to bias the implementation differently, reducing the writer latency by
> > > adding some cost to readers.
> >
> > So this by itself doesn't help us much, but including the following
> > from Oleg does help quite a bit:
>
> Correct, Oleg was going to send his rcu_sync rework on top of this. But
> since it's holiday season, things might be a tad delayed.

Yeees. The patch is ready and even seems to work, but I still can't (re)write
the comments. Will try to finish tomorrow.

But. We need something simple and backportable, so I am going to send the
patch below first. As you can see this is your "sabotage" change, just
the new helper was renamed + s/!GP_IDLE/GP_PASSED/ fix. And the only reason
I can't send it today is that somehow I can't write the changelog ;)

So I would be really happy if you send this change instead of me, I am going
to keep "From: peterz" anyway.

Oleg.


diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index a63a33e..ece7ed9 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 }
 
 extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff0..614f9cd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5609,6 +5609,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
 	get_user_ns(init_cgroup_ns.user_ns);
 
 	mutex_lock(&cgroup_mutex);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9..c9b7bc8 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -83,6 +83,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() pairs
+ * turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+	rsp->gp_count++;
+	rsp->gp_state = GP_PASSED;
+}
+
+/**
  * rcu_sync_enter() - Force readers onto slowpath
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH] cgroup: avoid synchronize_sched() in __cgroup_procs_write()
  2016-08-09  9:51 [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2016-08-09 23:47 ` John Stultz
@ 2016-08-11 16:54 ` Oleg Nesterov
  2016-08-18 10:59   ` [tip:locking/core] locking, rcu, cgroup: Avoid " tip-bot for Peter Zijlstra
  2016-08-18 13:41   ` tip-bot for Peter Zijlstra
  2016-08-31  5:21 ` [v2] locking/percpu-rwsem: Optimize readers and reduce global impact Guenter Roeck
  2 siblings, 2 replies; 19+ messages in thread
From: Oleg Nesterov @ 2016-08-11 16:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: John Stultz, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

From: Peter Zijlstra <peterz@infradead.org>

The current percpu-rwsem read side is entirely free of serializing insns
at the cost of having a synchronize_sched() in the write path.

The latency of the synchronize_sched() is too high for cgroups. The
commit 1ed1328792ff talks about the write path being a fairly cold path
but this is not the case for Android, which moves tasks to the foreground
cgroup and back around binder IPC calls from foreground processes to
background processes, so it is significantly hotter than human initiated
operations.

Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the
problem, hopefully it should not be that slow after another commit
80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global
impact").

We could just add rcu_sync_enter() into cgroup_init() but we do not want
another synchronize_sched() at boot time, so this patch adds the new helper
which doesn't block but currently can only be called before the first use.

Cc: Tejun Heo <tj@kernel.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: John Stultz <john.stultz@linaro.org>
Reported-by: Dmitry Shmidt <dimitrysh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/rcu_sync.h |  1 +
 kernel/cgroup.c          |  6 ++++++
 kernel/rcu/sync.c        | 12 ++++++++++++
 3 files changed, 19 insertions(+)

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index a63a33e..ece7ed9 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 }
 
 extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75c0ff0..8b79751 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5609,6 +5609,12 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+	/*
+	 * The latency of the synchronize_sched() is too high for cgroups,
+	 * avoid it at the cost of forcing all readers into the slow path.
+	 */
+	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
 	get_user_ns(init_cgroup_ns.user_ns);
 
 	mutex_lock(&cgroup_mutex);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9..e358313 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -83,6 +83,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+	rsp->gp_count++;
+	rsp->gp_state = GP_PASSED;
+}
+
+/**
  * rcu_sync_enter() - Force readers onto slowpath
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
       [not found]         ` <5a2cc178ee03466fa3b104f8f28b44ff@NASANEXM02C.na.qualcomm.com>
@ 2016-08-13  1:44           ` Om Dhyade
  2016-08-24 21:16             ` John Stultz
  0 siblings, 1 reply; 19+ messages in thread
From: Om Dhyade @ 2016-08-13  1:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: John Stultz, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

Thank you, Dmitry, for sharing the patches.

Update from my tests:
Use-case: Android application launches.

I tested the patches on an Android N build; I see max latency ~7ms.
In my tests, the wait is due to copy_process() (fork.c) blocking all threads
in __cgroup_procs_write(), including threads which are not part of the
forking process's thread-group.

Dmitry had provided a hack patch which reverts to a per-process rw-sem,
which had max latency of ~2ms.

The Android user-space binder library does 2 cgroup write operations per
transaction; apart from the copy_process() (fork.c) wait, I see preemption
in __cgroup_procs_write() causing waits.

Thanks.

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [tip:locking/core] locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write()
  2016-08-11 16:54 ` [PATCH] cgroup: avoid synchronize_sched() in __cgroup_procs_write() Oleg Nesterov
@ 2016-08-18 10:59   ` tip-bot for Peter Zijlstra
  2016-08-18 13:41   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 19+ messages in thread
From: tip-bot for Peter Zijlstra @ 2016-08-18 10:59 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: torvalds, akpm, tglx, romlem, peterz, hpa, dimitrysh,
	john.stultz, tkjos, mingo, ccross, tj, paulmck, linux-kernel,
	oleg

Commit-ID:  64ca3c072e18deaee2a3716d460bf3d72aac242f
Gitweb:     http://git.kernel.org/tip/64ca3c072e18deaee2a3716d460bf3d72aac242f
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Aug 2016 18:54:13 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 18 Aug 2016 11:34:25 +0200

locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write()

The current percpu-rwsem read side is entirely free of serializing insns
at the cost of having a synchronize_sched() in the write path.

The latency of the synchronize_sched() is too high for cgroups. The
commit 1ed1328792ff talks about the write path being a fairly cold path
but this is not the case for Android which moves task to the foreground
cgroup and back around binder IPC calls from foreground processes to
background processes, so it is significantly hotter than human initiated
operations.

Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the
problem, hopefully it should not be that slow after another commit:

  80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact").

We could just add rcu_sync_enter() into cgroup_init() but we do not want
another synchronize_sched() at boot time, so this patch adds the new helper
which doesn't block but currently can only be called before the first use.

Reported-by: John Stultz <john.stultz@linaro.org>
Reported-by: Dmitry Shmidt <dimitrysh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Colin Cross <ccross@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rom Lemarchand <romlem@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Link: http://lkml.kernel.org/r/20160811165413.GA22807@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rcu_sync.h |  1 +
 kernel/cgroup.c          |  6 ++++++
 kernel/rcu/sync.c        | 12 ++++++++++++
 3 files changed, 19 insertions(+)

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index a63a33e..ece7ed9 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 }
 
 extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7..9f51cdf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5606,6 +5606,12 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+	/*
+	 * The latency of the synchronize_sched() is too high for cgroups,
+	 * avoid it at the cost of forcing all readers into the slow path.
+	 */
+	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
 	get_user_ns(init_cgroup_ns.user_ns);
 
 	mutex_lock(&cgroup_mutex);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9..e358313 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -83,6 +83,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+	rsp->gp_count++;
+	rsp->gp_state = GP_PASSED;
+}
+
+/**
  * rcu_sync_enter() - Force readers onto slowpath
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [tip:locking/core] locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write()
  2016-08-11 16:54 ` [PATCH] cgroup: avoid synchronize_sched() in __cgroup_procs_write() Oleg Nesterov
  2016-08-18 10:59   ` [tip:locking/core] locking, rcu, cgroup: Avoid " tip-bot for Peter Zijlstra
@ 2016-08-18 13:41   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 19+ messages in thread
From: tip-bot for Peter Zijlstra @ 2016-08-18 13:41 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: peterz, romlem, torvalds, mingo, john.stultz, ccross,
	linux-kernel, oleg, dimitrysh, paulmck, tglx, akpm, hpa, tkjos,
	tj

Commit-ID:  3942a9bd7b5842a924e99ee6ec1350b8006c94ec
Gitweb:     http://git.kernel.org/tip/3942a9bd7b5842a924e99ee6ec1350b8006c94ec
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Aug 2016 18:54:13 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 18 Aug 2016 15:36:59 +0200

locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write()

The current percpu-rwsem read side is entirely free of serializing insns
at the cost of having a synchronize_sched() in the write path.

The latency of the synchronize_sched() is too high for cgroups. The
commit 1ed1328792ff talks about the write path being a fairly cold path
but this is not the case for Android, which moves tasks to the foreground
cgroup and back around binder IPC calls from foreground processes to
background processes, so it is significantly hotter than human initiated
operations.

Switch cgroup_threadgroup_rwsem into the slow mode for now to avoid the
problem, hopefully it should not be that slow after another commit:

  80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce global impact").

We could just add rcu_sync_enter() into cgroup_init() but we do not want
another synchronize_sched() at boot time, so this patch adds the new helper
which doesn't block but currently can only be called before the first use.

Reported-by: John Stultz <john.stultz@linaro.org>
Reported-by: Dmitry Shmidt <dimitrysh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Colin Cross <ccross@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rom Lemarchand <romlem@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Link: http://lkml.kernel.org/r/20160811165413.GA22807@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rcu_sync.h |  1 +
 kernel/cgroup.c          |  6 ++++++
 kernel/rcu/sync.c        | 12 ++++++++++++
 3 files changed, 19 insertions(+)

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index a63a33e..ece7ed9 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 }
 
 extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_enter_start(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7..9f51cdf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5606,6 +5606,12 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+	/*
+	 * The latency of the synchronize_sched() is too high for cgroups,
+	 * avoid it at the cost of forcing all readers into the slow path.
+	 */
+	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
 	get_user_ns(init_cgroup_ns.user_ns);
 
 	mutex_lock(&cgroup_mutex);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 198473d..50d1861 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -85,6 +85,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
 }
 
 /**
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+	rsp->gp_count++;
+	rsp->gp_state = GP_PASSED;
+}
+
+/**
  * rcu_sync_enter() - Force readers onto slowpath
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-13  1:44           ` Om Dhyade
@ 2016-08-24 21:16             ` John Stultz
  2016-08-24 21:30               ` Tejun Heo
  0 siblings, 1 reply; 19+ messages in thread
From: John Stultz @ 2016-08-24 21:16 UTC (permalink / raw)
  To: Peter Zijlstra, Tejun Heo, Oleg Nesterov
  Cc: Om Dhyade, Paul E. McKenney, Ingo Molnar, lkml, Dmitry Shmidt,
	Rom Lemarchand, Colin Cross, Todd Kjos

On Fri, Aug 12, 2016 at 6:44 PM, Om Dhyade <odhyade@codeaurora.org> wrote:
> Update from my tests:
> Use-case: Android application launches.
>
> I tested the patches on an Android N build; I see max latency ~7ms.
> In my tests, the wait is due to copy_process() (fork.c) blocking all threads in
> __cgroup_procs_write(), including threads which are not part of the forking
> process's thread-group.
>
> Dmitry had provided a hack patch which reverts to a per-process rw-sem, which
> had max latency of ~2ms.
>
> The Android user-space binder library does 2 cgroup write operations per
> transaction; apart from the copy_process() (fork.c) wait, I see preemption in
> __cgroup_procs_write() causing waits.


Hey Peter, Tejun, Oleg,
  So while your tweaks for the percpu-rwsem have greatly helped the
regression folks were seeing (many thanks, by the way), as noted
above, the performance regression with the global lock compared to
earlier kernels is still ~3x slower (though again, much better than
the 80x slower that was seen earlier).

So I was wondering if patches to go back to the per signal_struct
locking would still be considered? Or is the global lock approach the
only way forward?

At a higher level, I'm worried that Android's use of cgroups as a
priority enforcement mechanism is at odds with developers focusing on
it as a container enforcement mechanism, as in the latter it's not
common for tasks to change between cgroups, but with the former
priority adjustments are quite common.

thanks
-john

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-24 21:16             ` John Stultz
@ 2016-08-24 21:30               ` Tejun Heo
  2016-08-24 22:50                 ` John Stultz
  2016-08-26  2:14                 ` John Stultz
  0 siblings, 2 replies; 19+ messages in thread
From: Tejun Heo @ 2016-08-24 21:30 UTC (permalink / raw)
  To: John Stultz
  Cc: Peter Zijlstra, Oleg Nesterov, Om Dhyade, Paul E. McKenney,
	Ingo Molnar, lkml, Dmitry Shmidt, Rom Lemarchand, Colin Cross,
	Todd Kjos

Hello, John.

On Wed, Aug 24, 2016 at 02:16:52PM -0700, John Stultz wrote:
> Hey Peter, Tejun, Oleg,
>   So while your tweaks for the percpu-rwsem have greatly helped the
> regression folks were seeing (many thanks, by the way), as noted
> above, the performance regression with the global lock compared to
> earlier kernels is still ~3x slower (though again, much better than
> the 80x slower that was seen earlier).
> 
> So I was wondering if patches to go back to the per signal_struct
> locking would still be considered? Or is the global lock approach the
> only way forward?

We can't simply revert but we can make the lock per signal_struct
again.  It's just that it'd be quite a bit more complex (but, again,
if we need it...) and for cases where migrations aren't as frequent
percpu-rwsem would be at least a bit lower overhead.  Can you please
test with the following patch applied just in case?

 https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/commit/?h=for-4.8-fixes&id=568ac888215c7fb2fabe8ea739b00ec3c1f5d440

> At a higher level, I'm worried that Android's use of cgroups as a
> priority enforcement mechanism is at odds with developers focusing on
> it as a container enforcement mechanism, as in the latter it's not
> common for tasks to change between cgroups, but with the former
> priority adjustments are quite common.

It has been at odds as long as android existed.  cgroup used to have
synchronous synchronize_rcu() in the migration path which android
kernel simply deleted (didn't break android's use case), re-labeling
every page's memcg ownership on foreground and background switches
(does it still do that? it's simply unworkable) and so on.

I find it difficult to believe that android's requirements can be
satisfied only through moving processes around.  cgroup usage model is
affected by container use cases but isn't limited to it.  If android
is willing to change, I'd be more than happy to work with you and
solve obstacles.  If not, we'll surely try not to break it anymore
than it has always been broken.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-24 21:30               ` Tejun Heo
@ 2016-08-24 22:50                 ` John Stultz
  2016-08-26  2:14                 ` John Stultz
  1 sibling, 0 replies; 19+ messages in thread
From: John Stultz @ 2016-08-24 22:50 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, Oleg Nesterov, Om Dhyade, Paul E. McKenney,
	Ingo Molnar, lkml, Dmitry Shmidt, Rom Lemarchand, Colin Cross,
	Todd Kjos

On Wed, Aug 24, 2016 at 2:30 PM, Tejun Heo <tj@kernel.org> wrote:
> On Wed, Aug 24, 2016 at 02:16:52PM -0700, John Stultz wrote:
>>
>> So I was wondering if patches to go back to the per signal_struct
>> locking would still be considered? Or is the global lock approach the
>> only way forward?
>
> We can't simply revert but we can make the lock per signal_struct
> again.  It's just that it'd be quite a bit more complex (but, again,
> if we need it...) and for cases where migrations aren't as frequent
> percpu-rwsem would be at least a bit lower overhead.  Can you please
> test with the following patch applied just in case?
>
>  https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/commit/?h=for-4.8-fixes&id=568ac888215c7fb2fabe8ea739b00ec3c1f5d440

That does provide a reasonable improvement in my testing!  But, I'll
pass it along to folks who are doing more in-depth performance
analysis to make sure.

If that doesn't work, I'll talk w/ Dmitry about submitting his patch
reworking things back to the per signal_struct locking.

>> At a higher level, I'm worried that Android's use of cgroups as a
>> priority enforcement mechanism is at odds with developers focusing on
>> it as a container enforcement mechanism, as in the latter it's not
>> common for tasks to change between cgroups, but with the former
>> priority adjustments are quite common.
>
> It has been at odds as long as android existed.  cgroup used to have
> synchronous synchronize_rcu() in the migration path which android
> kernel simply deleted (didn't break android's use case), re-labeling
> every page's memcg ownership on foreground and background switches
> (does it still do that? it's simply unworkable) and so on.

Yea. Android folks can correct me, but I think the memcg
foreground/background bits have been dropped. They still use an apps
memcg group for low-memory-notification in their
userland-low-memory-killer-daemon (however my understanding is that
has yet to sufficiently replace the in-kernel low-memory-killer).

> I find it difficult to believe that android's requirements can be
> satisfied only through moving processes around.  cgroup usage model is
> affected by container use cases but isn't limited to it.  If android
> is willing to change, I'd be more than happy to work with you and
> solve obstacles.  If not, we'll surely try not to break it anymore
> than it has always been broken.

I think folks are open to ideas, but going from idea-from-the-list to
something reliable enough to ship is always a bit fraught, so it often
takes some time and effort to really validate if those ideas are
workable.

I have on my list to re-submit the can_attach logic Android uses to
provide permissions checks on processes migrating other tasks between
cgroups, so at least we can try to reduce the separate domains of
focus and get folks sharing the same code bases.

thanks
-john

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-24 21:30               ` Tejun Heo
  2016-08-24 22:50                 ` John Stultz
@ 2016-08-26  2:14                 ` John Stultz
  2016-08-26 12:51                   ` Tejun Heo
  1 sibling, 1 reply; 19+ messages in thread
From: John Stultz @ 2016-08-26  2:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Peter Zijlstra, Oleg Nesterov, Om Dhyade, Paul E. McKenney,
	Ingo Molnar, lkml, Dmitry Shmidt, Rom Lemarchand, Colin Cross,
	Todd Kjos

On Wed, Aug 24, 2016 at 2:30 PM, Tejun Heo <tj@kernel.org> wrote:
> Hello, John.
>
> On Wed, Aug 24, 2016 at 02:16:52PM -0700, John Stultz wrote:
>> Hey Peter, Tejun, Oleg,
>>   So while your tweaks for the percpu-rwsem have greatly helped the
>> regression folks were seeing (many thanks, by the way), as noted
>> above, the performance regression with the global lock compared to
>> earlier kernels is still ~3x slower (though again, much better than
>> the 80x slower that was seen earlier).
>>
>> So I was wondering if patches to go back to the per signal_struct
>> locking would still be considered? Or is the global lock approach the
>> only way forward?
>
> We can't simply revert but we can make the lock per signal_struct
> again.  It's just that it'd be quite a bit more complex (but, again,
> if we need it...) and for cases where migrations aren't as frequent
> percpu-rwsem would be at least a bit lower overhead.  Can you please
> test with the following patch applied just in case?
>
>  https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/commit/?h=for-4.8-fixes&id=568ac888215c7fb2fabe8ea739b00ec3c1f5d440


Hey! Good news. This patch along with Peter's locking changes pushes
the latencies down to an apparently acceptable level!

Many thanks for the pointer!

thanks
-john

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-26  2:14                 ` John Stultz
@ 2016-08-26 12:51                   ` Tejun Heo
  2016-08-26 16:47                     ` Dmitry Shmidt
  0 siblings, 1 reply; 19+ messages in thread
From: Tejun Heo @ 2016-08-26 12:51 UTC (permalink / raw)
  To: John Stultz
  Cc: Peter Zijlstra, Oleg Nesterov, Om Dhyade, Paul E. McKenney,
	Ingo Molnar, lkml, Dmitry Shmidt, Rom Lemarchand, Colin Cross,
	Todd Kjos

Hello, John.

On Thu, Aug 25, 2016 at 07:14:07PM -0700, John Stultz wrote:
> Hey! Good news. This patch along with Peter's locking changes pushes
> the latencies down to an apparently acceptable level!

Ah, that's good to hear.  Please feel free to ping me if you guys
wanna talk about cgroup usage in android.

Thanks!

-- 
tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-26 12:51                   ` Tejun Heo
@ 2016-08-26 16:47                     ` Dmitry Shmidt
  2016-08-26 20:10                       ` Om Dhyade
  0 siblings, 1 reply; 19+ messages in thread
From: Dmitry Shmidt @ 2016-08-26 16:47 UTC (permalink / raw)
  To: Tejun Heo, Peter Zijlstra
  Cc: John Stultz, Oleg Nesterov, Om Dhyade, Paul E. McKenney,
	Ingo Molnar, lkml, Rom Lemarchand, Colin Cross, Todd Kjos

On Fri, Aug 26, 2016 at 5:51 AM, Tejun Heo <tj@kernel.org> wrote:
> Hello, John.
>
> On Thu, Aug 25, 2016 at 07:14:07PM -0700, John Stultz wrote:
>> Hey! Good news. This patch along with Peter's locking changes pushes
>> the latencies down to an apparently acceptable level!
>
> Ah, that's good to hear.  Please feel free to ping me if you guys
> wanna talk about cgroup usage in android.

Thanks a lot for all your help resolving this issue!

> Thanks!
>
> --
> tejun

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-26 16:47                     ` Dmitry Shmidt
@ 2016-08-26 20:10                       ` Om Dhyade
  0 siblings, 0 replies; 19+ messages in thread
From: Om Dhyade @ 2016-08-26 20:10 UTC (permalink / raw)
  To: Dmitry Shmidt, Tejun Heo, Peter Zijlstra
  Cc: John Stultz, Oleg Nesterov, Paul E. McKenney, Ingo Molnar, lkml,
	Rom Lemarchand, Colin Cross, Todd Kjos



On 8/26/2016 9:47 AM, Dmitry Shmidt wrote:
> On Fri, Aug 26, 2016 at 5:51 AM, Tejun Heo <tj@kernel.org> wrote:
>> Hello, John.
>>
>> On Thu, Aug 25, 2016 at 07:14:07PM -0700, John Stultz wrote:
>>> Hey! Good news. This patch along with Peter's locking changes pushes
>>> the latencies down to an apparently acceptable level!
>>
>> Ah, that's good to hear.  Please feel free to ping me if you guys
>> wanna talk about cgroup usage in android.
>
> Thanks a lot for all your help resolving this issue!
>
Thank you for the help.
>> Thanks!
>>
>> --
>> tejun

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-09  9:51 [PATCH v2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2016-08-09 23:47 ` John Stultz
  2016-08-11 16:54 ` [PATCH] cgroup: avoid synchronize_sched() in __cgroup_procs_write() Oleg Nesterov
@ 2016-08-31  5:21 ` Guenter Roeck
  2016-08-31  8:09   ` Peter Zijlstra
  2 siblings, 1 reply; 19+ messages in thread
From: Guenter Roeck @ 2016-08-31  5:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, John Stultz, Paul E. McKenney, Ingo Molnar, lkml,
	Tejun Heo, Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

Peter,

On Tue, Aug 09, 2016 at 11:51:12AM +0200, Peter Zijlstra wrote:
> Currently the percpu-rwsem switches to (global) atomic ops while a
> writer is waiting; which could be quite a while and slows down
> releasing the readers.
> 
> This patch cures this problem by ordering the reader-state vs
> reader-count (see the comments in __percpu_down_read() and
> percpu_down_write()). This changes a global atomic op into a full
> memory barrier, which doesn't have the global cacheline contention.
> 
> This also enables using the percpu-rwsem with rcu_sync disabled in order
> to bias the implementation differently, reducing the writer latency by
> adding some cost to readers.
> 
> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
> Reviewed-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/percpu-rwsem.h  |   84 +++++++++++++--
>  kernel/locking/percpu-rwsem.c |  228 ++++++++++++++++++++++++------------------
>  2 files changed, 206 insertions(+), 106 deletions(-)
> 
> --- a/include/linux/percpu-rwsem.h
> +++ b/include/linux/percpu-rwsem.h
> @@ -10,30 +10,96 @@
>  
>  struct percpu_rw_semaphore {
>  	struct rcu_sync		rss;
> -	unsigned int __percpu	*fast_read_ctr;
> +	unsigned int __percpu	*read_count;
>  	struct rw_semaphore	rw_sem;
> -	atomic_t		slow_read_ctr;
> -	wait_queue_head_t	write_waitq;
> +	wait_queue_head_t	writer;
> +	int			readers_block;
>  };
>  
> -extern void percpu_down_read(struct percpu_rw_semaphore *);
> -extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
> -extern void percpu_up_read(struct percpu_rw_semaphore *);
> +extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
> +extern void __percpu_up_read(struct percpu_rw_semaphore *);
> +
> +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> +{
> +	might_sleep();
> +
> +	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
> +
> +	preempt_disable();
> +	/*
> +	 * We are in an RCU-sched read-side critical section, so the writer
> +	 * cannot both leave the rcu_sync idle state (forcing us onto the slow
> +	 * path) and start checking counters while we are here. So if we see
> +	 * rcu_sync_is_idle(), we know that the writer won't be checking
> +	 * counters until we're past the preempt_enable() and that once the
> +	 * synchronize_sched() is done, the writer will see anything we did
> +	 * within this RCU-sched read-side critical section.
> +	 */
> +	__this_cpu_inc(*sem->read_count);
> +	if (unlikely(!rcu_sync_is_idle(&sem->rss)))

The call to rcu_sync_is_idle() causes the following build error when building
x86_64:allmodconfig.

ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined!
ERROR: "rcu_sync_lockdep_assert" [fs/ext4/ext4.ko] undefined!

I think this was also reported by the 0-day build bot.

The simple fix would of course be to export rcu_sync_lockdep_assert. Before I
apply that change to the Android code (where the patch has been applied and
the problem is seen) - do you by any chance have a better solution in mind?

Thanks,
Guenter

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-31  5:21 ` [v2] locking/percpu-rwsem: Optimize readers and reduce global impact Guenter Roeck
@ 2016-08-31  8:09   ` Peter Zijlstra
  2016-08-31 13:41     ` Guenter Roeck
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2016-08-31  8:09 UTC (permalink / raw)
  To: Guenter Roeck
  Cc: Oleg Nesterov, John Stultz, Paul E. McKenney, Ingo Molnar, lkml,
	Tejun Heo, Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On Tue, Aug 30, 2016 at 10:21:02PM -0700, Guenter Roeck wrote:
> Peter,
> 
> The call to rcu_sync_is_idle() causes the following build error when building
> x86_64:allmodconfig.
> 
> ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined!
> ERROR: "rcu_sync_lockdep_assert" [fs/ext4/ext4.ko] undefined!
> 
> I think this was also reported by the 0-day build bot.
> 
> The simple fix would of course be to export rcu_sync_lockdep_assert. Before I
> apply that change to the Android code (where the patch has been applied and
> the problem is seen) - do you by any chance have a better solution in mind?

Nope, that's exactly what Ingo did when he ran into this. If you look at
commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce
global impact"), you'll notice the last hunk:


diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9f3d37..198473d90f81 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
 	RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
 			 "suspicious rcu_sync_is_idle() usage");
 }
+
+EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
 #endif
 
 /**

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-31  8:09   ` Peter Zijlstra
@ 2016-08-31 13:41     ` Guenter Roeck
  2016-08-31 13:47       ` Guenter Roeck
  0 siblings, 1 reply; 19+ messages in thread
From: Guenter Roeck @ 2016-08-31 13:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, John Stultz, Paul E. McKenney, Ingo Molnar, lkml,
	Tejun Heo, Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On 08/31/2016 01:09 AM, Peter Zijlstra wrote:
> On Tue, Aug 30, 2016 at 10:21:02PM -0700, Guenter Roeck wrote:
>> Peter,
>>
>> The call to rcu_sync_is_idle() causes the following build error when building
>> x86_64:allmodconfig.
>>
>> ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined!
>> ERROR: "rcu_sync_lockdep_assert" [fs/ext4/ext4.ko] undefined!
>>
>> I think this was also reported by the 0-day build bot.
>>
>> The simple fix would of course be to export rcu_sync_lockdep_assert. Before I
>> apply that change to the Android code (where the patch has been applied and
>> the problem is seen) - do you by any chance have a better solution in mind?
>
> Nope, that's exactly what Ingo did when he ran into this. If you look at
> commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce
> global impact"), you'll notice the last hunk:
>
Ah, I missed that, and didn't realize that it is already upstream. Thanks a lot!

Guenter

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [v2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-08-31 13:41     ` Guenter Roeck
@ 2016-08-31 13:47       ` Guenter Roeck
  0 siblings, 0 replies; 19+ messages in thread
From: Guenter Roeck @ 2016-08-31 13:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, John Stultz, Paul E. McKenney, Ingo Molnar, lkml,
	Tejun Heo, Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On 08/31/2016 06:41 AM, Guenter Roeck wrote:
> On 08/31/2016 01:09 AM, Peter Zijlstra wrote:
>> On Tue, Aug 30, 2016 at 10:21:02PM -0700, Guenter Roeck wrote:
>>> Peter,
>>>
>>> The call to rcu_sync_is_idle() causes the following build error when building
>>> x86_64:allmodconfig.
>>>
>>> ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined!
>>> ERROR: "rcu_sync_lockdep_assert" [fs/ext4/ext4.ko] undefined!
>>>
>>> I think this was also reported by the 0-day build bot.
>>>
>>> The simple fix would of course be to export rcu_sync_lockdep_assert. Before I
>>> apply that change to the Android code (where the patch has been applied and
>>> the problem is seen) - do you by any chance have a better solution in mind?
>>
>> Nope, that's exactly what Ingo did when he ran into this. If you look at
>> commit 80127a39681b ("locking/percpu-rwsem: Optimize readers and reduce
>> global impact"), you'll notice the last hunk:
>>
> Ah, I missed that, and didn't realize that it is already upstream. Thanks a lot!
>

Wait, it isn't ... I must have fetched a branch with 80127a39681b. Ignore the noise
I am making until after I had another coffee.

Guenter

^ permalink raw reply	[flat|nested] 19+ messages in thread
