* [PATCH 0/2] locking/percpu-rwsem: Optimizations/tweaks
@ 2016-07-14 18:25 Peter Zijlstra
  2016-07-14 18:25 ` [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2016-07-14 18:25 ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Peter Zijlstra
  0 siblings, 2 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 18:25 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: oleg, tj, paulmck, john.stultz, dimitrysh, romlem, ccross, tkjos, peterz

As reported by John and Dmitry, Android tickles bad behaviour in
cgroup_threadgroup_rwsem.

These two patches attempt to address the issue: the first optimizes the
percpu-rwsem slow path, and the second adds a bias knob to the percpu-rwsem
that allows disabling the fast path, which improves writer latency by getting
rid of a synchronize_sched() call.

---
 fs/super.c                    |   3 +-
 include/linux/percpu-rwsem.h  |  96 +++++++++++++++--
 kernel/cgroup.c               |   2 +-
 kernel/locking/percpu-rwsem.c | 246 +++++++++++++++++++++++++-----------------
 4 files changed, 239 insertions(+), 108 deletions(-)


* [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-07-14 18:25 [PATCH 0/2] locking/percpu-rwsem: Optimizations/tweaks Peter Zijlstra
@ 2016-07-14 18:25 ` Peter Zijlstra
  2016-07-15 16:30   ` Oleg Nesterov
                     ` (2 more replies)
  2016-07-14 18:25 ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Peter Zijlstra
  1 sibling, 3 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 18:25 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: oleg, tj, paulmck, john.stultz, dimitrysh, romlem, ccross, tkjos, peterz

[-- Attachment #1: peterz-locking-percpu-rwsem-opt.patch --]
[-- Type: text/plain, Size: 13629 bytes --]

Currently the percpu-rwsem switches to (global) atomic ops while a
writer is waiting, which could be quite a while and slows down
releasing the readers.

This patch cures this problem by ordering the reader-state vs
reader-count (see the comments in __percpu_down_read() and
percpu_down_write()). This changes a global atomic op into a full
memory barrier, which doesn't have the global cacheline contention.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h  |   85 +++++++++++++--
 kernel/locking/percpu-rwsem.c |  236 ++++++++++++++++++++++++------------------
 2 files changed, 216 insertions(+), 105 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -10,30 +10,97 @@
 
 struct percpu_rw_semaphore {
 	struct rcu_sync		rss;
-	unsigned int __percpu	*fast_read_ctr;
+	unsigned int __percpu	*read_count;
 	struct rw_semaphore	rw_sem;
-	atomic_t		slow_read_ctr;
-	wait_queue_head_t	write_waitq;
+	wait_queue_head_t	writer;
+	int			state;
 };
 
-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern int  __percpu_down_read_trylock(struct percpu_rw_semaphore *);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+	might_sleep();
+
+	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+	preempt_disable();
+	/*
+	 * We are in an RCU-sched read-side critical section, so the writer
+	 * cannot both change sem->state from readers_fast and start checking
+	 * counters while we are here. So if we see !sem->state, we know that
+	 * the writer won't be checking until we're past the preempt_enable()
+	 * and that once the synchronize_sched() is done, the writer will see
+	 * anything we did within this RCU-sched read-side critical section.
+	 */
+	__this_cpu_inc(*sem->read_count);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		__percpu_down_read(sem); /* Unconditional memory barrier */
+	preempt_enable();
+	/*
+	 * The barrier() from preempt_enable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+}
+
+static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+	int ret = 1;
+
+	preempt_disable();
+	/*
+	 * Same as in percpu_down_read().
+	 */
+	__this_cpu_inc(*sem->read_count);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		ret = __percpu_down_read_trylock(sem);
+	preempt_enable();
+	/*
+	 * The barrier() from preempt_enable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+
+	if (ret)
+		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	/*
+	 * The barrier() in preempt_disable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+	preempt_disable();
+	/*
+	 * Same as in percpu_down_read().
+	 */
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_dec(*sem->read_count);
+	else
+		__percpu_up_read(sem); /* Unconditional memory barrier */
+	preempt_enable();
+
+	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
 				const char *, struct lock_class_key *);
+
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
-#define percpu_init_rwsem(brw)	\
+#define percpu_init_rwsem(sem)					\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
-
 #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,152 +8,196 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+enum { readers_slow, readers_block };
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *rwsem_key)
 {
-	brw->fast_read_ctr = alloc_percpu(int);
-	if (unlikely(!brw->fast_read_ctr))
+	sem->read_count = alloc_percpu(int);
+	if (unlikely(!sem->read_count))
 		return -ENOMEM;
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
-	__init_rwsem(&brw->rw_sem, name, rwsem_key);
-	rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
-	atomic_set(&brw->slow_read_ctr, 0);
-	init_waitqueue_head(&brw->write_waitq);
+	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	__init_rwsem(&sem->rw_sem, name, rwsem_key);
+	init_waitqueue_head(&sem->writer);
+	sem->state = readers_slow;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
 
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 {
 	/*
 	 * XXX: temporary kludge. The error path in alloc_super()
 	 * assumes that percpu_free_rwsem() is safe after kzalloc().
 	 */
-	if (!brw->fast_read_ctr)
+	if (!sem->read_count)
 		return;
 
-	rcu_sync_dtor(&brw->rss);
-	free_percpu(brw->fast_read_ctr);
-	brw->fast_read_ctr = NULL; /* catch use after free bugs */
+	rcu_sync_dtor(&sem->rss);
+	free_percpu(sem->read_count);
+	sem->read_count = NULL; /* catch use after free bugs */
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+void __percpu_down_read(struct percpu_rw_semaphore *sem)
 {
-	bool success;
+	/*
+	 * Due to having preemption disabled the decrement happens on
+	 * the same CPU as the increment, avoiding the
+	 * increment-on-one-CPU-and-decrement-on-another problem.
+	 *
+	 * And yes, if the reader misses the writer's assignment of
+	 * readers_block to sem->state, then the writer is
+	 * guaranteed to see the reader's increment.  Conversely, any
+	 * readers that increment their sem->read_count after the
+	 * writer looks are guaranteed to see the readers_block value,
+	 * which in turn means that they are guaranteed to immediately
+	 * decrement their sem->read_count, so that it doesn't matter
+	 * that the writer missed them.
+	 */
 
-	preempt_disable();
-	success = rcu_sync_is_idle(&brw->rss);
-	if (likely(success))
-		__this_cpu_add(*brw->fast_read_ctr, val);
-	preempt_enable();
+	smp_mb(); /* A matches D */
 
-	return success;
-}
+	/*
+	 * If !readers_block the critical section starts here, matched by the
+	 * release in percpu_up_write().
+	 */
+	if (likely(smp_load_acquire(&sem->state) != readers_block))
+		return;
 
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
-	might_sleep();
-	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+	/*
+	 * Per the above comment; we still have preemption disabled and
+	 * will thus decrement on the same CPU as we incremented.
+	 */
+	__percpu_up_read(sem);
 
-	if (likely(update_fast_ctr(brw, +1)))
-		return;
+	/*
+	 * We either call schedule() in the wait, or we'll fall through
+	 * and reschedule on the preempt_enable() in percpu_down_read().
+	 */
+	preempt_enable_no_resched();
+
+	/*
+	 * Avoid lockdep for the down/up_read(); we already have them.
+	 */
+	__down_read(&sem->rw_sem);
+	__this_cpu_inc(*sem->read_count);
+	__up_read(&sem->rw_sem);
 
-	/* Avoid rwsem_acquire_read() and rwsem_release() */
-	__down_read(&brw->rw_sem);
-	atomic_inc(&brw->slow_read_ctr);
-	__up_read(&brw->rw_sem);
+	preempt_disable();
 }
-EXPORT_SYMBOL_GPL(percpu_down_read);
+EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+int __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
-	if (unlikely(!update_fast_ctr(brw, +1))) {
-		if (!__down_read_trylock(&brw->rw_sem))
-			return 0;
-		atomic_inc(&brw->slow_read_ctr);
-		__up_read(&brw->rw_sem);
-	}
+	smp_mb(); /* A matches D */
+
+	if (likely(smp_load_acquire(&sem->state) != readers_block))
+		return 1;
 
-	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
-	return 1;
+	__percpu_up_read(sem);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(__percpu_down_read_trylock);
 
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
-	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
-	if (likely(update_fast_ctr(brw, -1)))
-		return;
+	smp_mb(); /* B matches C */
+	/*
+	 * In other words, if they see our decrement (presumably to aggregate
+	 * zero, as that is the only time it matters) they will also see our
+	 * critical section.
+	 */
+	__this_cpu_dec(*sem->read_count);
 
-	/* false-positive is possible but harmless */
-	if (atomic_dec_and_test(&brw->slow_read_ctr))
-		wake_up_all(&brw->write_waitq);
+	/* Prod writer to recheck readers_active */
+	wake_up(&sem->writer);
 }
-EXPORT_SYMBOL_GPL(percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var)						\
+({									\
+	typeof(var) __sum = 0;						\
+	int cpu;							\
+	compiletime_assert_atomic_type(__sum);				\
+	for_each_possible_cpu(cpu)					\
+		__sum += per_cpu(var, cpu);				\
+	__sum;								\
+})
 
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero.  If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
-	unsigned int sum = 0;
-	int cpu;
+	if (per_cpu_sum(*sem->read_count) != 0)
+		return false;
+
+	/*
+	 * If we observed the decrement; ensure we see the entire critical
+	 * section.
+	 */
 
-	for_each_possible_cpu(cpu) {
-		sum += per_cpu(*brw->fast_read_ctr, cpu);
-		per_cpu(*brw->fast_read_ctr, cpu) = 0;
-	}
+	smp_mb(); /* C matches B */
 
-	return sum;
+	return true;
 }
 
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
+	down_write(&sem->rw_sem);
+
+	/* Notify readers to take the slow path. */
+	rcu_sync_enter(&sem->rss);
+
 	/*
-	 * Make rcu_sync_is_idle() == F and thus disable the fast-path in
-	 * percpu_down_read() and percpu_up_read(), and wait for gp pass.
-	 *
-	 * The latter synchronises us with the preceding readers which used
-	 * the fast-past, so we can not miss the result of __this_cpu_add()
-	 * or anything else inside their criticial sections.
+	 * Notify new readers to block; up until now, and thus throughout the
+	 * longish rcu_sync_enter() above, new readers could still come in.
 	 */
-	rcu_sync_enter(&brw->rss);
+	WRITE_ONCE(sem->state, readers_block);
 
-	/* exclude other writers, and block the new readers completely */
-	down_write(&brw->rw_sem);
+	smp_mb(); /* D matches A */
 
-	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
-	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+	/*
+	 * If they don't see our store of readers_block to sem->state,
+	 * then we are guaranteed to see their sem->read_count increment, and
+	 * therefore will wait for them.
+	 */
 
-	/* wait for all readers to complete their percpu_up_read() */
-	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+	/* Wait for all now active readers to complete. */
+	wait_event(sem->writer, readers_active_check(sem));
 }
 EXPORT_SYMBOL_GPL(percpu_down_write);
 
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
-	/* release the lock, but the readers can't use the fast-path */
-	up_write(&brw->rw_sem);
 	/*
-	 * Enable the fast-path in percpu_down_read() and percpu_up_read()
-	 * but only after another gp pass; this adds the necessary barrier
-	 * to ensure the reader can't miss the changes done by us.
+	 * Signal the writer is done, no fast path yet.
+	 *
+	 * One reason that we cannot just immediately flip to readers_fast is
+	 * that new readers might fail to see the results of this writer's
+	 * critical section.
+	 *
+	 * Therefore we force it through the slow path which guarantees an
+	 * acquire and thereby guarantees the critical section's consistency.
+	 */
+	smp_store_release(&sem->state, readers_slow);
+
+	/*
+	 * Release the write lock, this will allow readers back in the game.
+	 */
+	up_write(&sem->rw_sem);
+
+	/*
+	 * Once this completes (at least one RCU grace period hence) the reader
+	 * fast path will be available again. Safe to use outside the exclusive
+	 * write lock because it's counting.
 	 */
-	rcu_sync_exit(&brw->rss);
+	rcu_sync_exit(&sem->rss);
 }
-EXPORT_SYMBOL_GPL(percpu_up_write);
+EXPORT_SYMBOL(percpu_up_write);


* [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 18:25 [PATCH 0/2] locking/percpu-rwsem: Optimizations/tweaks Peter Zijlstra
  2016-07-14 18:25 ` [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2016-07-14 18:25 ` Peter Zijlstra
  2016-07-14 18:37   ` Peter Zijlstra
  2016-07-14 18:43   ` Oleg Nesterov
  1 sibling, 2 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 18:25 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: oleg, tj, paulmck, john.stultz, dimitrysh, romlem, ccross, tkjos, peterz

[-- Attachment #1: peterz-locking-percpu-rwsem-bias.patch --]
[-- Type: text/plain, Size: 3605 bytes --]

The current percpu-rwsem read side is entirely free of serializing
instructions at the cost of having a synchronize_sched() in the write
path.

The latency of the synchronize_sched() is too high for some users
(cgroups), so provide a __percpu_init_rwsem(.bias) argument to forgo
this synchronize_sched() at the cost of forcing all readers into the
slow path, which has serializing instructions.

Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: John Stultz <john.stultz@linaro.org>
Reported-by: Dmitry Shmidt <dimitrysh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/super.c                    |    3 ++-
 include/linux/percpu-rwsem.h  |   15 +++++++++++++--
 kernel/cgroup.c               |    2 +-
 kernel/locking/percpu-rwsem.c |   10 +++++++++-
 4 files changed, 25 insertions(+), 5 deletions(-)

--- a/fs/super.c
+++ b/fs/super.c
@@ -195,7 +195,8 @@ static struct super_block *alloc_super(s
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
 		if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
 					sb_writers_name[i],
-					&type->s_writers_key[i]))
+					&type->s_writers_key[i],
+					PERCPU_RWSEM_READER))
 			goto fail;
 	}
 	init_waitqueue_head(&s->s_writers.wait_unfrozen);
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -90,15 +90,26 @@ static inline void percpu_up_read(struct
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
+enum percpu_rwsem_bias { PERCPU_RWSEM_READER, PERCPU_RWSEM_WRITER };
+
 extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
-				const char *, struct lock_class_key *);
+				const char *, struct lock_class_key *,
+				enum percpu_rwsem_bias bias);
 
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
 #define percpu_init_rwsem(sem)					\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key,		\
+			    PERCPU_RWSEM_READER);		\
+})
+
+#define percpu_init_rwsem_writer(sem)				\
+({								\
+	static struct lock_class_key rwsem_key;			\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key,		\
+			    PERCPU_RWSEM_WRITER);		\
 })
 
 #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5605,7 +5605,7 @@ int __init cgroup_init(void)
 	int ssid;
 
 	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
-	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
+	BUG_ON(percpu_init_rwsem_writer(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,8 @@
 enum { readers_slow, readers_block };
 
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
-			const char *name, struct lock_class_key *rwsem_key)
+			const char *name, struct lock_class_key *rwsem_key,
+			enum percpu_rwsem_bias bias)
 {
 	sem->read_count = alloc_percpu(int);
 	if (unlikely(!sem->read_count))
@@ -19,6 +20,13 @@ int __percpu_init_rwsem(struct percpu_rw
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	if (bias == PERCPU_RWSEM_WRITER) {
+		/*
+		 * Disable rcu_sync() and force slow path.
+		 */
+		sem->rss.gp_count++;
+		sem->rss.gp_state = !0;
+	}
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 	init_waitqueue_head(&sem->writer);
 	sem->state = readers_slow;


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 18:25 ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Peter Zijlstra
@ 2016-07-14 18:37   ` Peter Zijlstra
  2016-07-14 18:43   ` Oleg Nesterov
  1 sibling, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 18:37 UTC (permalink / raw)
  To: mingo, linux-kernel
  Cc: oleg, tj, paulmck, john.stultz, dimitrysh, romlem, ccross, tkjos

On Thu, Jul 14, 2016 at 08:25:47PM +0200, Peter Zijlstra wrote:
> @@ -19,6 +20,13 @@ int __percpu_init_rwsem(struct percpu_rw
>  
>  	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
>  	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
> +	if (bias == PERCPU_RWSEM_WRITER) {
> +		/*
> +		 * Disable rcu_sync() and force slow path.
> +		 */
> +		sem->rss.gp_count++;
> +		sem->rss.gp_state = !0;
> +	}
>  	__init_rwsem(&sem->rw_sem, name, rwsem_key);
>  	init_waitqueue_head(&sem->writer);
>  	sem->state = readers_slow;

So this seemed like a better deal than calling rcu_sync_enter(), because
that would still incur a (pointless) synchronize_sched() at init time
and people do tend to complain about things like that.
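
For reference, the path rcu_sync_enter() would take at init time looks roughly
like this (trimmed from the current kernel/rcu/sync.c; BUG_ON()s and the
pending-callback case omitted):

	void rcu_sync_enter(struct rcu_sync *rsp)
	{
		bool need_wait, need_sync;

		spin_lock_irq(&rsp->rss_lock);
		need_wait = rsp->gp_count++;
		need_sync = rsp->gp_state == GP_IDLE;
		if (need_sync)
			rsp->gp_state = GP_PENDING;
		spin_unlock_irq(&rsp->rss_lock);

		if (need_sync) {
			gp_ops[rsp->gp_type].sync();	/* synchronize_sched() for RCU_SCHED_SYNC */
			rsp->gp_state = GP_PASSED;
			wake_up_all(&rsp->gp_wait);
		} else if (need_wait) {
			/* a later writer sleeps here until GP_PASSED */
			wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
		}
	}

Bumping gp_count/gp_state by hand skips that sync() entirely.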


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 18:25 ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Peter Zijlstra
  2016-07-14 18:37   ` Peter Zijlstra
@ 2016-07-14 18:43   ` Oleg Nesterov
  2016-07-14 18:56     ` Peter Zijlstra
  2016-07-14 19:20     ` Peter Zijlstra
  1 sibling, 2 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-14 18:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On 07/14, Peter Zijlstra wrote:
>
> The current percpu-rwsem read side is entirely free of serializing
> instructions at the cost of having a synchronize_sched() in the write
> path.
>
> The latency of the synchronize_sched() is too high for some users
> (cgroups), so provide a __percpu_init_rwsem(.bias) argument to forgot
> this synchronize_sched() at the cost of forcing all readers into the
> slow path, which has serializing instructions.

Oh well... I personally do not think this is what we want... Can't
we just add the stupid rcu_sync_enter() into cgroup_init() at least
for now? Yes, this means the unnecessary .sync() at boot time, but
it will go away after cleanups I am going to send.

Because, again, we will probably want to change this bias dynamically.

Oleg.


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 18:43   ` Oleg Nesterov
@ 2016-07-14 18:56     ` Peter Zijlstra
  2016-07-14 19:20     ` Peter Zijlstra
  1 sibling, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 18:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On Thu, Jul 14, 2016 at 08:43:51PM +0200, Oleg Nesterov wrote:
> On 07/14, Peter Zijlstra wrote:
> >
> > The current percpu-rwsem read side is entirely free of serializing
> > instructions at the cost of having a synchronize_sched() in the write
> > path.
> >
> > The latency of the synchronize_sched() is too high for some users
> > (cgroups), so provide a __percpu_init_rwsem(.bias) argument to forgot
> > this synchronize_sched() at the cost of forcing all readers into the
> > slow path, which has serializing instructions.
> 
> Oh well... I personally do not think this is what we want... Can't
> we just add the stupid rcu_sync_enter() into cgroup_init() at least
> for now? 

> Yes, this means the unnecessary .sync() at boot time, but
> it will go away after cleanups I am going to send.

Those would have to hit the same merge window though; some people (like
Arjan) really care about boot times and hunt and kill people adding
pointless delays..

> Because, again, we will probably want to change this bias dynamically.

Hmm, how so?


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 18:43   ` Oleg Nesterov
  2016-07-14 18:56     ` Peter Zijlstra
@ 2016-07-14 19:20     ` Peter Zijlstra
  2016-07-14 19:29       ` Paul E. McKenney
                         ` (2 more replies)
  1 sibling, 3 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 19:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On Thu, Jul 14, 2016 at 08:43:51PM +0200, Oleg Nesterov wrote:
> 
> Oh well... I personally do not think this is what we want... Can't
> we just add the stupid rcu_sync_enter() into cgroup_init() at least
> for now? Yes, this means the unnecessary .sync() at boot time, but
> it will go away after cleanups I am going to send.

Something like so then?

---
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(stru
 }
 
 extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_sabotage(struct rcu_sync *);
 extern void rcu_sync_enter(struct rcu_sync *);
 extern void rcu_sync_exit(struct rcu_sync *);
 extern void rcu_sync_dtor(struct rcu_sync *);
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5609,6 +5609,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
+	rcu_sync_sabotage(&cgroup_threadgroup_rwsem.rss);
+
 	get_user_ns(init_cgroup_ns.user_ns);
 
 	mutex_lock(&cgroup_mutex);
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -83,6 +83,21 @@ void rcu_sync_init(struct rcu_sync *rsp,
 }
 
 /**
+ * rcu_sync_sabotage() - Sabotage a fresh rcu_sync instance
+ * @rsp: Pointer to rcu_sync structure to be sabotaged
+ *
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() pairs
+ * turn into NO-OPs.
+ */
+void rcu_sync_sabotage(struct rcu_sync *rsp)
+{
+	rsp->gp_count++;
+	rsp->gp_state = !GP_IDLE;
+}
+
+/**
  * rcu_sync_enter() - Force readers onto slowpath
  * @rsp: Pointer to rcu_sync structure to use for synchronization
  *


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 19:20     ` Peter Zijlstra
@ 2016-07-14 19:29       ` Paul E. McKenney
  2016-07-14 19:38         ` Peter Zijlstra
  2016-07-15 13:27       ` Oleg Nesterov
  2016-07-15 13:42       ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Oleg Nesterov
  2 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-14 19:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Thu, Jul 14, 2016 at 09:20:18PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 14, 2016 at 08:43:51PM +0200, Oleg Nesterov wrote:
> > 
> > Oh well... I personally do not think this is what we want... Can't
> > we just add the stupid rcu_sync_enter() into cgroup_init() at least
> > for now? Yes, this means the unnecessary .sync() at boot time, but
> > it will go away after cleanups I am going to send.
> 
> Something like so then?
> 
> ---
> --- a/include/linux/rcu_sync.h
> +++ b/include/linux/rcu_sync.h
> @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(stru
>  }
> 
>  extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
> +extern void rcu_sync_sabotage(struct rcu_sync *);
>  extern void rcu_sync_enter(struct rcu_sync *);
>  extern void rcu_sync_exit(struct rcu_sync *);
>  extern void rcu_sync_dtor(struct rcu_sync *);
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -5609,6 +5609,8 @@ int __init cgroup_init(void)
>  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
>  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
> 
> +	rcu_sync_sabotage(&cgroup_threadgroup_rwsem.rss);

With a name like that...

> +
>  	get_user_ns(init_cgroup_ns.user_ns);
> 
>  	mutex_lock(&cgroup_mutex);
> --- a/kernel/rcu/sync.c
> +++ b/kernel/rcu/sync.c
> @@ -83,6 +83,21 @@ void rcu_sync_init(struct rcu_sync *rsp,
>  }
> 
>  /**
> + * rcu_sync_sabotage() - Sabotage a fresh rcu_sync instance
> + * @rsp: Pointer to rcu_sync structure to be sabotaged
> + *
> + * Must be called after rcu_sync_init() and before first use.
> + *
> + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() pairs
> + * turn into NO-OPs.
> + */
> +void rcu_sync_sabotage(struct rcu_sync *rsp)
> +{
> +	rsp->gp_count++;
> +	rsp->gp_state = !GP_IDLE;

???  A very strange way to say GP_PENDING.  A new GP_DISABLED, perhaps?

							Thanx, Paul

> +}
> +
> +/**
>   * rcu_sync_enter() - Force readers onto slowpath
>   * @rsp: Pointer to rcu_sync structure to use for synchronization
>   *
> 


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 19:29       ` Paul E. McKenney
@ 2016-07-14 19:38         ` Peter Zijlstra
  2016-07-14 19:54           ` Paul E. McKenney
  0 siblings, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-14 19:38 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Thu, Jul 14, 2016 at 12:29:59PM -0700, Paul E. McKenney wrote:
> On Thu, Jul 14, 2016 at 09:20:18PM +0200, Peter Zijlstra wrote:
> >  /**
> > + * rcu_sync_sabotage() - Sabotage a fresh rcu_sync instance
> > + * @rsp: Pointer to rcu_sync structure to be sabotaged
> > + *
> > + * Must be called after rcu_sync_init() and before first use.
> > + *
> > + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() pairs
> > + * turn into NO-OPs.
> > + */
> > +void rcu_sync_sabotage(struct rcu_sync *rsp)
> > +{
> > +	rsp->gp_count++;
> > +	rsp->gp_state = !GP_IDLE;
> 
> ???  A very strange way to say GP_PENDING.  A new GP_DISABLED, perhaps?

Right, so the important thing is that it's not GP_IDLE, the rest doesn't
really matter.

This forces rcu_sync_is_idle() to return false. The skewed gp_count
ensures rcu_sync_{enter,exit}() pairs no-op.
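
That is, with rcu_sync_is_idle() being roughly (simplified from
include/linux/rcu_sync.h):

	static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
	{
		return !rsp->gp_state;	/* GP_IDLE == 0 */
	}

and with rcu_sync_exit() only queueing the back-to-fast-path callback once
--gp_count reaches zero, the pre-incremented count means that callback never
gets queued.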


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 19:38         ` Peter Zijlstra
@ 2016-07-14 19:54           ` Paul E. McKenney
  0 siblings, 0 replies; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-14 19:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Thu, Jul 14, 2016 at 09:38:00PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 14, 2016 at 12:29:59PM -0700, Paul E. McKenney wrote:
> > On Thu, Jul 14, 2016 at 09:20:18PM +0200, Peter Zijlstra wrote:
> > >  /**
> > > + * rcu_sync_sabotage() - Sabotage a fresh rcu_sync instance
> > > + * @rsp: Pointer to rcu_sync structure to be sabotaged
> > > + *
> > > + * Must be called after rcu_sync_init() and before first use.
> > > + *
> > > + * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() pairs
> > > + * turn into NO-OPs.
> > > + */
> > > +void rcu_sync_sabotage(struct rcu_sync *rsp)
> > > +{
> > > +	rsp->gp_count++;
> > > +	rsp->gp_state = !GP_IDLE;
> > 
> > ???  A very strange way to say GP_PENDING.  A new GP_DISABLED, perhaps?
> 
> Right, so the important thing is that its not GP_IDLE, the rest doesn't
> really matter.
> 
> This forces rcu_sync_is_idle() to return false. The skewed gp_count
> ensures rcu_sync_{enter,exit}() pairs no-op.

Understood.  But let's have at least some pity on the poor people who
might one day read this code.

							Thanx, Paul


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 19:20     ` Peter Zijlstra
  2016-07-14 19:29       ` Paul E. McKenney
@ 2016-07-15 13:27       ` Oleg Nesterov
  2016-07-15 13:39         ` Paul E. McKenney
  2016-07-15 13:42       ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Oleg Nesterov
  2 siblings, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-15 13:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On 07/14, Peter Zijlstra wrote:
>
> On Thu, Jul 14, 2016 at 08:43:51PM +0200, Oleg Nesterov wrote:
> >
> > Oh well... I personally do not think this is what we want... Can't
> > we just add the stupid rcu_sync_enter() into cgroup_init() at least
> > for now? Yes, this means the unnecessary .sync() at boot time, but
> > it will go away after cleanups I am going to send.
>
> Something like so then?

OK, agreed,

> ---
> --- a/include/linux/rcu_sync.h
> +++ b/include/linux/rcu_sync.h
> @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(stru
>  }
>
>  extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
> +extern void rcu_sync_sabotage(struct rcu_sync *);
>  extern void rcu_sync_enter(struct rcu_sync *);
>  extern void rcu_sync_exit(struct rcu_sync *);
>  extern void rcu_sync_dtor(struct rcu_sync *);
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -5609,6 +5609,8 @@ int __init cgroup_init(void)
>  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
>  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
>
> +	rcu_sync_sabotage(&cgroup_threadgroup_rwsem.rss);

Heh ;) I too think it should be renamed. I'd suggest

	__rcu_sync_enter(rss);

although I do not really mind and agree with any name.

Hopefully I'll send some cleanups soon, rcu_sync_enter() will be
re-implemented as

	rcu_sync_enter(rss)
	{
		if (__rcu_sync_enter(rss))
			__rcu_sync_wait(rss);
	}

> + * rcu_sync_sabotage() - Sabotage a fresh rcu_sync instance
> + * @rsp: Pointer to rcu_sync structure to be sabotaged
> + *
> + * Must be called after rcu_sync_init() and before first use.

OK. And after those cleanups __rcu_sync_enter() can be called at any
time.

Oleg.


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-15 13:27       ` Oleg Nesterov
@ 2016-07-15 13:39         ` Paul E. McKenney
  2016-07-15 13:45           ` Oleg Nesterov
  0 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-15 13:39 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Fri, Jul 15, 2016 at 03:27:09PM +0200, Oleg Nesterov wrote:
> On 07/14, Peter Zijlstra wrote:
> >
> > On Thu, Jul 14, 2016 at 08:43:51PM +0200, Oleg Nesterov wrote:
> > >
> > > Oh well... I personally do not think this is what we want... Can't
> > > we just add the stupid rcu_sync_enter() into cgroup_init() at least
> > > for now? Yes, this means the unnecessary .sync() at boot time, but
> > > it will go away after cleanups I am going to send.
> >
> > Something like so then?
> 
> OK, agreed,
> 
> > ---
> > --- a/include/linux/rcu_sync.h
> > +++ b/include/linux/rcu_sync.h
> > @@ -59,6 +59,7 @@ static inline bool rcu_sync_is_idle(stru
> >  }
> >
> >  extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
> > +extern void rcu_sync_sabotage(struct rcu_sync *);
> >  extern void rcu_sync_enter(struct rcu_sync *);
> >  extern void rcu_sync_exit(struct rcu_sync *);
> >  extern void rcu_sync_dtor(struct rcu_sync *);
> > --- a/kernel/cgroup.c
> > +++ b/kernel/cgroup.c
> > @@ -5609,6 +5609,8 @@ int __init cgroup_init(void)
> >  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
> >  	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
> >
> > +	rcu_sync_sabotage(&cgroup_threadgroup_rwsem.rss);
> 
> Heh ;) I too think it should be renamed. I'd suggest
> 
> 	__rcu_sync_enter(rss);
> 
> although I do not really mind and agree with any name.

Hmmm...  Why not just move the checks out into the caller?  That
would make the intent much more clear.

							Thanx, Paul

> Hopefully I'll send some cleanups soon, rcu_sync_enter() will be
> re-implemented as
> 
> 	rcu_sync_enter(rss)
> 	{
> 		if (__rcu_sync_enter(rss))
> 			__rcu_sync_wait(rss);
> 	}
> 
> > + * rcu_sync_sabotage() - Sabotage a fresh rcu_sync instance
> > + * @rsp: Pointer to rcu_sync structure to be sabotaged
> > + *
> > + * Must be called after rcu_sync_init() and before first use.
> 
> OK. And after those cleanups __rcu_sync_enter() can be called at any
> time.
> 
> Oleg.
> 


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-14 19:20     ` Peter Zijlstra
  2016-07-14 19:29       ` Paul E. McKenney
  2016-07-15 13:27       ` Oleg Nesterov
@ 2016-07-15 13:42       ` Oleg Nesterov
  2 siblings, 0 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-15 13:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On 07/14, Peter Zijlstra wrote:
>
> +void rcu_sync_sabotage(struct rcu_sync *rsp)
> +{
> +	rsp->gp_count++;
> +	rsp->gp_state = !GP_IDLE;
> +}

Ah, I didn't notice this !GP_IDLE...

Please use GP_PASSED, this is what this actually means. And note the
wait_event(GP_PASSED) in rcu_sync_enter().

Otherwise

Reviewed-by: Oleg Nesterov <oleg@redhat.com>


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-15 13:39         ` Paul E. McKenney
@ 2016-07-15 13:45           ` Oleg Nesterov
  2016-07-15 15:38             ` Paul E. McKenney
  0 siblings, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-15 13:45 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On 07/15, Paul E. McKenney wrote:
>
> On Fri, Jul 15, 2016 at 03:27:09PM +0200, Oleg Nesterov wrote:
> >
> > Heh ;) I too think it should be renamed. I'd suggest
> >
> > 	__rcu_sync_enter(rss);
> >
> > although I do not really mind and agree with any name.
>
> Hmmm...  Why not just move the checks out into the caller?  That
> would make the intent much more clear.

Hmm. which caller?

Oleg.


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-15 13:45           ` Oleg Nesterov
@ 2016-07-15 15:38             ` Paul E. McKenney
  2016-07-15 16:49               ` Oleg Nesterov
  0 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-15 15:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Fri, Jul 15, 2016 at 03:45:24PM +0200, Oleg Nesterov wrote:
> On 07/15, Paul E. McKenney wrote:
> >
> > On Fri, Jul 15, 2016 at 03:27:09PM +0200, Oleg Nesterov wrote:
> > >
> > > Heh ;) I too think it should be renamed. I'd suggest
> > >
> > > 	__rcu_sync_enter(rss);
> > >
> > > although I do not really mind and agree with any name.
> >
> > Hmmm...  Why not just move the checks out into the caller?  That
> > would make the intent much more clear.
> 
> Hmm. which caller?

The ones associated with a percpu_rwsem_bias of PERCPU_RWSEM_READER.

							Thanx, Paul


* Re: [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-07-14 18:25 ` [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2016-07-15 16:30   ` Oleg Nesterov
  2016-07-15 19:47     ` Peter Zijlstra
  2016-07-18 18:23   ` kbuild test robot
  2016-07-18 22:51   ` kbuild test robot
  2 siblings, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-15 16:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On 07/14, Peter Zijlstra wrote:
>
> Currently the percpu-rwsem switches to (global) atomic ops while a
> writer is waiting; which could be quite a while and slows down
> releasing the readers.
>
> This patch cures this problem by ordering the reader-state vs
> reader-count (see the comments in __percpu_down_read() and
> percpu_down_write()). This changes a global atomic op into a full
> memory barrier, which doesn't have the global cacheline contention.

I've applied this patch + another change you sent on top of it.

Everything looks good to me except the __this_cpu_inc() in
__percpu_down_read(),

> +	__down_read(&sem->rw_sem);
> +	__this_cpu_inc(*sem->read_count);
> +	__up_read(&sem->rw_sem);

Preemption is already enabled, don't we need this_cpu_inc() ?
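
I.e. something like this (untested):

	__down_read(&sem->rw_sem);
	this_cpu_inc(*sem->read_count);	/* preemption is enabled here */
	__up_read(&sem->rw_sem);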

> -EXPORT_SYMBOL_GPL(percpu_up_write);
> +EXPORT_SYMBOL(percpu_up_write);

and this one ;) I do not really care, but it seems you did this change
by accident.

Actually, I _think_ we can do some cleanups/improvements on top of this
change, but we can do this later. In particular, _perhaps_ we can avoid
the unconditional wakeup in __percpu_up_read(), but I am not sure and in
any case this needs another change.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-15 15:38             ` Paul E. McKenney
@ 2016-07-15 16:49               ` Oleg Nesterov
  2016-07-15 18:01                 ` Paul E. McKenney
  0 siblings, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-15 16:49 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On 07/15, Paul E. McKenney wrote:
> On Fri, Jul 15, 2016 at 03:45:24PM +0200, Oleg Nesterov wrote:
> > On 07/15, Paul E. McKenney wrote:
> > >
> > > On Fri, Jul 15, 2016 at 03:27:09PM +0200, Oleg Nesterov wrote:
> > > >
> > > > Heh ;) I too think it should be renamed. I'd suggest
> > > >
> > > > 	__rcu_sync_enter(rss);
> > > >
> > > > although I do not really mind and agree with any name.
> > >
> > > Hmmm...  Why not just move the checks out into the caller?  That
> > > would make the intent much more clear.
> >
> > Hmm. which caller?
>
> The ones associated with a percpu_rwsem_bias of PERCPU_RWSEM_READER.

Ah. But if we add __rcu_sync_enter() instead of bias/PERCPU_RWSEM_READER.

IOW, please ignore 2/2 which adds PERCPU_RWSEM_READER, the new version
just adds rcu_sync_sabotage() which should be renamed (and use GP_PASSED).

Oleg.


* Re: [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob
  2016-07-15 16:49               ` Oleg Nesterov
@ 2016-07-15 18:01                 ` Paul E. McKenney
  2016-07-16 17:10                   ` [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter() Oleg Nesterov
  0 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-15 18:01 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Fri, Jul 15, 2016 at 06:49:39PM +0200, Oleg Nesterov wrote:
> On 07/15, Paul E. McKenney wrote:
> > On Fri, Jul 15, 2016 at 03:45:24PM +0200, Oleg Nesterov wrote:
> > > On 07/15, Paul E. McKenney wrote:
> > > >
> > > > On Fri, Jul 15, 2016 at 03:27:09PM +0200, Oleg Nesterov wrote:
> > > > >
> > > > > Heh ;) I too think it should be renamed. I'd suggest
> > > > >
> > > > > 	__rcu_sync_enter(rss);
> > > > >
> > > > > although I do not really mind and agree with any name.
> > > >
> > > > Hmmm...  Why not just move the checks out into the caller?  That
> > > > would make the intent much more clear.
> > >
> > > Hmm. which caller?
> >
> > The ones associated with a percpu_rwsem_bias of PERCPU_RWSEM_READER.
> 
> Ah. But if we add __rcu_sync_enter() instead of bias/PERCPU_RWSEM_READER.
> 
> IOW, please ignore 2/2 which adds PERCPU_RWSEM_READER, the new version
> just adds rcu_sync_sabotage() which should be renamed (and use GP_PASSED).

OK, then move the checks out into the callers that would have used
__rcu_sync_enter().  ;-)

							Thanx, Paul


* Re: [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-07-15 16:30   ` Oleg Nesterov
@ 2016-07-15 19:47     ` Peter Zijlstra
  0 siblings, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-15 19:47 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: mingo, linux-kernel, tj, paulmck, john.stultz, dimitrysh, romlem,
	ccross, tkjos

On Fri, Jul 15, 2016 at 06:30:54PM +0200, Oleg Nesterov wrote:
> On 07/14, Peter Zijlstra wrote:
> >
> > Currently the percpu-rwsem switches to (global) atomic ops while a
> > writer is waiting; which could be quite a while and slows down
> > releasing the readers.
> >
> > This patch cures this problem by ordering the reader-state vs
> > reader-count (see the comments in __percpu_down_read() and
> > percpu_down_write()). This changes a global atomic op into a full
> > memory barrier, which doesn't have the global cacheline contention.
> 
> I've applied this patch + another change you sent on top of it.
> 
> Everything looks good to me except the __this_cpu_inc() in
> __percpu_down_read(),
> 
> > +	__down_read(&sem->rw_sem);
> > +	__this_cpu_inc(*sem->read_count);
> > +	__up_read(&sem->rw_sem);
> 
> Preemption is already enabled, don't we need this_cpu_inc() ?

Ah indeed. This mistake is quite old it seems, good catch.

> > -EXPORT_SYMBOL_GPL(percpu_up_write);
> > +EXPORT_SYMBOL(percpu_up_write);
> 
> and this one ;) I do not really care, but it seems you did this change
> by accident.

Yep, oops ;-)

> Actually, I _think_ we can do some cleanups/improvements on top of this
> change, but we can do this later. In particular, _perhaps_ we can avoid
> the unconditional wakeup in __percpu_up_read(), but I am not sure and in
> any case this needs another change.
> 
> Reviewed-by: Oleg Nesterov <oleg@redhat.com>

Thanks!


* [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-15 18:01                 ` Paul E. McKenney
@ 2016-07-16 17:10                   ` Oleg Nesterov
  2016-07-16 18:40                     ` Oleg Nesterov
                                       ` (2 more replies)
  0 siblings, 3 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-16 17:10 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra
  Cc: mingo, linux-kernel, tj, john.stultz, dimitrysh, romlem, ccross, tkjos

On 07/15, Paul E. McKenney wrote:
>
> On Fri, Jul 15, 2016 at 06:49:39PM +0200, Oleg Nesterov wrote:
> >
> > IOW, please ignore 2/2 which adds PERCPU_RWSEM_READER, the new version
> > just adds rcu_sync_sabotage() which should be renamed (and use GP_PASSED).
>
> OK, then move the checks out into the callers that would have used
> __rcu_sync_enter().  ;-)

Cough, now I can't understand which check you mean ;) OK, let me
show the code, then we will hopefully understand each other.

----------------------------------------------------------------------
So, as you can see I have fooled you ;) I'll send the patch on top of
Peter's changes, this is the (UNTESTED) code with the patch applied.

Peter, Paul, could you review? Do you see any hole?

Why. Firstly, note that the state machine was greatly simplified, and
rsp->cb_state has gone, we have a single "state" variable, gp_state.
Note also the ->sync() op has gone (and actually ->wait() too, see the
"TODO" comment).

GP_IDLE	- owned by __rcu_sync_enter() which can only move this state to

GP_ENTER - owned by rcu-callback which moves it to

GP_PASSED - owned by the last rcu_sync_exit() which moves it to

GP_EXIT - owned by rcu-callback which moves it back to GP_IDLE.

Yes, this is a bit simplified, we also have GP_REPLAY, but hopefully
clear.

And, there is another possible transition, GP_ENTER -> GP_IDLE, because
not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
state (except obviously they should be balanced), and they do not block.

The only blocking call is __rcu_sync_wait() which actually waits for GP.
Obviously should only be called if gp_count != 0, iow after __rcu_sync_enter.


--------------------------------------------------------------------------
Now, cgroup_init() can simply call __rcu_sync_enter(&cgroup_threadgroup_rwsem)
and switch this sem into the slow mode. Compared to "sabotage" from Peter
this implies the unnecessary call_rcu_sched(), but I hope we can tolerate
this.

And we can even add a runtime knob to switch between "fast" and "slow
aka writer-biased" modes for cgroup_threadgroup_rwsem.
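
Purely as an illustration (the helper below is made up, not part of any
posted patch), such a knob could sit directly on top of the new entry points:

	/* made-up helper: flip a percpu_rw_semaphore between reader-fast
	 * and writer-biased (slow reader) mode at runtime */
	void percpu_rwsem_set_writer_bias(struct percpu_rw_semaphore *sem, bool bias)
	{
		if (bias)
			__rcu_sync_enter(&sem->rss);	/* readers go slow; does not block */
		else
			rcu_sync_exit(&sem->rss);	/* fast path returns after a GP */
	}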

--------------------------------------------------------------------------
And I think __rcu_sync_enter() can have more users. Let's look at
freeze_super(). It calls percpu_down_write() 3 times, and waits for 3 GP's
sequentially.

Now we can add 3 __rcu_sync_enter's at the start and 3 rcu_sync_exit's at
the end (actually we can do better, just to simplify). And again, note
that rcu_sync_exit() will work correctly even if we (say) return -EBUSY,
so rcu_sync_wait and/or percpu_down_write() was not called in between,
and in this case we won't block waiting for GP.
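
For illustration only (made-up helper, not an actual freeze_super() patch),
the pattern would look like:

	static void percpu_down_write_many(struct percpu_rw_semaphore *sem, int nr)
	{
		int i;

		for (i = 0; i < nr; i++)
			__rcu_sync_enter(&sem[i].rss);	/* start all GPs, do not block */

		for (i = 0; i < nr; i++)
			percpu_down_write(&sem[i]);	/* waits for a GP already in flight */

		for (i = 0; i < nr; i++)
			rcu_sync_exit(&sem[i].rss);	/* drop the extra reference */
	}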

What do you think?

Oleg.
---

// rcu_sync.h: ----------------------------------------------------------------

struct rcu_sync {
	int			gp_state;
	int			gp_count;
	wait_queue_head_t	gp_wait;

	struct rcu_head		cb_head;
	enum rcu_sync_type	gp_type;
};

// sync.c ---------------------------------------------------------------------
#include <linux/rcu_sync.h>
#include <linux/sched.h>

#ifdef CONFIG_PROVE_RCU
#define __INIT_HELD(func)	.held = func,
#else
#define __INIT_HELD(func)
#endif

static const struct {
	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
	void (*wait)(void);	// TODO: remove this, see the comment in dtor
#ifdef CONFIG_PROVE_RCU
	int  (*held)(void);
#endif
} gp_ops[] = {
	[RCU_SYNC] = {
		.call = call_rcu,
		.wait = rcu_barrier,
		__INIT_HELD(rcu_read_lock_held)
	},
	[RCU_SCHED_SYNC] = {
		.call = call_rcu_sched,
		.wait = rcu_barrier_sched,
		__INIT_HELD(rcu_read_lock_sched_held)
	},
	[RCU_BH_SYNC] = {
		.call = call_rcu_bh,
		.wait = rcu_barrier_bh,
		__INIT_HELD(rcu_read_lock_bh_held)
	},
};

#define	rss_lock	gp_wait.lock

enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };

static void rcu_sync_func(struct rcu_head *rcu);

static void rcu_sync_call(struct rcu_sync *rsp)
{
	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
	// if rcu_blocking_is_gp() == T, but it has might_sleep().
	gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
}

static void rcu_sync_func(struct rcu_head *rcu)
{
	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
	unsigned long flags;

	BUG_ON(rsp->gp_state == GP_IDLE);
	BUG_ON(rsp->gp_state == GP_PASSED);

	spin_lock_irqsave(&rsp->rss_lock, flags);
	if (rsp->gp_count) {
		/*
		 * We're at least a GP after the first __rcu_sync_enter().
		 */
		rsp->gp_state = GP_PASSED;
	} else if (rsp->gp_state == GP_REPLAY) {
		/*
		 * A new rcu_sync_exit() has happened; requeue the callback
		 * to catch a later GP.
		 */
		rsp->gp_state = GP_EXIT;
		rcu_sync_call(rsp);
	} else {
		/*
		 * We're at least a GP after the last rcu_sync_exit();
		 * everybody will now have observed the write side critical
		 * section. Let 'em rip!
		 *
		 * OR. ->gp_state can be still GP_ENTER if __rcu_sync_wait()
		 * wasn't called after __rcu_sync_enter(), abort.
		 */
		rsp->gp_state = GP_IDLE;
	}
	spin_unlock_irqrestore(&rsp->rss_lock, flags);
}

bool __rcu_sync_enter(struct rcu_sync *rsp)
{
	int gp_count, gp_state;

	spin_lock_irq(&rsp->rss_lock);
	gp_count = rsp->gp_count++;
	gp_state = rsp->gp_state;
	if (gp_state == GP_IDLE) {
		rsp->gp_state = GP_ENTER;
		rcu_sync_call(rsp);
	}
	spin_unlock_irq(&rsp->rss_lock);

	BUG_ON(gp_count != 0 && gp_state == GP_IDLE);
	BUG_ON(gp_count == 0 && gp_state == GP_PASSED);

	return gp_state < GP_PASSED;
}

void __rcu_sync_wait(struct rcu_sync *rsp)
{
	BUG_ON(rsp->gp_state == GP_IDLE);
	BUG_ON(rsp->gp_count == 0);

	wait_event(rsp->gp_wait, rsp->gp_state >= GP_PASSED);
}

void rcu_sync_enter(struct rcu_sync *rsp)
{
	if (__rcu_sync_enter(rsp))
		__rcu_sync_wait(rsp);
}

void rcu_sync_exit(struct rcu_sync *rsp)
{
	BUG_ON(rsp->gp_state == GP_IDLE);
	BUG_ON(rsp->gp_count == 0);

	spin_lock_irq(&rsp->rss_lock);
	if (!--rsp->gp_count) {
		if (rsp->gp_state == GP_PASSED) {
			rsp->gp_state = GP_EXIT;
			rcu_sync_call(rsp);
		} else if (rsp->gp_state == GP_EXIT) {
			rsp->gp_state = GP_REPLAY;
		}
	}
	spin_unlock_irq(&rsp->rss_lock);
}

void rcu_sync_dtor(struct rcu_sync *rsp)
{
	int gp_state;

	BUG_ON(rsp->gp_count);
	BUG_ON(rsp->gp_state == GP_PASSED);

	spin_lock_irq(&rsp->rss_lock);
	if (rsp->gp_state == GP_REPLAY)
		rsp->gp_state = GP_EXIT;
	gp_state = rsp->gp_state;
	spin_unlock_irq(&rsp->rss_lock);

	// TODO: add another wake_up_locked() into rcu_sync_func(),
	// use wait_event + spin_lock_wait, remove gp_ops->wait().
	if (gp_state != GP_IDLE) {
		gp_ops[rsp->gp_type].wait();
		BUG_ON(rsp->gp_state != GP_IDLE);
	}
}


* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-16 17:10                   ` [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter() Oleg Nesterov
@ 2016-07-16 18:40                     ` Oleg Nesterov
  2016-07-18 11:54                     ` Peter Zijlstra
  2016-07-19 20:50                     ` Paul E. McKenney
  2 siblings, 0 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-16 18:40 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra
  Cc: mingo, linux-kernel, tj, john.stultz, dimitrysh, romlem, ccross, tkjos

Damn, sorry for noise...

On 07/16, Oleg Nesterov wrote:
>
> And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
  ^^^
I tried to say "now it is possible ..."

Oleg.


* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-16 17:10                   ` [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter() Oleg Nesterov
  2016-07-16 18:40                     ` Oleg Nesterov
@ 2016-07-18 11:54                     ` Peter Zijlstra
  2016-07-18 13:44                       ` Oleg Nesterov
  2016-07-19 20:50                     ` Paul E. McKenney
  2 siblings, 1 reply; 38+ messages in thread
From: Peter Zijlstra @ 2016-07-18 11:54 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Paul E. McKenney, mingo, linux-kernel, tj, john.stultz,
	dimitrysh, romlem, ccross, tkjos

On Sat, Jul 16, 2016 at 07:10:07PM +0200, Oleg Nesterov wrote:
> Peter, Paul, could you review? Do you see any hole?
> 
> Why. Firstly, note that the state machine was greatly simplified, and
> rsp->cb_state has gone, we have a single "state" variable, gp_state.
> Note also the ->sync() op has gone (and actually ->wait() too, see the
> "TODO" comment).

Yes, that seems like a nice simplification.

> GP_IDLE	- owned by __rcu_sync_enter() which can only move this state to
> 
> GP_ENTER - owned by rcu-callback which moves it to
> 
> GP_PASSED - owned by the last rcu_sync_exit() which moves it to
> 
> GP_EXIT - owned by rcu-callback which moves it back to GP_IDLE.
> 
> Yes, this is a bit simplified, we also have GP_REPLAY, but hopefully
> clear.
> 
> And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
> state (except obviously they should be balanced), and they do not block.
> 
> The only blocking call is __rcu_sync_wait() which actually waits for GP.
> Obviously should only be called if gp_count != 0, iow after __rcu_sync_enter.

So I'm a complete moron today, to aid the failing brain I draw pictures.

I think I ended up with this:

 .---->	GP_IDLE <--------------.
 |	  |                    |
 |	  | __rcu_sync_enter() | <GP> / rcu_sync_exit()
 |	  v                    |
 |	GP_ENTER --------------'
 |	  |
 |  <GP>  |
 |        v
 |	GP_PASSED <---------.
 |	  |                 |
 |	  | rcu_sync_exit() | <GP> / __rcu_sync_enter()
 |	  v                 |
 `----- GP_EXIT ------------'
	  ^
    <GP>  | __rcu_sync_enter() + rcu_sync_exit()
	  v
	GP_RETRY


> enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
> 
> static void rcu_sync_func(struct rcu_head *rcu);
> 
> static void rcu_sync_call(struct rcu_sync *rsp)
> {
> 	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
> 	// if rcu_blocking_is_gp() == T, but it has might_sleep().

Not sure I get that comment..

> 	gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
> }
> 
> static void rcu_sync_func(struct rcu_head *rcu)
> {
> 	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
> 	unsigned long flags;
> 

Right, those are 'stable' states and must be advanced through explicit
calls to __rcu_sync_enter()/rcu_sync_exit() respectively.

> 	BUG_ON(rsp->gp_state == GP_IDLE);
> 	BUG_ON(rsp->gp_state == GP_PASSED);
> 
> 	spin_lock_irqsave(&rsp->rss_lock, flags);
> 	if (rsp->gp_count) {
> 		/*
> 		 * We're at least a GP after the first __rcu_sync_enter().
> 		 */
> 		rsp->gp_state = GP_PASSED;

So we can end up here in two ways afaict.

The simple way: someone called __rcu_sync_enter(), we go IDLE -> ENTER
with a raised count. Once the GP passes, we get here, observe the raised
count and advance to PASSED.

The more involved way: we were EXIT and someone calls __rcu_sync_enter()
to raise the count again. The callback from rcu_sync_exit() was still
pending and once we get here we observe the raised count and voila.

Now, since state != IDLE, I suppose this is valid, but it does hurt my
brain.

Else, !count:

> 	} else if (rsp->gp_state == GP_REPLAY) {

Fairly straightforward: during EXIT someone did __rcu_sync_enter +
rcu_sync_exit() and we need to wait longer still.

> 		/*
> 		 * A new rcu_sync_exit() has happened; requeue the callback
> 		 * to catch a later GP.
> 		 */
> 		rsp->gp_state = GP_EXIT;
> 		rcu_sync_call(rsp);
> 	} else {

Again two ways here. The simple one: we were EXIT, the GP passed, we let 'em rip.

But it's also possible, as you noted, that someone called rcu_sync_exit()
during ENTER and we ended up with !count, which gets us back to IDLE.

> 		/*
> 		 * We're at least a GP after the last rcu_sync_exit();
> 		 * everybody will now have observed the write side critical
> 		 * section. Let 'em rip!.
> 		 *
> 		 * OR. ->gp_state can be still GP_ENTER if __rcu_sync_wait()
> 		 * wasn't called after __rcu_sync_enter(), abort.
> 		 */
> 		rsp->gp_state = GP_IDLE;
> 	}
> 	spin_unlock_irqrestore(&rsp->rss_lock, flags);
> }

Agreed on the rest.

So I think it's solid, but you failed to mention one state transition,
which seems ok in any case.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-18 11:54                     ` Peter Zijlstra
@ 2016-07-18 13:44                       ` Oleg Nesterov
  0 siblings, 0 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-18 13:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, mingo, linux-kernel, tj, john.stultz,
	dimitrysh, romlem, ccross, tkjos

On 07/18, Peter Zijlstra wrote:
>
> I think I ended up with this:
>
>  .---->	GP_IDLE <--------------.
>  |	  |                    |
>  |	  | __rcu_sync_enter() | <GP> / rcu_sync_exit()
>  |	  v                    |
>  |	GP_ENTER --------------'
>  |	  |
>  |  <GP>  |
>  |        v
>  |	GP_PASSED <---------.
>  |	  |                 |
>  |	  | rcu_sync_exit() | <GP> / __rcu_sync_enter()
>  |	  v                 |
>  `----- GP_EXIT ------------'
> 	  ^
>     <GP>  | __rcu_sync_enter() + rcu_sync_exit()
> 	  v
> 	GP_REPLAY

Thanks! I'll include this into the changelog.

> > static void rcu_sync_call(struct rcu_sync *rsp)
> > {
> > 	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
> > 	// if rcu_blocking_is_gp() == T, but it has might_sleep().
>
> Not sure I get that comment..

I meant, we actually want

	static void rcu_sync_call(struct rcu_sync *rsp)
	{
		if (rcu_blocking_is_gp())
			rcu_sync_func(&rsp->cb_head);
		else
			gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
	}

but this needs some other simple changes: rcu_sync_func() takes the
same spinlock we are already holding, and rcu_blocking_is_gp() calls
might_sleep().

We can simply move rcu_sync_call() outside of rsp->rss_lock, but
then we will need more comments to explain why we can't race with
enter/exit from someone else. Or introduce __rcu_blocking_is_gp()
without might_sleep(), but this needs a trivial change outside of
rcu/sync.c.

We will see. This is simple anyway.
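
Just to illustrate that "trivial change", a rough sketch -- this assumes
the current rcu_blocking_is_gp() in kernel/rcu/tree.c is essentially a
might_sleep() check plus a single-online-CPU test, it is not a real patch:

	/* might_sleep()-free variant, for illustration only */
	static int __rcu_blocking_is_gp(void)
	{
		int ret;

		preempt_disable();
		ret = num_online_cpus() <= 1;
		preempt_enable();
		return ret;
	}

	static int rcu_blocking_is_gp(void)
	{
		might_sleep();	/* Check for RCU read-side critical section. */
		return __rcu_blocking_is_gp();
	}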

> > 	spin_lock_irqsave(&rsp->rss_lock, flags);
> > 	if (rsp->gp_count) {
> > 		/*
> > 		 * We're at least a GP after the first __rcu_sync_enter().
> > 		 */
> > 		rsp->gp_state = GP_PASSED;
>
> So we can end up here in two ways afaict.
>
> The simple way: someone called __rcu_sync_enter(), we go IDLE -> ENTER
> with a raised count. Once the GP passes, we get here, observe the raised
> count and advance to PASSED.
>
> The more involved way: we were EXIT and someone calls __rcu_sync_enter()
> to raise the count again. The callback from rcu_sync_exit() was still
> pending and once we get here we observe the raised count and voila.

Yes, yes.

> Now, since state != IDLE, I suppose this is valid, but it does hurt my
> brain.

Simply put, if rsp->gp_count != 0 we do not care about the history;
GP_PASSED is always correct when the rcu callback runs, because the fact
that it ran at all means a GP has passed.

Except that a direct GP_IDLE -> GP_PASSED transition would be wrong, but
it is not possible: only the rcu callback can set GP_IDLE, and only if
!gp_count, so we always have at least one GP in between. Note also the
BUG_ON() checks at the start.

> So I think it's solid, but you failed to mention one state transition,
> which seems ok in any case.

Great, thanks for review.

I'll send the actual patch on top of your changes.

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-07-14 18:25 ` [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2016-07-15 16:30   ` Oleg Nesterov
@ 2016-07-18 18:23   ` kbuild test robot
  2016-07-18 22:51   ` kbuild test robot
  2 siblings, 0 replies; 38+ messages in thread
From: kbuild test robot @ 2016-07-18 18:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kbuild-all, mingo, linux-kernel, oleg, tj, paulmck, john.stultz,
	dimitrysh, romlem, ccross, tkjos, peterz

[-- Attachment #1: Type: text/plain, Size: 841 bytes --]

Hi,

[auto build test ERROR on v4.7-rc7]
[also build test ERROR on next-20160718]
[cannot apply to tip/core/locking]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Peter-Zijlstra/locking-percpu-rwsem-Optimize-readers-and-reduce-global-impact/20160715-142422
config: x86_64-randconfig-i0-07182210 (attached as .config)
compiler: gcc-6 (Debian 6.1.1-1) 6.1.1 20160430
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

>> ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 30197 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact
  2016-07-14 18:25 ` [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2016-07-15 16:30   ` Oleg Nesterov
  2016-07-18 18:23   ` kbuild test robot
@ 2016-07-18 22:51   ` kbuild test robot
  2 siblings, 0 replies; 38+ messages in thread
From: kbuild test robot @ 2016-07-18 22:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: kbuild-all, mingo, linux-kernel, oleg, tj, paulmck, john.stultz,
	dimitrysh, romlem, ccross, tkjos, peterz

[-- Attachment #1: Type: text/plain, Size: 896 bytes --]

Hi,

[auto build test ERROR on v4.7-rc7]
[also build test ERROR on next-20160718]
[cannot apply to tip/core/locking]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Peter-Zijlstra/locking-percpu-rwsem-Optimize-readers-and-reduce-global-impact/20160715-142422
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-1) 6.1.1 20160430
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   ERROR: "rcu_sync_lockdep_assert" [kernel/locking/locktorture.ko] undefined!
>> ERROR: "rcu_sync_lockdep_assert" [fs/ext4/ext4.ko] undefined!

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/octet-stream, Size: 54806 bytes --]

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-16 17:10                   ` [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter() Oleg Nesterov
  2016-07-16 18:40                     ` Oleg Nesterov
  2016-07-18 11:54                     ` Peter Zijlstra
@ 2016-07-19 20:50                     ` Paul E. McKenney
  2016-07-20 15:13                       ` Oleg Nesterov
  2016-07-20 17:16                       ` Oleg Nesterov
  2 siblings, 2 replies; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-19 20:50 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Sat, Jul 16, 2016 at 07:10:07PM +0200, Oleg Nesterov wrote:
> On 07/15, Paul E. McKenney wrote:
> >
> > On Fri, Jul 15, 2016 at 06:49:39PM +0200, Oleg Nesterov wrote:
> > >
> > > IOW, please ignore 2/2 which adds PERCPU_RWSEM_READER, the new version
> > > just adds rcu_sync_sabotage() which should be renamed (and use GP_PASSED).
> >
> > OK, then move the checks out into the callers that would have used
> > __rcu_sync_enter().  ;-)
> 
> Cough, now I can't understand which check do you mean ;) OK, let me
> show the code, then we will hopefully understand each other.

The check in an earlier patch having to do with some _NONE value.  I don't
see any such checks here.  Which might be a good thing, who knows?  ;-)

> ----------------------------------------------------------------------
> So, as you can see I have fooled you ;) I'll send the patch on top of
> Peter's changes, this is the (UNTESTED) code with the patch applied.
> 
> Peter, Paul, could you review? Do you see any hole?

Please see below.  I see what looks like a missing wakeup and something
that could fail to wait for pre-existing RCU readers.  At the very end,
I have a somewhat more elaborate state diagram, along with assumptions
I made during review.  Please do check these assumptions carefully.

And the state table, for that matter...

> Why. Firstly, note that the state machine was greatly simplified, and
> rsp->cb_state has gone, we have a single "state" variable, gp_state.
> Note also the ->sync() op has gone (and actually ->wait() too, see the
> "TODO" comment).
> 
> GP_IDLE	- owned by __rcu_sync_enter() which can only move this state to
> 
> GP_ENTER - owned by rcu-callback which moves it to
> 
> GP_PASSED - owned by the last rcu_sync_exit() which moves it to
> 
> GP_EXIT - owned by rcu-callback which moves it back to GP_IDLE.
> 
> Yes, this is a bit simplified, we also have GP_REPLAY, but hopefully
> clear.

Agreed, given that you are ignoring GP_REPLAY.

> And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
> state (except obviously they should be balanced), and they do not block.

Hmmm...  I am assuming that the only threads allowed to invoke
rcu_sync_exit() are those that have executed an unpaired rcu_sync_enter().
With that assumption, rcu_sync_enter() is illegal if GP_IDLE or GP_ENTER,
because the rcu_sync_enter() calls won't return until the state reaches
GP_PASSED.

If you feel strongly about allowing rcu_sync_exit() in GP_ENTER state,
could you please tell me your use case?  Or am I confused?

> The only blocking call is __rcu_sync_wait() which actually waits for GP.
> Obviously should only be called if gp_count != 0, iow after __rcu_sync_enter.

Well, rcu_sync_dtor() as well.  But to your point, it is blocking in
rcu_barrier(), not in wait_event().

> --------------------------------------------------------------------------
> Now, cgroup_init() can simply call __rcu_sync_enter(&cgroup_threadgroup_rwsem)
> and switch this sem into the slow mode. Compared to "sabotage" from Peter
> this implies the unnecessary call_rcu_sched(), but I hope we can tolerate
> this.

Or you could add a flag to rcu_sync_init() and friends, but I vaguely
recall some objection to that.

> And we can even add a runtime knob to switch between "fast" and "slow
> aka writer-biased" modes for cgroup_threadgroup_rwsem.

This might well be the check that I was objecting to earlier.  ;-)

> --------------------------------------------------------------------------
> And I think __rcu_sync_enter() can have more users. Let's look at
> freeze_super(). It calls percpu_down_write() 3 times, and waits for 3 GP's
> sequentially.
> 
> Now we can add 3 __rcu_sync_enter's at the start and 3 rcu_sync_exit's at
> the end (actually we can do better, just to simplify). And again, note
> that rcu_sync_exit() will work correctly even if we (say) return -EBUSY,
> so rcu_sync_wait and/or percpu_down_write() was not called in between,
> and in this case we won't block waiting for GP.
> 
> What do you think?

I am not going to claim to understand freeze_super(), but it does seem
to have a fair amount of waiting.

But yes, you could put rcu_sync_enter() and rcu_sync_exit() before and
after a series of write-side enter/exit pairs in order to force things
to stay in writer mode, if that is what you are suggesting.
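
Something like the following rough sketch, I mean, given a struct
percpu_rw_semaphore *sem (the ".rss" member name for the embedded
rcu_sync is my assumption, just for illustration):

	rcu_sync_enter(&sem->rss);	/* force and hold writer (slow) mode */

	percpu_down_write(sem);
	/* first write-side section */
	percpu_up_write(sem);

	percpu_down_write(sem);		/* no extra GP wait needed here */
	/* second write-side section */
	percpu_up_write(sem);

	rcu_sync_exit(&sem->rss);	/* allow switching back to fast readers */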

> Oleg.
> ---
> 
> // rcu_sync.h: ----------------------------------------------------------------
> 
> struct rcu_sync {
> 	int			gp_state;
> 	int			gp_count;
> 	wait_queue_head_t	gp_wait;
> 
> 	struct rcu_head		cb_head;
> 	enum rcu_sync_type	gp_type;
> };
> 
> // sync.c ---------------------------------------------------------------------
> #include <linux/rcu_sync.h>
> #include <linux/sched.h>
> 
> #ifdef CONFIG_PROVE_RCU
> #define __INIT_HELD(func)	.held = func,
> #else
> #define __INIT_HELD(func)
> #endif
> 
> static const struct {
> 	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
> 	void (*wait)(void);	// TODO: remove this, see the comment in dtor
> #ifdef CONFIG_PROVE_RCU
> 	int  (*held)(void);
> #endif
> } gp_ops[] = {
> 	[RCU_SYNC] = {
> 		.call = call_rcu,
> 		.wait = rcu_barrier,
> 		__INIT_HELD(rcu_read_lock_held)
> 	},
> 	[RCU_SCHED_SYNC] = {
> 		.call = call_rcu_sched,
> 		.wait = rcu_barrier_sched,
> 		__INIT_HELD(rcu_read_lock_sched_held)
> 	},
> 	[RCU_BH_SYNC] = {
> 		.call = call_rcu_bh,
> 		.wait = rcu_barrier_bh,
> 		__INIT_HELD(rcu_read_lock_bh_held)
> 	},
> };
> 
> #define	rss_lock	gp_wait.lock

Interesting...  Should there instead be a wait_queue_lock() and
wait_queue_unlock()?  If someone changes how wait-queue locking work,
this one might be a bit tricky.

> enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
> 
> static void rcu_sync_func(struct rcu_head *rcu);
> 
> static void rcu_sync_call(struct rcu_sync *rsp)
> {
> 	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
> 	// if rcu_blocking_is_gp() == T, but it has might_sleep().

Careful!  This optimization works only for RCU-sched and RCU-bh.
With normal RCU, you can be tripped up by tasks preempted within RCU
read-side critical sections should CONFIG_PREEMPT=y.

> 	gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
> }
> 
> static void rcu_sync_func(struct rcu_head *rcu)
> {
> 	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
> 	unsigned long flags;
> 
> 	BUG_ON(rsp->gp_state == GP_IDLE);

If you really want rcu_sync_exit() to be invoked before the corresponding
rcu_sync_enter() returns, and if you want that to set the state to
GP_IDLE, then the above BUG_ON() can fire.  This is why I was assuming
that rcu_sync_enter() must return before the corresponding rcu_sync_exit()
can start.

> 	BUG_ON(rsp->gp_state == GP_PASSED);

READ_ONCE() would feel better in both of the above BUG_ON()s, given that
we are not yet holding the lock.

> 	spin_lock_irqsave(&rsp->rss_lock, flags);
> 	if (rsp->gp_count) {
> 		/*
> 		 * We're at least a GP after the first __rcu_sync_enter().
> 		 */

Don't we need a wakeup here?

> 		rsp->gp_state = GP_PASSED;
> 	} else if (rsp->gp_state == GP_REPLAY) {
> 		/*
> 		 * A new rcu_sync_exit() has happened; requeue the callback
> 		 * to catch a later GP.
> 		 */
> 		rsp->gp_state = GP_EXIT;
> 		rcu_sync_call(rsp);
> 	} else {
> 		/*
> 		 * We're at least a GP after the last rcu_sync_exit();
> 		 * EVeybody will now have observed the write side critical
> 		 * section. Let 'em rip!.
> 		 *
> 		 * OR. ->gp_state can be still GP_ENTER if __rcu_sync_wait()
> 		 * wasn't called after __rcu_sync_enter(), abort.
> 		 */
> 		rsp->gp_state = GP_IDLE;
> 	}
> 	spin_unlock_irqrestore(&rsp->rss_lock, flags);
> }
> 
> bool __rcu_sync_enter(struct rcu_sync *rsp)
> {
> 	int gp_count, gp_state;
> 
> 	spin_lock_irq(&rsp->rss_lock);
> 	gp_count = rsp->gp_count++;
> 	gp_state = rsp->gp_state;
> 	if (gp_state == GP_IDLE) {
> 		rsp->gp_state = GP_ENTER;
> 		rcu_sync_call(rsp);
> 	}
> 	spin_unlock_irq(&rsp->rss_lock);
> 
> 	BUG_ON(gp_count != 0 && gp_state == GP_IDLE);
> 	BUG_ON(gp_count == 0 && gp_state == GP_PASSED);

Isn't gp_count==0 illegal here regardless of gp_state value?

> 	return gp_state < GP_PASSED;

And the above statement is another reason why I believe that it
should be illegal to invoke rcu_sync_exit() until after the matching
rcu_sync_enter() has returned.  If we were preempted just before the
above "return" statement, we might be in GP_IDLE state upon return,
which could fatally disappoint the caller.

And ditto for the next BUG_ON().

> }
> 
> void __rcu_sync_wait(struct rcu_sync *rsp)
> {
> 	BUG_ON(rsp->gp_state == GP_IDLE);
> 	BUG_ON(rsp->gp_count == 0);
> 
> 	wait_event(rsp->gp_wait, rsp->gp_state >= GP_PASSED);

I would feel better if all the references to ->gp_state and ->gp_count
outside of the lock be READ_ONCE().  Compilers can be tricky beasts.

> }

Here is what I believe rcu_sync_enter() does:

rcu_sync_enter() checks to see if the rcu_sync structure is in writer
mode, and, if not, initiates a transition to writer mode and waits for
that transition to complete.  Either way, a writer reference is acquired.

> void rcu_sync_enter(struct rcu_sync *rsp)
> {
> 	if (__rcu_sync_enter(rsp))
> 		__rcu_sync_wait(rsp);
> }

Here is what I believe rcu_sync_exit() does:

rcu_sync_exit() removes an updater reference, and if there are no more,
starts the transition to reader mode.  Note that this must handle
transitions back to writer mode that occur before the transition to
reader mode has fully completed.

> void rcu_sync_exit(struct rcu_sync *rsp)
> {
> 	BUG_ON(rsp->gp_state == GP_IDLE);

	BUG_ON(READ_ONCE(rsp->gp_state) == GP_ENTER);

If you do allow rcu_sync_exit() while GP_ENTER, then all the readers
can do rcu_sync_exit() in that state, leaving the state at GP_PASSED
instead of the desired GP_IDLE.

> 	BUG_ON(rsp->gp_count == 0);
> 
> 	spin_lock_irq(&rsp->rss_lock);
> 	if (!--rsp->gp_count) {
> 		if (rsp->gp_state == GP_PASSED) {
> 			rsp->gp_state = GP_EXIT;
> 			rcu_sync_call(rsp);
> 		} else if (rsp->gp_state == GP_EXIT) {
> 			rsp->gp_state = GP_REPLAY;
> 		}
> 	}
> 	spin_unlock_irq(&rsp->rss_lock);
> }

And here is what I believe rcu_sync_dtor() does:

rcu_sync_dtor() cleans up at end, waiting for a full transition back
into reader mode.  Anything invoking rcu_sync_dtor() must have ensured
that there will be no more calls to rcu_sync_enter() and rcu_sync_exit().

> void rcu_sync_dtor(struct rcu_sync *rsp)
> {
> 	int gp_state;
> 
> 	BUG_ON(rsp->gp_count);
> 	BUG_ON(rsp->gp_state == GP_PASSED);
> 
> 	spin_lock_irq(&rsp->rss_lock);
> 	if (rsp->gp_state == GP_REPLAY)
> 		rsp->gp_state = GP_EXIT;

OK, this ensures that the .wait() below will wait for the callback, but
it might result in some RCU read-side critical sections still being in
flight after rcu_sync_dtor() completes.  In other words, any transition
started by an rcu_sync_exit() will normally wait until all pre-existing
RCU readers complete -- unless that rcu_sync_exit() set the state to
GP_REPLAY and was quickly followed by rcu_sync_dtor().

Is this really OK???

You might get a more robust API if you left the value of ->gp_state
alone and rechecked ->gp_state below (under the lock).  The additional
overhead is incurred only at cleanup, so should not be a problem.

Or am I missing some subtle constraint on use cases here?

> 	gp_state = rsp->gp_state;
> 	spin_unlock_irq(&rsp->rss_lock);
> 
> 	// TODO: add another wake_up_locked() into rcu_sync_func(),

We need to add the first wakeup before we can add another one.  ;-)

But I don't see why we need a second wakeup, given that rcu_barrier()
wakes itself up.

> 	// use wait_event + spin_lock_wait, remove gp_ops->wait().
> 	if (gp_state != GP_IDLE) {
> 		gp_ops[rsp->gp_type].wait();
> 		BUG_ON(rsp->gp_state != GP_IDLE);
> 	}
> }
> 

And please see below for the state table and assumptions.

So, what am I missing?

							Thanx, Paul

------------------------------------------------------------------------


o       "count" is the value of the rcu_sync structure's ->gp_count field.

o       "state" is the value of the rcu_sync structure's ->gp_state field.

o       "CB?" is "yes" if there is an RCU callback pending and "no" otherwise.



   | count | state     | CB? | next state
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 A |   0   | GP_IDLE   | no  | rcu_sync_enter() -> B (wait)
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> H
   |       |           |     | callback -> cannot happen
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 B |   1+  | GP_ENTER  | yes | rcu_sync_enter() -> B (wait)
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> C (ends _enter() wait)
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 C |   1+  | GP_PASSED | no  | rcu_sync_enter() -> C
   |       |           |     | last rcu_sync_exit() -> E
   |       |           |     | non-last rcu_sync_exit() -> C
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> cannot happen
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 D |   0   | GP_REPLAY | yes | rcu_sync_enter() -> F
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> I (wait ???)
   |       |           |     | callback -> E
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 E |   0   | GP_EXIT   | yes | rcu_sync_enter() -> G
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> I (wait)
   |       |           |     | callback -> A
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 F |   1+  | GP_REPLAY | yes | rcu_sync_enter() -> F
   |       |           |     | last rcu_sync_exit() -> D
   |       |           |     | non-last rcu_sync_exit() -> F
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> C
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 G |   1+  | GP_EXIT   | yes | rcu_sync_enter() -> G
   |       |           |     | last rcu_sync_exit() -> D
   |       |           |     | non-last rcu_sync_exit() -> F
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> C
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 H |   0   | GP_IDLE   | no  | rcu_sync_enter() -> illegal
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> cannot happen
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 I |   0   | GP_EXIT   | yes | rcu_sync_enter() -> illegal
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> H (ends dtor() wait)


Assumptions:

o	Initial state is A.

o	Final state is H.

o	There will never be enough unpaired rcu_sync_enter() calls to
	overflow ->gp_count.

o	All calls to rcu_sync_exit() must pair with a preceding call
	to rcu_sync_enter() by that same thread.

o	It is illegal to invoke rcu_sync_dtor() until after the caller
	has ensured that there will be no future calls to either
	rcu_sync_enter() or rcu_sync_exit().

o	It is illegal to invoke rcu_sync_dtor() while there are any
	unpaired calls to rcu_sync_enter().

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-19 20:50                     ` Paul E. McKenney
@ 2016-07-20 15:13                       ` Oleg Nesterov
  2016-07-20 20:58                         ` Paul E. McKenney
  2016-07-20 17:16                       ` Oleg Nesterov
  1 sibling, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-20 15:13 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On 07/19, Paul E. McKenney wrote:
>
> On Sat, Jul 16, 2016 at 07:10:07PM +0200, Oleg Nesterov wrote:
>
> > And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> > not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
> > state (except obviously they should be balanced), and they do not block.
>
> Hmmm...  I am assuming that the only threads allowed to invoke
> rcu_sync_exit() are those that have executed an unpaired rcu_sync_enter().
> With that assumption, rcu_sync_enter() is illegal if GP_IDLE or GP_ENTER,
> because the rcu_sync_enter() calls won't return until the state reaches
> GP_PASSED.

Not sure I understand... Obviously rcu_sync_exit() can only be used if
it pairs with the preceding __rcu_sync_enter() or rcu_sync_enter(); if
nothing else, rsp->gp_count-- must not underflow.

rcu_sync_enter() or __rcu_sync_enter() is legal in any state, the latter
won't block.

> If you feel strongly about allowing rcu_sync_exit() in GP_ENTER state,
> could you please tell me your use case?  Or am I confused?

I'll write another email, let me reply to the code review first. And
thanks for your review!

> > The only blocking call is __rcu_sync_wait() which actually waits for GP.
> > Obviously should only be called if gp_count != 0, iow after __rcu_sync_enter.
>
> Well, rcu_sync_dtor() as well.  But to your point, it is blocking in
> rcu_barrier(), not in wait_event().

Yes, yes, sure. I meant the only blocking call in enter/exit paths.

> > #define	rss_lock	gp_wait.lock
>
> Interesting...  Should there instead be a wait_queue_lock() and
> wait_queue_unlock()?  If someone changes how wait-queue locking work,
> this one might be a bit tricky.

Well, the current version does the same... I do not mind to add
wait_queue_lock(), but personally I do not think this would be better.
Either way the code should not use gp_wait.lock directly, to make it
clear that we abuse the spinlock we already have in rcu_sync->gp_wait.

To me

	spin_lock_irq(&rsp->rss_lock);
	spin_unlock_irq(&rsp->rss_lock);

looks a bit better than

	wait_queue_lock();
	wait_queue_unlock();

because we need to protect the state, not the wait-queue. And if
someone changes how wait-queue locking work, we can just actually
add "spinlock_t rss_lock" into rcu_sync and remove that "#define".

> > static void rcu_sync_call(struct rcu_sync *rsp)
> > {
> > 	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
> > 	// if rcu_blocking_is_gp() == T, but it has might_sleep().
>
> Careful!  This optimization works only for RCU-sched and RCU-bh.
> With normal RCU, you can be tripped up by tasks preempted within RCU
> read-side critical sections should CONFIG_PREEMPT=y.

Yes, thanks, I understand ;) another reason why I do not want to add
this optimization into the initial version.

> > static void rcu_sync_func(struct rcu_head *rcu)
> > {
> > 	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
> > 	unsigned long flags;
> >
> > 	BUG_ON(rsp->gp_state == GP_IDLE);
>
> If you really want rcu_sync_exit() to be invoked before the corresponding
> rcu_sync_enter() returns, and if you want that to set the state to
> GP_IDLE, then the above BUG_ON() can fire.  This is why I was assuming
> that rcu_sync_enter() must return before the corresponding rcu_sync_exit()
> can start.

Again, can't understand. Let me repeat, of course rcu_sync_exit() can only
be called after __rcu_sync_enter() (or rcu_sync_enter() which does
__rcu_sync_enter + wait), they obviously should be balanced.

But, it should be fine to call rcu_sync_exit() right after __rcu_sync_enter()
returns, without waiting for GP_PASSED, iow without __rcu_sync_wait().

But probably I misunderstood you... Either way, rcu_sync_func() can never
hit gp_state == GP_IDLE. Because this state can only be set by rcu_sync_func(),
nobody else can set this state. Ignoring the initial state of course.

And every caller of rcu_sync_call() (which does call_rcu(rcu_sync_func))
updates the state first:

	rsp->gp_state = GP_ENTER;	// __rcu_sync_enter()
	rcu_sync_call(rsp);

	rsp->gp_state = GP_EXIT;	// rcu_sync_exit() or REPLAY
	rcu_sync_call(rsp);

so I simply can't understand your concern.

> > 	BUG_ON(rsp->gp_state == GP_PASSED);
>
> READ_ONCE() would feel better in both of the above BUG_ON()s, given that
> we are not yet holding the lock.

The lockless checks are fine, both states can only be set by the callback.
As for READ_ONCE(), please see below.

> > 	spin_lock_irqsave(&rsp->rss_lock, flags);
> > 	if (rsp->gp_count) {
> > 		/*
> > 		 * We're at least a GP after the first __rcu_sync_enter().
> > 		 */
>
> Don't we need a wakeup here?

Heh ;) let me show you what I have in kernel/rcu/sync.c~

	if (rsp->gp_count) {
		/*
		 * COMMENT.
		 */
		rsp->gp_state = GP_PASSED;
		wake_up_locked(&rsp->gp_wait);
	} else if (rsp->gp_state == GP_REPLAY) {

apparently I removed this wake_up_locked() by accident when I edited
the "COMMENT" placeholder ;)

Thanks!

> > bool __rcu_sync_enter(struct rcu_sync *rsp)
> > {
> > 	int gp_count, gp_state;
> >
> > 	spin_lock_irq(&rsp->rss_lock);
> > 	gp_count = rsp->gp_count++;
> > 	gp_state = rsp->gp_state;
> > 	if (gp_state == GP_IDLE) {
> > 		rsp->gp_state = GP_ENTER;
> > 		rcu_sync_call(rsp);
> > 	}
> > 	spin_unlock_irq(&rsp->rss_lock);
> >
> > 	BUG_ON(gp_count != 0 && gp_state == GP_IDLE);
> > 	BUG_ON(gp_count == 0 && gp_state == GP_PASSED);
>
> Isn't gp_count==0 illegal here regardless of gp_state value?

No. gp_count is the old value of rsp->gp_count, before it was
incremented by us.

> > 	return gp_state < GP_PASSED;
>
> And the above statement is another reason why I believe that it
> should be illegal to invoke rcu_sync_exit() until after the matching
> rcu_sync_enter() has returned.  If we were preempted just before the
> above "return" statement, we might be in GP_IDLE state upon return,

How? Note the GP_IDLE -> GP_ENTER transition above; it can't be GP_IDLE.
And since we incremented rsp->gp_count, GP_IDLE is not possible until the
rcu_sync_exit() which pairs with this "enter" decrements the counter
and fires the RCU callback, which will set GP_IDLE after a GP.

> > void __rcu_sync_wait(struct rcu_sync *rsp)
> > {
> > 	BUG_ON(rsp->gp_state == GP_IDLE);
> > 	BUG_ON(rsp->gp_count == 0);
> >
> > 	wait_event(rsp->gp_wait, rsp->gp_state >= GP_PASSED);
>
> I would feel better if all the references to ->gp_state and ->gp_count
> outside of the lock be READ_ONCE().  Compilers can be tricky beasts.

Oh. I won't argue too much, but I do not agree.

Because I started to hate these _ONCE() helpers long ago. Why? Because
(imo) we have too many XXX_ONCE() added "just in case, because the
compiler can be buggy".

Now, when I look at some particular READ_ONCE() I can almost never
understand the reason: do we really need it for correctness? Or is this
another "it does not hurt" case?

So why do we need READ_ONCE() here? How does this wait_event() differ
from any other wait_event() which does a plain LOAD? Should we "fix"
them all?

> rcu_sync_enter() checks to see if the rcu_sync structure is in writer
> mode, and, if not, initiates a transition to writer mode and waits for
> that transition to complete.  Either way, a writer reference is acquired.
>
> > void rcu_sync_enter(struct rcu_sync *rsp)
> > {
> > 	if (__rcu_sync_enter(rsp))
> > 		__rcu_sync_wait(rsp);
> > }

Yes. And of course it could call __rcu_sync_wait() unconditionally:

	__rcu_sync_enter(rsp);
	__rcu_sync_wait(rsp);

This "if" is more the documentation than optimization.

> Here is what I believe rcu_sync_exit() does:
>
> rcu_sync_exit() removes an updater reference, and if there are no more,
> starts the transition to reader mode.  Note that this must handle
> transitions back to writer mode that occur before the transition to
> reader mode has fully completed.
>
> > void rcu_sync_exit(struct rcu_sync *rsp)
> > {
> > 	BUG_ON(rsp->gp_state == GP_IDLE);
>
> 	BUG_ON(READ_ONCE(rsp->gp_state) == GP_ENTER);
>
> If you do allow rcu_sync_exit() while GP_ENTER, then all the readers
> can do rcu_sync_exit() in that state, leaving the state at GP_PASSED
> instead of the desired GP_IDLE.

Can't understand this either...

Firstly, readers do not do rcu_sync_enter/exit. They only use
rcu_sync_is_idle() which checks gp_state == GP_IDLE.

As for enter() path, we only need to ensure that after __rcu_sync_wait()
returns, all CPU's must see gp_state != GP_IDLE. Yes, yes, this is not
really true, we also need the memory barriers implied by RCU, lets ignore
this to simplify the discussion.

Now lets suppose that rcu_sync_exit() is called while GP_ENTER. This is
only possible if __rcu_sync_wait() was not called by us. IOW, this thread
does

	__rcu_sync_enter();
	rcu_sync_exit();

by any reason.

Now. If gp_count is not zero after we decrement it, rcu_sync_exit() does
nothing.

If it is zero, we could set GP_IDLE right now. We do not care about the
readers in this case, exactly because __rcu_sync_wait() was not called
and thus we can't even know if other CPU's had any chance to observe the
gp_state != GP_IDLE state set by __rcu_sync_enter().

But. We can't actually set GP_IDLE, because GP_ENTER means that the RCU
callback is already pending and this can break the next enter(). IOW, this
would break the "GP_IDLE is owned by __rcu_sync_enter" contract.

And we do not need to do this! We rely on the already pending rcu_sync_func()
which will set GP_IDLE _if_ the counter is still zero.

Now suppose that another writer (__rcu_sync_enter) comes and increments
gp_count right before rcu_sync_func() is called by RCU. In this case
rcu_sync_func() will notice rsp->gp_count != 0 and correctly set GP_PASSED.

Note that in this case that new writer won't wait for a full GP, quite
possibly __rcu_sync_wait() won't even block. And this is correct, we know
that we have at least one full GP after the previous GP_IDLE -> GP_ENTER
transition.


> And here is what I believe rcu_sync_dtor() does:
>
> rcu_sync_dtor() cleans up at end, waiting for a full transition back
> into reader mode.  Anything invoking rcu_sync_dtor() must have ensured
> that there will be no more calls to rcu_sync_enter() and rcu_sync_exit().

Sure, we are going to destroy/free this object.

> > void rcu_sync_dtor(struct rcu_sync *rsp)
> > {
> > 	int gp_state;
> >
> > 	BUG_ON(rsp->gp_count);
> > 	BUG_ON(rsp->gp_state == GP_PASSED);
> >
> > 	spin_lock_irq(&rsp->rss_lock);
> > 	if (rsp->gp_state == GP_REPLAY)
> > 		rsp->gp_state = GP_EXIT;
>
> OK, this ensures that the .wait() below will wait for the callback, but
> it might result in some RCU read-side critical sections still being in
> flight after rcu_sync_dtor() completes.

Hmm. Obviously, the caller should prevent this somehow or it is simply
buggy. Or I misunderstood.

> In other words, any transition
> started by an rcu_sync_exit() will normally wait until all pre-existing
> RCU readers complete -- unless that rcu_sync_exit() set the state to
> GP_REPLAY and was quickly followed by rcu_sync_dtor().

Again, we are going to (say) free this memory. The caller must ensure
that rcu_sync_is_idle(rsp) or anything else which can touch this object
is not possible.

The only reason for rcu_sync_dtor() is that we can't free/destroy this
memory until we ensure that the pending rcu_sync_func() finishes. And
of course we need to ensure it won't re-arm itself, that is why we
do GP_REPLAY -> GP_EXIT transition.

And in any case, this patch doesn't change rcu_sync_dtor() logic, it
is the same even if ->cb_state goes away.

> We need to add the first wakeup before we can add another one.  ;-)

Yes ;) see above.

> But I don't see why we need a second wakeup, given that rcu_barrier()
> wakes itself up.
>
> > 	// use wait_event + spin_lock_wait, remove gp_ops->wait().
> > 	if (gp_state != GP_IDLE) {
> > 		gp_ops[rsp->gp_type].wait();
> > 		BUG_ON(rsp->gp_state != GP_IDLE);
> > 	}
> > }

Because we can remove .wait() op and simplify the code a bit more. This
also can be more efficient, although this doesn't really matter. The main
reason is that this would be more clear: we only need to sync with one
particular callback, we do not need to wait for all outstanding RCU
callbacks to complete. Anyway, this is TODO and needs another patch, we
will discuss this later.

> And please see below for the state table and assumptions.

I'll try to read it carefully later.

Thanks Paul!

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-19 20:50                     ` Paul E. McKenney
  2016-07-20 15:13                       ` Oleg Nesterov
@ 2016-07-20 17:16                       ` Oleg Nesterov
  2016-07-20 21:31                         ` Paul E. McKenney
  1 sibling, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-20 17:16 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

Paul, I had to switch to internal bugzillas after the first email, and now
I feel I can't read... I'll try to answer one question right now, tomorrow
I'll reread your email, probably I need to answer something else...

On 07/19, Paul E. McKenney wrote:
>
> On Sat, Jul 16, 2016 at 07:10:07PM +0200, Oleg Nesterov wrote:
>
> > And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> > not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
> > state (except obviously they should be balanced), and they do not block.
...
> If you feel strongly about allowing rcu_sync_exit() in GP_ENTER state,
> could you please tell me your use case?  Or am I confused?

See below,

> And I think __rcu_sync_enter() can have more users. Let's look at
> freeze_super(). It calls percpu_down_write() 3 times, and waits for 3 GP's
> sequentially.
>
> > Now we can add 3 __rcu_sync_enter's at the start and 3 rcu_sync_exit's at
> > the end (actually we can do better, just to simplify). And again, note
> > that rcu_sync_exit() will work correctly even if we (say) return -EBUSY,
> > so rcu_sync_wait and/or percpu_down_write() was not called in between,
> > and in this case we won't block waiting for GP.
> >
>
> I am not going to claim to understand freeze_super(), but it does seem
> to have a fair amount of waiting.
>
> But yes, you could put rcu_sync_enter()
                         ^^^^^^^^^^^^^^
__rcu_sync_enter, see below,

> and rcu_sync_exit() before and
> after a series of write-side enter/exit pairs in order to force things
> to stay in writer mode, if that is what you are suggesting.

No, no, this is not what I am trying to suggest.

The problem is that freeze_super() takes 3 semaphores for writing in a
row, which means it needs to wait for 3 GP's sequentially, and it does
this with sb->s_umount held. This is just ugly.

OK, lets suppose it simply does

	freeze_super(sb)
	{
		down_write(&sb->s_umount);
		if (NEED_TO_FREEZE) {
			percpu_down_write(SEM1);
			percpu_down_write(SEM2);
			percpu_down_write(SEM3);
		}
		up_write(&sb->s_umount);
	}

and every percpu_down_write() waits for GP.

Now, suppose we add the additional enter/exit's:

	freeze_super(sb)
	{
		// this doesn't block
		__rcu_sync_enter(SEM3);
		__rcu_sync_enter(SEM2);
		__rcu_sync_enter(SEM1);

		down_write(&sb->s_umount);
		if (NEED_TO_FREEZE) {
			percpu_down_write(SEM1);
			percpu_down_write(SEM2);
			percpu_down_write(SEM3);
		}
		up_write(&sb->s_umount);

		rcu_sync_exit(SEM1);
		rcu_sync_exit(SEM2);
		rcu_sync_exit(SEM3);
		
	}

Again, actually we can do better, just to simplify.

Now, the first percpu_down_write(SEM1) may or may not block waiting for
a GP, depending on how much time it spends in down_write().

But the 2nd and the 3rd percpu_down_write() most likely won't block, so
in the likely case freeze_super() will need a single GP pass.

And note that NEED_TO_FREEZE can be false, in this case rcu_sync_exit()
will be called in GP_ENTER state.


To some degree, this is like get_state_synchronize_rcu/cond_synchronize_rcu.
But obviously percpu_down_write() cannot use these helpers, and in this
particular case __rcu_sync_enter() is better because it forces the start
of a GP pass.
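
For comparison, the shape of that pattern, just a sketch (note that it
only snapshots the current GP state, it does not force a new GP to start):

	unsigned long gp_snap;

	gp_snap = get_state_synchronize_rcu();	/* record current GP state */

	/* ... other, possibly blocking, setup work ... */

	cond_synchronize_rcu(gp_snap);	/* waits only if no GP has elapsed since */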

-----------------------------------------------------------------------
As for cgroups, we want to switch cgroup_threadgroup_rwsem into the
slow mode, at least for now.

We could add the additional hooks/hacks into rcu/sync.c but why? We can
do this without any changes outside of cgroup.c right now, just add
rcu_sync_enter() into cgroup_init().

But we do not want to add a pointless synchronize_sched() at boot time;
__rcu_sync_enter() looks much better.
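
IOW, roughly a single line in cgroup_init(), illustrative only (the
".rss" member name of percpu_rw_semaphore is my assumption here):

	/* switch into the slow mode at boot, without waiting for a GP */
	__rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);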

------------------------------------------------------------------------
And even __cgroup_procs_write() could use __rcu_sync_enter(). But let's
ignore this for now.

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-20 15:13                       ` Oleg Nesterov
@ 2016-07-20 20:58                         ` Paul E. McKenney
  2016-07-21 17:34                           ` Oleg Nesterov
  0 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-20 20:58 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Wed, Jul 20, 2016 at 05:13:58PM +0200, Oleg Nesterov wrote:
> On 07/19, Paul E. McKenney wrote:
> >
> > On Sat, Jul 16, 2016 at 07:10:07PM +0200, Oleg Nesterov wrote:
> >
> > > And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> > > not it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
> > > state (except obviously they should be balanced), and they do not block.
> >
> > Hmmm...  I am assuming that the only threads allowed to invoke
> > rcu_sync_exit() are those that have executed an unpaired rcu_sync_enter().
> > With that assumption, rcu_sync_enter() is illegal if GP_IDLE or GP_ENTER,
> > because the rcu_sync_enter() calls won't return until the state reaches
> > GP_PASSED.
> 
> Not sure I understand... Obviously rcu_sync_exit() can only be used if
> it pairs with the preceding __rcu_sync_enter() or rcu_sync_enter(); if
> nothing else, rsp->gp_count-- must not underflow.

Agreed, just wondering what the no-wait use case is.

> rcu_sync_enter() or __rcu_sync_enter() is legal in any state, the latter
> won't block.

Actually, I had no idea that __rcu_sync_enter() was intended for anything
other than internal use.

Other than that, agreed, with the exception that it is illegal after
rcu_sync_dtor() has been called.

> > If you feel strongly about allowing rcu_sync_exit() in GP_ENTER state,
> > could you please tell me your use case?  Or am I confused?
> 
> I'll write another email, let me reply to the code review first. And
> thanks for your review!

Fair enough, hope it is helpful.

> > > The only blocking call is __rcu_sync_wait() which actually waits for GP.
> > > Obviously should only be called if gp_count != 0, iow after __rcu_sync_enter.
> >
> > Well, rcu_sync_dtor() as well.  But to your point, it is blocking in
> > rcu_barrier(), not in wait_event().
> 
> Yes, yes, sure. I meant the only blocking call in enter/exit paths.

Pedantic of me, isn't it?  ;-)

> > > #define	rss_lock	gp_wait.lock
> >
> > Interesting...  Should there instead be a wait_queue_lock() and
> > wait_queue_unlock()?  If someone changes how wait-queue locking work,
> > this one might be a bit tricky.
> 
> Well, the current version does the same... I do not mind to add
> wait_queue_lock(), but personally I do not think this would be better.
> Either way the code should not use gp_wait.lock directly, to make it
> clear that we abuse the spinlock we already have in rcu_sync->gp_wait.
> 
> To me
> 
> 	spin_lock_irq(&rsp->rss_lock);
> 	spin_unlock_irq(&rsp->rss_lock);
> 
> looks a bit better than
> 
> 	wait_queue_lock();
> 	wait_queue_unlock();
> 
> because we need to protect the state, not the wait-queue. And if
> someone changes how wait-queue locking work, we can just actually
> add "spinlock_t rss_lock" into rcu_sync and remove that "#define".

I don't feel all that strongly about it, but it did look strange.

In any case, please don't reach inside RCU in this way without asking
me first!  ;-)

> > > static void rcu_sync_call(struct rcu_sync *rsp)
> > > {
> > > 	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
> > > 	// if rcu_blocking_is_gp() == T, but it has might_sleep().
> >
> > Careful!  This optimization works only for RCU-sched and RCU-bh.
> > With normal RCU, you can be tripped up by tasks preempted within RCU
> > read-side critical sections should CONFIG_PREEMPT=y.
> 
> Yes, thanks, I understand ;) another reason why I do not want to add
> this optimization into the initial version.

So I should take this as a request to export rcu_blocking_is_gp()?

I would probably need to change the name.  Or comment it more heavily.
Or hand out coccinelle scripts looking for this pattern:

	if (!rcu_blocking_is_gp())
		synchronize_rcu();

Or maybe all of the above.

> > > static void rcu_sync_func(struct rcu_head *rcu)
> > > {
> > > 	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
> > > 	unsigned long flags;
> > >
> > > 	BUG_ON(rsp->gp_state == GP_IDLE);
> >
> > If you really want rcu_sync_exit() to be invoked before the corresponding
> > rcu_sync_enter() returns, and if you want that to set the state to
> > GP_IDLE, then the above BUG_ON() can fire.  This is why I was assuming
> > that rcu_sync_enter() must return before the corresponding rcu_sync_exit()
> > can start.
> 
> Again, can't understand. Let me repeat, of course rcu_sync_exit() can only
> be called after __rcu_sync_enter() (or rcu_sync_enter() which does
> __rcu_sync_enter + wait), they obviously should be balanced.
> 
> But, it should be fine to call rcu_sync_exit() right after __rcu_sync_enter()
> returns, without waiting for GP_PASSED, iow without __rcu_sync_wait().
> 
> But probably I misunderstood you... Either way, rcu_sync_func() can never
> hit gp_state == GP_IDLE. Because this state can only be set by rcu_sync_func(),
> nobody else can set this state. Ignoring the initial state of course.
> 
> And every caller of rcu_sync_call() (which does call_rcu(rcu_sync_func))
> updates the state first:
> 
> 	rsp->gp_state = GP_ENTER;	// __rcu_sync_enter()
> 	rcu_sync_call(rsp);
> 
> 	rsp->gp_state = GP_EXIT;	// rcu_sync_exit() or REPLAY
> 	rcu_sync_call(rsp);
> 
> so I simply can't understand your concern.

You are right, the state is always set to something other than GP_IDLE
before the callback is registered.  Never mind!

> > > 	BUG_ON(rsp->gp_state == GP_PASSED);
> >
> > READ_ONCE() would feel better in both of the above BUG_ON()s, given that
> > we are not yet holding the lock.
> 
> The lockless checks are fine, both states can only be set by the callback.
> As for READ_ONCE(), please see below.
> 
> > > 	spin_lock_irqsave(&rsp->rss_lock, flags);
> > > 	if (rsp->gp_count) {
> > > 		/*
> > > 		 * We're at least a GP after the first __rcu_sync_enter().
> > > 		 */
> >
> > Don't we need a wakeup here?
> 
> Heh ;) let me show you what I have in kernel/rcu/sync.c~
> 
> 	if (rsp->gp_count) {
> 		/*
> 		 * COMMENT.
> 		 */
> 		rsp->gp_state = GP_PASSED;
> 		wake_up_locked(&rsp->gp_wait);
> 	} else if (rsp->gp_state == GP_REPLAY) {
> 
> apparently I removed this wake_up_locked() by accident when I edited
> the "COMMENT" placeholder ;)

Sounds like something that I might do!  ;-)

> Thanks!

No problem -- you have done the same for me many times!

> > > bool __rcu_sync_enter(struct rcu_sync *rsp)
> > > {
> > > 	int gp_count, gp_state;
> > >
> > > 	spin_lock_irq(&rsp->rss_lock);
> > > 	gp_count = rsp->gp_count++;
> > > 	gp_state = rsp->gp_state;
> > > 	if (gp_state == GP_IDLE) {
> > > 		rsp->gp_state = GP_ENTER;
> > > 		rcu_sync_call(rsp);
> > > 	}
> > > 	spin_unlock_irq(&rsp->rss_lock);
> > >
> > > 	BUG_ON(gp_count != 0 && gp_state == GP_IDLE);
> > > 	BUG_ON(gp_count == 0 && gp_state == GP_PASSED);
> >
> > Isn't gp_count==0 illegal here regardless of gp_state value?
> 
> No. gp_count is the old value of rsp->gp_count, before it was
> incremented by us.

Got it -- blindness on my part, I guess.

> > > 	return gp_state < GP_PASSED;
> >
> > And the above statement is another reason why I believe that it
> > should be illegal to invoke rcu_sync_exit() until after the matching
> > rcu_sync_enter() has returned.  If we were preempted just before the
> > above "return" statement, we might be in GP_IDLE state upon return,
> 
> How? Note the GP_IDLE -> GP_ENTER transition above; it can't be GP_IDLE.
> And since we incremented rsp->gp_count, GP_IDLE is not possible until the
> rcu_sync_exit() which pairs with this "enter" decrements the counter
> and fires the RCU callback, which will set GP_IDLE after a GP.

Right -- I was again confusing gp_state with rsp->gp_state.

> > > void __rcu_sync_wait(struct rcu_sync *rsp)
> > > {
> > > 	BUG_ON(rsp->gp_state == GP_IDLE);
> > > 	BUG_ON(rsp->gp_count == 0);
> > >
> > > 	wait_event(rsp->gp_wait, rsp->gp_state >= GP_PASSED);
> >
> > I would feel better if all the references to ->gp_state and ->gp_count
> > outside of the lock be READ_ONCE().  Compilers can be tricky beasts.
> 
> Oh. I won't argue too much, but I do not agree.
> 
> Because I started to hate these _ONCE() helpers long ago. Why? Because
> (imo) we have too many XXX_ONCE() added "just in case, because the
> compiler can be buggy".
> 
> Now, when I look at some particular READ_ONCE() I can almost never
> understand the reason: do we really need it for correctness? Or is this
> another "it does not hurt" case?
> 
> So why do we need READ_ONCE() here? How does this wait_event() differ
> from any other wait_event() which does a plain LOAD? Should we "fix"
> them all?

I think of them as "this accesses a shared variable".  I have been
learning more about what the compiler is permitted to do to normal
variables, and I am paranoid.  The default compiler assumption that no
other thread is looking at or modifying the variable is a very powerful
and profound assumption, and it points to a rather stunning array of
"creative" optimizations.

It has been suggested that gcc implement a -std=kernel, which might be
a decent alternative.  However, we would need to specify what we want
gcc to do (and not do) in that case.

This is not an easy thing, nor will it go away.  Though I suppose that
doesn't mean we cannot ignore it.  For awhile, anyway.  ;-)
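
To make that concrete, a toy example (nothing to do with this patch,
and the "done" flag is made up):

	/*
	 * With a plain load the compiler may hoist the read out of the
	 * loop, so a store to "done" by another thread may never be seen:
	 */
	while (!done)
		cpu_relax();

	/* READ_ONCE() forces a fresh load on every iteration: */
	while (!READ_ONCE(done))
		cpu_relax();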

> > rcu_sync_enter() checks to see if the rcu_sync structure is in writer
> > mode, and, if not, initiates a transition to writer mode and waits for
> > that transition to complete.  Either way, a writer reference is acquired.
> >
> > > void rcu_sync_enter(struct rcu_sync *rsp)
> > > {
> > > 	if (__rcu_sync_enter(rsp))
> > > 		__rcu_sync_wait(rsp);
> > > }
> 
> Yes. And of course it could call __rcu_sync_wait() unconditionally:
> 
> 	__rcu_sync_enter(rsp);
> 	__rcu_sync_wait(rsp);
> 
> This "if" is more the documentation than optimization.

Understood.

> > Here is what I believe rcu_sync_exit() does:
> >
> > rcu_sync_exit() removes an updater reference, and if there are no more,
> > starts the transition to reader mode.  Note that this must handle
> > transitions back to writer mode that occur before the transition to
> > reader mode has fully completed.
> >
> > > void rcu_sync_exit(struct rcu_sync *rsp)
> > > {
> > > 	BUG_ON(rsp->gp_state == GP_IDLE);
> >
> > 	BUG_ON(READ_ONCE(rsp->gp_state) == GP_ENTER);
> >
> > If you do allow rcu_sync_exit() while GP_ENTER, then all the readers

s/readers/writers/, sorry!

> > can do rcu_sync_exit() in that state, leaving the state at GP_PASSED
> > instead of the desired GP_IDLE.
> 
> Can't understand this either...
> 
> Firstly, readers do not do rcu_sync_enter/exit. They only use
> rcu_sync_is_idle() which checks gp_state == GP_IDLE.
> 
> As for enter() path, we only need to ensure that after __rcu_sync_wait()
> returns, all CPU's must see gp_state != GP_IDLE. Yes, yes, this is not
> really true, we also need the memory barriers implied by RCU, lets ignore
> this to simplify the discussion.
> 
> Now lets suppose that rcu_sync_exit() is called while GP_ENTER. This is
> only possible if __rcu_sync_wait() was not called by us. IOW, this thread
> does
> 
> 	__rcu_sync_enter();
> 	rcu_sync_exit();
> 
> by any reason.
> 
> Now. If gp_count is not zero after we decrement it, rcu_sync_exit() does
> nothing.
> 
> If it is zero, we could set GP_IDLE right now. We do not care about the
> readers in this case, exactly because __rcu_sync_wait() was not called
> and thus we can't even know if other CPU's had any chance to observe the
> gp_state != GP_IDLE state set by __rcu_sync_enter().
> 
> But. We can't actually set GP_IDLE, because GP_ENTER means that the RCU
> callback is already pending and this can break the next enter(). IOW, this
> would break the "GP_IDLE is owned by __rcu_sync_enter" contract.
> 
> And we do not need to do this! We rely on the already pending rcu_sync_func()
> which will set GP_IDLE _if_ the counter is still zero.
> 
> Now suppose that another writer (__rcu_sync_enter) comes and increments
> gp_count right before rcu_sync_func() is called by RCU. In this case
> rcu_sync_func() will notice rsp->gp_count != 0 and correctly set GP_PASSED.
> 
> Note that in this case that new writer won't wait for a full GP, quite
> possibly __rcu_sync_wait() won't even block. And this is correct, we know
> that we have at least one full GP after the previous GP_IDLE -> GP_ENTER
> transition.

Let me try laying out the sequence of events that I was concerned about,
which will hopefully either prove my point or identify my error:

o	The rcu_sync structure is in state GP_IDLE with count 0 and
	no CB posted.  (State A in my table.)

o	Task A invokes __rcu_sync_enter(), which puts the state at
	GP_ENTER with count 1 and a CB posted.

o	Task B also invokes __rcu_sync_enter(), which increments count
	to 2, but leaves the state otherwise unchanged.

o	Both Task A and Task B invoke rcu_sync_exit(), which decrements
	count back down to zero, but otherwise leaves the state unchanged.
	So we are at GP_ENTER with count 0.

Ah, but the CB is still pending, and when it is invoked, as you say above,
it will see that count==0 and therefore set the state to GP_IDLE.

OK, you are quite right.

> > And here is what I believe rcu_sync_dtor() does:
> >
> > rcu_sync_dtor() cleans up at end, waiting for a full transition back
> > into reader mode.  Anything invoking rcu_sync_dtor() must have ensured
> > that there will be no more calls to rcu_sync_enter() and rcu_sync_exit().
> 
> Sure, we are going to destroy/free this object.
> 
> > > void rcu_sync_dtor(struct rcu_sync *rsp)
> > > {
> > > 	int gp_state;
> > >
> > > 	BUG_ON(rsp->gp_count);
> > > 	BUG_ON(rsp->gp_state == GP_PASSED);
> > >
> > > 	spin_lock_irq(&rsp->rss_lock);
> > > 	if (rsp->gp_state == GP_REPLAY)
> > > 		rsp->gp_state = GP_EXIT;
> >
> > OK, this ensures that the .wait() below will wait for the callback, but
> > it might result in some RCU read-side critical sections still being in
> > flight after rcu_sync_dtor() completes.
> 
> Hmm. Obviously, the caller should prevent this somehow or it is simply
> buggy. Or I misunderstood.

Hard to say without knowing what the permitted use cases are...

Me, I would make rcu_sync_dtor() wait the extra grace period in this case.
It should be a low-probability race, and it reduces the _dtor-time
state space.

What it looks like you are saying is that the caller must not only ensure
that there will never again be a __rcu_sync_enter(), rcu_sync_enter(),
or rcu_sync_exit() (or, I suppose, rcu_sync_dtor()) for this rcu_sync
structure, but must also ensure that any relevant RCU read-side critical
sections have completed.

I guess that either way is OK, but whatever the rules are, they do need
to be clearly communicated.  Of course, this does indicate a bug in
rcu_sync_dtor()'s current comment header.  ;-)

> > In other words, any transition
> > started by an rcu_sync_exit() will normally wait until all pre-existing
> > RCU readers complete -- unless that rcu_sync_exit() set the state to
> > GP_REPLAY and was quickly followed by rcu_sync_dtor().
> 
> Again, we are going to (say) free this memory. The caller must ensure
> that rcu_sync_is_idle(rsp) or anything else which can touch this object
> is not possible.
> 
> The only reason for rcu_sync_dtor() is that we can't free/destroy this
> memory until we ensure that the pending rcu_sync_func() finishes. And
> of course we need to ensure it won't re-arm itself, that is why we
> do GP_REPLAY -> GP_EXIT transition.
> 
> And in any case, this patch doesn't change rcu_sync_dtor() logic, it
> is the same even if ->cb_state goes away.

Agreed, this is something that I should have complained about eleven
months ago.  So I am slow!  ;-)

> > We need to add the first wakeup before we can add another one.  ;-)
> 
> Yes ;) see above.
> 
> > But I don't see why we need a second wakeup, given that rcu_barrier()
> > wakes itself up.
> >
> > > 	// use wait_event + spin_lock_wait, remove gp_ops->wait().
> > > 	if (gp_state != GP_IDLE) {
> > > 		gp_ops[rsp->gp_type].wait();
> > > 		BUG_ON(rsp->gp_state != GP_IDLE);
> > > 	}
> > > }
> 
> Because we can remove the .wait() op and simplify the code a bit more. This
> can also be more efficient, although this doesn't really matter. The main
> reason is that this would be clearer: we only need to sync with one
> particular callback, we do not need to wait for all outstanding RCU
> callbacks to complete. Anyway, this is a TODO and needs another patch; we
> will discuss this later.

Yes, you could have a flag that the callback manipulates and a
wait_event()/wake_up() or similar, which would be quite a bit lighter
weight than rcu_barrier().  So it does sound worthwhile.
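
Something like this minimal sketch, say, reusing the gp_wait queue that
rcu_sync_enter() already sleeps on, and assuming rcu_sync_func() does the
corresponding wake_up() after its final transition to GP_IDLE (sketch
only, not a patch):

	void rcu_sync_dtor(struct rcu_sync *rsp)
	{
		BUG_ON(rsp->gp_count);
		BUG_ON(rsp->gp_state == GP_PASSED);

		spin_lock_irq(&rsp->rss_lock);
		if (rsp->gp_state == GP_REPLAY)
			rsp->gp_state = GP_EXIT;
		spin_unlock_irq(&rsp->rss_lock);

		/* Wait only for our callback, not for all pending RCU callbacks. */
		wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) == GP_IDLE);
	}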

The only concern would be scalability, but we should run into that
problem before solving it, and the callback will be the least of
our worries.

							Thanx, Paul

> > And please see below for the state table and assumptions.
> 
> I'll try to read it carefully later.
> 
> Thanks Paul!
> 
> Oleg.
> 

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-20 17:16                       ` Oleg Nesterov
@ 2016-07-20 21:31                         ` Paul E. McKenney
  2016-07-21 17:34                           ` Oleg Nesterov
  0 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-20 21:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Wed, Jul 20, 2016 at 07:16:03PM +0200, Oleg Nesterov wrote:
> Paul, I had to switch to internal bugzillas after the first email, and now
> I feel I can't read... I'll try to answer one question right now, tomorrow
> I'll reread your email, probably I need to answer something else...

I know that feeling of distraction!

> On 07/19, Paul E. McKenney wrote:
> >
> > On Sat, Jul 16, 2016 at 07:10:07PM +0200, Oleg Nesterov wrote:
> >
> > > And, there is another possible transition, GP_ENTER -> GP_IDLE, because
> > > now it is possible to call __rcu_sync_enter() and rcu_sync_exit() in any
> > > state (except obviously they should be balanced), and they do not block.
> ...
> > If you feel strongly about allowing rcu_sync_exit() in GP_ENTER state,
> > could you please tell me your use case?  Or am I confused?
> 
> See below,
> 
> > And I think __rcu_sync_enter() can have more users. Let's look at
> > freeze_super(). It calls percpu_down_write() 3 times, and waits for 3 GPs
> > sequentially.
> >
> > > Now we can add 3 __rcu_sync_enter's at the start and 3 rcu_sync_exit's at
> > > the end (actually we can do better, just to simplify). And again, note
> > > that rcu_sync_exit() will work correctly even if we (say) return -EBUSY,
> > > so rcu_sync_wait and/or percpu_down_write() was not called in between,
> > > and in this case we won't block waiting for GP.
> > >
> >
> > I am not going to claim to understand freeze_super(), but it does seem
> > to have a fair amount of waiting.
> >
> > But yes, you could put rcu_sync_enter()
>                          ^^^^^^^^^^^^^^
> __rcu_sync_enter, see below,
> 
> > and rcu_sync_exit() before and
> > after a series of write-side enter/exit pairs in order to force things
> > to stay in writer mode, if that is what you are suggesting.
> 
> No, no, this is not what I am trying to suggest.
> 
> The problem is that freeze_super() takes 3 semaphores for writing in a row,
> which means it needs to wait for 3 GPs sequentially, and it does this
> with sb->s_umount held. This is just ugly.

Slow, too.  ;-)

> OK, lets suppose it simply does
> 
> 	freeze_super(sb)
> 	{
> 		down_write(&sb->s_umount);
> 		if (NEED_TO_FREEZE) {
> 			percpu_down_write(SEM1);
> 			percpu_down_write(SEM2);
> 			percpu_down_write(SEM3);
> 		}
> 		up_write(&sb->s_umount);
> 	}
> 
> and every percpu_down_write() waits for GP.
> 
> Now, suppose we add the additional enter/exit's:
> 
> 	freeze_super(sb)
> 	{
> 		// this doesn't block
> 		__rcu_sync_enter(SEM3);
> 		__rcu_sync_enter(SEM2);
> 		__rcu_sync_enter(SEM1);
> 
> 		down_write(&sb->s_umount);
> 		if (NEED_TO_FREEZE) {
> 			percpu_down_write(SEM1);

The above waits for the grace period initiated by __rcu_sync_enter(),
correct?  Presumably "yes", because it will invoke rcu_sync_enter(), which
will see the state as GP_ENTER, and will thus wait.  Or am I confused?

> 			percpu_down_write(SEM2);
> 			percpu_down_write(SEM3);
> 		}
> 		up_write(&sb->s_umount);
> 
> 		rcu_sync_exit(SEM1);

But your point is that if !NEED_TO_FREEZE, we will get here without
waiting for a grace period.

But why aren't the __rcu_sync_enter() and rcu_sync_exit() calls inside
the "if" statement?

> 		rcu_sync_exit(SEM2);
> 		rcu_sync_exit(SEM3);

And the reason it is OK to invoke rcu_sync_exit() before doing
the percpu_up_write() calls is that those calls will do their own
rcu_sync_exit() calls, and the above calls really don't do anything
other than decrement the count.

> 		
> 	}
> 
> Again, actually we can do better, just to simplify.
> 
> Now, the first percpu_down_write(SEM1) may or may not block waiting for a GP,
> depending on how much time it spends in down_write().
> 
> But the 2nd and the 3rd percpu_down_write() most likely won't block, so
> in the likely case freeze_super() will need a single GP pass.
> 
> And note that NEED_TO_FREEZE can be false, in this case rcu_sync_exit()
> will be called in GP_ENTER state.

Understood, the above approach allows the three grace periods to
elapse concurrently, at least assuming RCU is busy at the time.
(If RCU is not busy, the first one gets its own grace period, and
the other two share the next grace period.)

But again why aren't the __rcu_sync_enter() and rcu_sync_exit() calls
inside that "if" statement?

That aside, would it make sense to name __rcu_sync_enter() something
like rcu_sync_begin_to_enter(), rcu_sync_pre_enter() or some such?
Something to make it clear that it just starts the job and that something
else is needed to finish it.

> To some degree, this is like get_state_synchronize_rcu/cond_synchronize_rcu.
> But obviously percpu_down_write() cannot use these helpers, and in this
> particular case __rcu_sync_enter() is better because it forces the start
> of a GP pass.

OK, I now understand the motivation, thank you for your patience!

> -----------------------------------------------------------------------
> As for cgroups, we want to switch cgroup_threadgroup_rwsem into the
> slow mode, at least for now.
> 
> We could add the additional hooks/hacks into rcu/sync.c but why? We can
> do this without any changes outside of cgroup.c right now, just add
> rcu_sync_enter() into cgroup_init().
> 
> But we do not want to add a pointless synchronize_sched() at boot time,
> __rcu_sync_enter() looks much better.

I do like that approach much better than an rcu_sync_sabotage().

I still have concern with the __rcu_sync_enter() name, and would prefer
something that clearly indicates that it starts the rcu_sync() entry
job, but does not synchronously complete it.

> ------------------------------------------------------------------------
> And even __cgroup_procs_write() could use __rcu_sync_enter(). But let's
> ignore this for now.

And here is an updated state table.  I do not yet separately call out
__rcu_sync_enter(), though without it the rcu_sync_exit() transition
out of state B cannot happen.

							Thanx, Paul

------------------------------------------------------------------------

o       "count" is the value of the rcu_sync structure's ->gp_count field.

o       "state" is the value of the rcu_sync structure's ->gp_state field.

o       "CB?" is "yes" if there is an RCU callback pending and "no" otherwise.



   | count | state     | CB? | next state
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 A |   0   | GP_IDLE   | no  | rcu_sync_enter() -> B (wait)
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> H
   |       |           |     | callback -> cannot happen
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 B |   1+  | GP_ENTER  | yes | rcu_sync_enter() -> B (wait)
   |       |           |     | last rcu_sync_exit() -> J
   |       |           |     | non-last rcu_sync_exit() -> B
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> C (ends _enter() wait)
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 C |   1+  | GP_PASSED | no  | rcu_sync_enter() -> C
   |       |           |     | last rcu_sync_exit() -> E
   |       |           |     | non-last rcu_sync_exit() -> C
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> cannot happen
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 D |   0   | GP_REPLAY | yes | rcu_sync_enter() -> F
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> I (wait ???)
   |       |           |     | callback -> E
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 E |   0   | GP_EXIT   | yes | rcu_sync_enter() -> G
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> I (wait)
   |       |           |     | callback -> A
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 F |   1+  | GP_REPLAY | yes | rcu_sync_enter() -> F
   |       |           |     | last rcu_sync_exit() -> D
   |       |           |     | non-last rcu_sync_exit() -> F
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> C
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 G |   1+  | GP_EXIT   | yes | rcu_sync_enter() -> G
   |       |           |     | last rcu_sync_exit() -> D
   |       |           |     | non-last rcu_sync_exit() -> F
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> C
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 H |   0   | GP_IDLE   | no  | rcu_sync_enter() -> illegal
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> cannot happen
   |       |           |     |
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 I |   0   | GP_EXIT   | yes | rcu_sync_enter() -> illegal
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> H (ends dtor() wait)
---+-------+-----------+-----+------------------------------------
   |       |           |     |
 J |   0   | GP_ENTER  | yes | rcu_sync_enter() -> B (wait)
   |       |           |     | rcu_sync_exit() -> illegal
   |       |           |     | rcu_sync_dtor() -> illegal
   |       |           |     | callback -> A
   |       |           |     |


Assumptions:

o	Initial state is A.

o	Final state is H.

o	There will never be enough unpaired rcu_sync_enter() calls to
	overflow ->gp_count.

o	All calls to rcu_sync_exit() must pair with a preceding call
	to rcu_sync_enter() by that same thread.

o	It is illegal to invoke rcu_sync_dtor() until after the caller
	has ensured that there will be no future calls to either
	rcu_sync_enter() or rcu_sync_exit().

o	It is illegal to invoke rcu_sync_dtor() while there are any
	unpaired calls to rcu_sync_enter().
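
For reference, here is roughly how the columns above map onto the
structure itself (a sketch using the names from this thread; the exact
layout does not matter, and ->cb_state is gone in the proposed
simplification):

	enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };

	struct rcu_sync {
		int			gp_state;	/* the "state" column */
		int			gp_count;	/* the "count" column */
		wait_queue_head_t	gp_wait;	/* writers sleeping in rcu_sync_enter() */

		struct rcu_head		cb_head;	/* the "CB?" column: the pending callback */
		enum rcu_sync_type	gp_type;	/* sched/bh/normal RCU */
	};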

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-20 20:58                         ` Paul E. McKenney
@ 2016-07-21 17:34                           ` Oleg Nesterov
  0 siblings, 0 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-21 17:34 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On 07/20, Paul E. McKenney wrote:
>
> On Wed, Jul 20, 2016 at 05:13:58PM +0200, Oleg Nesterov wrote:
>
> > rcu_sync_enter() or __rcu_sync_enter() is legal in any state, the latter
> > won't block.
>
> Actually, I had no idea that __rcu_sync_enter() was intended for anything
> other than internal use.
>
> Other than that, agreed, with the exception that it is illegal after
> rcu_sync_dtor() has been called.

Yes, sure. rcu_sync_dtor() "destroys" struct rcu_sync, and currently it
is only called by destroy_super_work() right before kfree(). Nothing is
legal after rcu_sync_dtor().

> > > > static void rcu_sync_call(struct rcu_sync *rsp)
> > > > {
> > > > 	// TODO: THIS IS SUBOPTIMAL. We want to call it directly
> > > > 	// if rcu_blocking_is_gp() == T, but it has might_sleep().
> > >
> > > Careful!  This optimization works only for RCU-sched and RCU-bh.
> > > With normal RCU, you can be tripped up by tasks preempted within RCU
> > > read-side critical sections should CONFIG_PREEMPT=y.
> >
> > Yes, thanks, I understand ;) another reason why I do not want to add
> > this optimization into the initial version.
>
> So I should take this as a request to export rcu_blocking_is_gp()?

Would be nice ;) Or we can do something else. Nevermind, this needs
another discussion.

> > > > void rcu_sync_dtor(struct rcu_sync *rsp)
> > > > {
> > > > 	int gp_state;
> > > >
> > > > 	BUG_ON(rsp->gp_count);
> > > > 	BUG_ON(rsp->gp_state == GP_PASSED);
> > > >
> > > > 	spin_lock_irq(&rsp->rss_lock);
> > > > 	if (rsp->gp_state == GP_REPLAY)
> > > > 		rsp->gp_state = GP_EXIT;
> > >
> > > OK, this ensures that the .wait() below will wait for the callback, but
> > > it might result in some RCU read-side critical sections still being in
> > > flight after rcu_sync_dtor() completes.
> >
> > Hmm. Obviously, the caller should prevent this somehow or it is simply
> > buggy. Or I misunderstood.
>
> Hard to say without knowing what the permitted use cases are...
>
> Me, I would make rcu_sync_dtor() wait the extra grace period in this case.
> It should be a low-probability race, and it reduces the _dtor-time
> state space.
>
> What it looks like you are saying is that the caller must not only ensure
> that there will never again be a __rcu_sync_enter(), rcu_sync_enter(),
> or rcu_sync_exit() (or, I suppose, rcu_sync_dtor()) for this rcu_sync
> structure,

Yes, and

> but must also ensure that any relevant RCU read-side critical
> sections have completed.

Ah, now I understand your concerns. Yes, yes, sure. The caller must
ensure that all RCU read-side critical sections which might look at this
rcu_sync via rcu_sync_is_idle() have completed.

Currently the only caller of dtor() is percpu_free_rwsem(). So if you do,
say,

	struct percpu_rw_semaphore *sem = kmalloc(...);

	...

	percpu_free_rwsem(sem);
	kfree(sem);

you obviously need to ensure that percpu_free_rwsem() can't be called
before all readers fully complete their percpu_down_read/percpu_up_read
critical sections; this includes the RCU read-side critical sections.

And this doesn't really differ from the plain rw_semaphore.


And can't resist... let me add another "TODO" note ;) we actually want
to improve it a bit (probably just a "bool wait" arg) and kill the ugly
super_block->destroy_work which currently does percpu_free_rwsem(). This
should be simple.

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-20 21:31                         ` Paul E. McKenney
@ 2016-07-21 17:34                           ` Oleg Nesterov
  2016-07-22  3:26                             ` Paul E. McKenney
  0 siblings, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-21 17:34 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On 07/20, Paul E. McKenney wrote:
>
> On Wed, Jul 20, 2016 at 07:16:03PM +0200, Oleg Nesterov wrote:
>
> > Now, suppose we add the additional enter/exit's:
> >
> > 	freeze_super(sb)
> > 	{
> > 		// this doesn't block
> > 		__rcu_sync_enter(SEM3);
> > 		__rcu_sync_enter(SEM2);
> > 		__rcu_sync_enter(SEM1);
> >
> > 		down_write(&sb->s_umount);
> > 		if (NEED_TO_FREEZE) {
> > 			percpu_down_write(SEM1);
>
> The above waits for the grace period initiated by __rcu_sync_enter(),
> correct?  Presumably "yes", because it will invoke rcu_sync_enter(), which
> will see the state as GP_ENTER, and will thus wait.

But if down_write() blocks and/or NEED_TO_FREEZE takes some time, it
could already see the GP_PASSED state, or at least it will sleep less.

> But your point is that if !NEED_TO_FREEZE, we will get here without
> waiting for a grace period.
>
> But why aren't the __rcu_sync_enter() and rcu_sync_exit() calls inside
> the "if" statement?

Yes, if we do __rcu_sync_enter() inside "if", then rcu_sync_exit() can't
hit GP_ENTER.

> But why should we disallow this use-case? It does not complicate the code
at all.

And see above, we want to initiate the GP "asap", so that we will sleep
less later. Although yes, freeze_super() is not the best example. And
__cgroup_procs_write() too, but note that cgroup_kn_lock_live() is rather
heavy, takes the global locks, and can fail. So (ignoring the fact we
are going to switch cgroup_threadgroup_rwsem into the slow mode for now)
__rcu_sync_enter() at the start could help to lessen the time
percpu_down_write(cgroup_threadgroup_rwsem) sleeps with the cgroup_mutex
held.
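
To make this concrete, the pattern would be roughly (sketch only,
signatures approximate, error handling omitted):

	__rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);  /* does not block, kicks off the GP */

	cgrp = cgroup_kn_lock_live(of->kn);	/* heavy, takes cgroup_mutex, can fail */
	if (cgrp) {
		percpu_down_write(&cgroup_threadgroup_rwsem);	/* hopefully sleeps less now */
		/* ... actually move the threads ... */
		percpu_up_write(&cgroup_threadgroup_rwsem);
		cgroup_kn_unlock(of->kn);
	}

	rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);	/* fine even if we never took the sem */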

> That aside, would it make sense to name __rcu_sync_enter() something
> like rcu_sync_begin_to_enter(), rcu_sync_pre_enter() or some such?
> Something to make it clear that it just starts the job and that something
> else is needed to finish it.

Sure. Agreed, will rename.

> And here is an updated state table.  I do not yet separately call out
> __rcu_sync_enter(), though without it the rcu_sync_exit() transition
> out of state B cannot happen.

Thanks! I'll try to double-check it.

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-21 17:34                           ` Oleg Nesterov
@ 2016-07-22  3:26                             ` Paul E. McKenney
  2016-07-25 17:01                               ` Oleg Nesterov
  0 siblings, 1 reply; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-22  3:26 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Thu, Jul 21, 2016 at 07:34:36PM +0200, Oleg Nesterov wrote:
> On 07/20, Paul E. McKenney wrote:
> >
> > On Wed, Jul 20, 2016 at 07:16:03PM +0200, Oleg Nesterov wrote:
> >
> > > Now, suppose we add the additional enter/exit's:
> > >
> > > 	freeze_super(sb)
> > > 	{
> > > 		// this doesn't block
> > > 		__rcu_sync_enter(SEM3);
> > > 		__rcu_sync_enter(SEM2);
> > > 		__rcu_sync_enter(SEM1);
> > >
> > > 		down_write(&sb->s_umount);
> > > 		if (NEED_TO_FREEZE) {
> > > 			percpu_down_write(SEM1);
> >
> > The above waits for the grace period initiated by __rcu_sync_enter(),
> > correct?  Presumably "yes", because it will invoke rcu_sync_enter(), which
> > will see the state as GP_ENTER, and will thus wait.
> 
> But if down_write() blocks and/or NEED_TO_FREEZE takes some time it
> could already see the GP_PASSED state, or at least it can sleep less.
> 
> > But your point is that if !NEED_TO_FREEZE, we will get here without
> > waiting for a grace period.
> >
> > But why aren't the __rcu_sync_enter() and rcu_sync_exit() calls inside
> > the "if" statement?
> 
> Yes, if we do __rcu_sync_enter() inside "if", then rcu_sync_exit() can't
> hit GP_ENTER.
> 
> But why we should disallow this use-case? It does not complicate the code
> at all.

I do agree that it doesn't complicate the current implementation.
But it relies on a global lock, so I am not at all confident that this
implementation is the final word.  I therefore tend to try to avoid
supporting more than is required.

And speaking of global locks, failing to discourage the pattern above
means that the code is unnecessarily acquiring three global locks,
which doesn't seem like a good thing to me.

> And see above, we want to initiate the GP "asap", so that we will sleep
> less later. Although yes, freeze_super() is not the best example. And
> __cgroup_procs_write() too, but note that cgroup_kn_lock_live() is rather
> heavy, takes the global locks, and can fail. So (ignoring the fact we
> are going to switch cgroup_threadgroup_rwsem into the slow mode for now)
> __rcu_sync_enter() at the start could help to lessen the time
> percpu_down_write(cgroup_threadgroup_rwsem) sleeps with the cgroup_mutex
> held.

I agree that there are use cases for beginning-of-time __rcu_sync_enter()
or whatever we end up naming it.

> > That aside, would it make sense to name __rcu_sync_enter() something
> > like rcu_sync_begin_to_enter(), rcu_sync_pre_enter() or some such?
> > Something to make it clear that it just starts the job and that something
> > else is needed to finish it.
> 
> Sure. Agreed, will rename.

Thank you!

> > And here is an updated state table.  I do not yet separately call out
> > __rcu_sync_enter(), though without it the rcu_sync_exit() transition
> > out of state B cannot happen.
> 
> Thanks! I'll try to double-check it.

And thank you again!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-22  3:26                             ` Paul E. McKenney
@ 2016-07-25 17:01                               ` Oleg Nesterov
  2016-07-25 17:05                                 ` John Stultz
  2016-07-25 17:49                                 ` Paul E. McKenney
  0 siblings, 2 replies; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-25 17:01 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

Paul, sorry for delay.

On 07/21, Paul E. McKenney wrote:
>
> On Thu, Jul 21, 2016 at 07:34:36PM +0200, Oleg Nesterov wrote:
> > On 07/20, Paul E. McKenney wrote:
> > >
> > > On Wed, Jul 20, 2016 at 07:16:03PM +0200, Oleg Nesterov wrote:
> > >
> > > > Now, suppose we add the additional enter/exit's:
> > > >
> > > > 	freeze_super(sb)
> > > > 	{
> > > > 		// this doesn't block
> > > > 		__rcu_sync_enter(SEM3);
> > > > 		__rcu_sync_enter(SEM2);
> > > > 		__rcu_sync_enter(SEM1);
> > > >
> > > > 		down_write(&sb->s_umount);
> > > > 		if (NEED_TO_FREEZE) {
> > > > 			percpu_down_write(SEM1);
> > >
> > > The above waits for the grace period initiated by __rcu_sync_enter(),
> > > correct?  Presumably "yes", because it will invoke rcu_sync_enter(), which
> > > will see the state as GP_ENTER, and will thus wait.
> >
> > But if down_write() blocks and/or NEED_TO_FREEZE takes some time it
> > could already see the GP_PASSED state, or at least it can sleep less.
> >
> > > But your point is that if !NEED_TO_FREEZE, we will get here without
> > > waiting for a grace period.
> > >
> > > But why aren't the __rcu_sync_enter() and rcu_sync_exit() calls inside
> > > the "if" statement?
> >
> > Yes, if we do __rcu_sync_enter() inside "if", then rcu_sync_exit() can't
> > hit GP_ENTER.
> >
> > But why we should disallow this use-case? It does not complicate the code
> > at all.
>
> I do agree that it doesn't complicate the current implementation.
> But it relies on a global lock, so I am not at all confident that this
> implementation is the final word.

Hmm. which global lock? Or did you mean freeze_super(), not rcu_sync?

> And speaking of global locks, failing to discourage the pattern above
> means that the code is unnecessarily acquiring three global locks,
> which doesn't seem like a good thing to me.

Well, I do not agree, but this wasn't written by me. Just in case, all these
locks above are not really global, they are per-sb, but this is minor.

And the patches which changed sb->s_writers to use percpu_rw_semaphore/rcu_sync
didn't change this logic.

Except that the old implementation was buggy, and the readers were slower than they are now.

> I agree that there are use cases for beginning-of-time __rcu_sync_enter()
> or whatever we end up naming it.

OK, at least IIUC you agree that cgroup_init() can use __rcu_sync_enter().
As for other potential use-cases, we will discuss this later. I will have
to CC you anyway ;)

So I'll send v2 with renames after I test it. Thanks again.

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-25 17:01                               ` Oleg Nesterov
@ 2016-07-25 17:05                                 ` John Stultz
  2016-07-25 17:26                                   ` Oleg Nesterov
  2016-07-25 17:49                                 ` Paul E. McKenney
  1 sibling, 1 reply; 38+ messages in thread
From: John Stultz @ 2016-07-25 17:05 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Paul E. McKenney, Peter Zijlstra, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On Mon, Jul 25, 2016 at 10:01 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> Paul, sorry for delay.
>
> On 07/21, Paul E. McKenney wrote:
>>
>> On Thu, Jul 21, 2016 at 07:34:36PM +0200, Oleg Nesterov wrote:
>> > On 07/20, Paul E. McKenney wrote:
>> > >
>> > > On Wed, Jul 20, 2016 at 07:16:03PM +0200, Oleg Nesterov wrote:
>> > >
>> > > > Now, suppose we add the additional enter/exit's:
>> > > >
>> > > >         freeze_super(sb)
>> > > >         {
>> > > >                 // this doesn't block
>> > > >                 __rcu_sync_enter(SEM3);
>> > > >                 __rcu_sync_enter(SEM2);
>> > > >                 __rcu_sync_enter(SEM1);
>> > > >
>> > > >                 down_write(&sb->s_umount);
>> > > >                 if (NEED_TO_FREEZE) {
>> > > >                         percpu_down_write(SEM1);
>> > >
>> > > The above waits for the grace period initiated by __rcu_sync_enter(),
>> > > correct?  Presumably "yes", because it will invoke rcu_sync_enter(), which
>> > > will see the state as GP_ENTER, and will thus wait.
>> >
>> > But if down_write() blocks and/or NEED_TO_FREEZE takes some time it
>> > could already see the GP_PASSED state, or at least it can sleep less.
>> >
>> > > But your point is that if !NEED_TO_FREEZE, we will get here without
>> > > waiting for a grace period.
>> > >
>> > > But why aren't the __rcu_sync_enter() and rcu_sync_exit() calls inside
>> > > the "if" statement?
>> >
>> > Yes, if we do __rcu_sync_enter() inside "if", then rcu_sync_exit() can't
>> > hit GP_ENTER.
>> >
>> > But why we should disallow this use-case? It does not complicate the code
>> > at all.
>>
>> I do agree that it doesn't complicate the current implementation.
>> But it relies on a global lock, so I am not at all confident that this
>> implementation is the final word.
>
> Hmm. which global lock? Or did you mean freeze_super(), not rcu_sync?
>
>> And speaking of global locks, failing to discourage the pattern above
>> means that the code is unnecessarily acquiring three global locks,
>> which doesn't seem like a good thing to me.
>
> Well, I do not agree, but this wasn't written by me. Just in case, all these
> locks above are not really global, they are per-sb, but this is minor.
>
> And the patches which changed sb->s_writers to use percpu_rw_semaphore/rcu_sync
> didn't change this logic.
>
> Except the old implementation was buggy, and the readers were slower than now.
>
>> I agree that there are use cases for beginning-of-time __rcu_sync_enter()
>> or whatever we end up naming it.
>
> OK, at least iiuc you agree that cgroup_init() can use __rcu_sync_enter().
> As for other potential use-cases, we will disccuss this later. I will have
> to CC you anyway ;)
>
> So I'll send v2 with renames after I test it. Thanks again.

Can you also make clear which patches of PeterZ's I should be adding
as well for testing?  I've lost the plot as to what goes with what...

thanks
-john

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-25 17:05                                 ` John Stultz
@ 2016-07-25 17:26                                   ` Oleg Nesterov
  2016-08-09  8:48                                     ` Peter Zijlstra
  0 siblings, 1 reply; 38+ messages in thread
From: Oleg Nesterov @ 2016-07-25 17:26 UTC (permalink / raw)
  To: John Stultz
  Cc: Paul E. McKenney, Peter Zijlstra, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On 07/25, John Stultz wrote:
>
> Can you also make clear which patches of PeterZ's I should be adding
> as well for testing?  I've lost the plot as to what goes with what..

Well, my understanding is that Peter is going to send v2 with some minor
changes, I am going to send this patch on top of his series.

In particular, 2/2 should set GP_PASSED rather than !GP_IDLE. It would be
nice to also rename rcu_sync_sabotage() to rcu_sync_enter_start(); this
will save another change in cgroup.c, but this is minor.
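
IOW, something as trivial as this sketch (the real helper would go into
rcu/sync.c together with v2):

	/* Force the rcu_sync into writer mode at init time, no GP needed. */
	void rcu_sync_enter_start(struct rcu_sync *rsp)
	{
		rsp->gp_count++;
		rsp->gp_state = GP_PASSED;
	}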

Oleg.

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-25 17:01                               ` Oleg Nesterov
  2016-07-25 17:05                                 ` John Stultz
@ 2016-07-25 17:49                                 ` Paul E. McKenney
  1 sibling, 0 replies; 38+ messages in thread
From: Paul E. McKenney @ 2016-07-25 17:49 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, mingo, linux-kernel, tj, john.stultz, dimitrysh,
	romlem, ccross, tkjos

On Mon, Jul 25, 2016 at 07:01:17PM +0200, Oleg Nesterov wrote:
> Paul, sorry for delay.
> 
> On 07/21, Paul E. McKenney wrote:
> >
> > On Thu, Jul 21, 2016 at 07:34:36PM +0200, Oleg Nesterov wrote:
> > > On 07/20, Paul E. McKenney wrote:
> > > >
> > > > On Wed, Jul 20, 2016 at 07:16:03PM +0200, Oleg Nesterov wrote:
> > > >
> > > > > Now, suppose we add the additional enter/exit's:
> > > > >
> > > > > 	freeze_super(sb)
> > > > > 	{
> > > > > 		// this doesn't block
> > > > > 		__rcu_sync_enter(SEM3);
> > > > > 		__rcu_sync_enter(SEM2);
> > > > > 		__rcu_sync_enter(SEM1);
> > > > >
> > > > > 		down_write(&sb->s_umount);
> > > > > 		if (NEED_TO_FREEZE) {
> > > > > 			percpu_down_write(SEM1);
> > > >
> > > > The above waits for the grace period initiated by __rcu_sync_enter(),
> > > > correct?  Presumably "yes", because it will invoke rcu_sync_enter(), which
> > > > will see the state as GP_ENTER, and will thus wait.
> > >
> > > But if down_write() blocks and/or NEED_TO_FREEZE takes some time it
> > > could already see the GP_PASSED state, or at least it can sleep less.
> > >
> > > > But your point is that if !NEED_TO_FREEZE, we will get here without
> > > > waiting for a grace period.
> > > >
> > > > But why aren't the __rcu_sync_enter() and rcu_sync_exit() calls inside
> > > > the "if" statement?
> > >
> > > Yes, if we do __rcu_sync_enter() inside "if", then rcu_sync_exit() can't
> > > hit GP_ENTER.
> > >
> > > But why we should disallow this use-case? It does not complicate the code
> > > at all.
> >
> > I do agree that it doesn't complicate the current implementation.
> > But it relies on a global lock, so I am not at all confident that this
> > implementation is the final word.
> 
> Hmm. which global lock? Or did you mean freeze_super(), not rcu_sync?

OK, you are right, they are per-superblock locks rather than being global
locks.  Still, given that workloads that hammer a single filesystem hard
are quite common, it might still eventually be of some concern.

> > And speaking of global locks, failing to discourage the pattern above
> > means that the code is unnecessarily acquiring three global locks,
> > which doesn't seem like a good thing to me.
> 
> Well, I do not agree, but this wasn't written by me. Just in case, all these
> locks above are not really global, they are per-sb, but this is minor.

Agreed.

> And the patches which changed sb->s_writers to use percpu_rw_semaphore/rcu_sync
> didn't change this logic.
> 
> Except the old implementation was buggy, and the readers were slower than now.
> 
> > I agree that there are use cases for beginning-of-time __rcu_sync_enter()
> > or whatever we end up naming it.
> 
> OK, at least iiuc you agree that cgroup_init() can use __rcu_sync_enter().
> As for other potential use-cases, we will disccuss this later. I will have
> to CC you anyway ;)
> 
> So I'll send v2 with renames after I test it. Thanks again.

Sounds good!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter()
  2016-07-25 17:26                                   ` Oleg Nesterov
@ 2016-08-09  8:48                                     ` Peter Zijlstra
  0 siblings, 0 replies; 38+ messages in thread
From: Peter Zijlstra @ 2016-08-09  8:48 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: John Stultz, Paul E. McKenney, Ingo Molnar, lkml, Tejun Heo,
	Dmitry Shmidt, Rom Lemarchand, Colin Cross, Todd Kjos

On Mon, Jul 25, 2016 at 07:26:26PM +0200, Oleg Nesterov wrote:
> On 07/25, John Stultz wrote:
> >
> > Can you also make clear which patches of PeterZ's I should be adding
> > as well for testing?  I've lost the plot as to what goes with what..
> 
> Well, my understanding is that Peter is going to send v2 with some minor
> changes, I am going to send this patch on top of his series.

OK, back from holidays and I finally got around to reading this again.

So let me send out the rwsem optimization patch v2 in a new thread and
let's take it from there.

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2016-08-09  8:48 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-14 18:25 [PATCH 0/2] locking/percpu-rwsem: Optimizations/tweaks Peter Zijlstra
2016-07-14 18:25 ` [PATCH 1/2] locking/percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
2016-07-15 16:30   ` Oleg Nesterov
2016-07-15 19:47     ` Peter Zijlstra
2016-07-18 18:23   ` kbuild test robot
2016-07-18 22:51   ` kbuild test robot
2016-07-14 18:25 ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Peter Zijlstra
2016-07-14 18:37   ` Peter Zijlstra
2016-07-14 18:43   ` Oleg Nesterov
2016-07-14 18:56     ` Peter Zijlstra
2016-07-14 19:20     ` Peter Zijlstra
2016-07-14 19:29       ` Paul E. McKenney
2016-07-14 19:38         ` Peter Zijlstra
2016-07-14 19:54           ` Paul E. McKenney
2016-07-15 13:27       ` Oleg Nesterov
2016-07-15 13:39         ` Paul E. McKenney
2016-07-15 13:45           ` Oleg Nesterov
2016-07-15 15:38             ` Paul E. McKenney
2016-07-15 16:49               ` Oleg Nesterov
2016-07-15 18:01                 ` Paul E. McKenney
2016-07-16 17:10                   ` [PATCH] rcu_sync: simplify the state machine, introduce __rcu_sync_enter() Oleg Nesterov
2016-07-16 18:40                     ` Oleg Nesterov
2016-07-18 11:54                     ` Peter Zijlstra
2016-07-18 13:44                       ` Oleg Nesterov
2016-07-19 20:50                     ` Paul E. McKenney
2016-07-20 15:13                       ` Oleg Nesterov
2016-07-20 20:58                         ` Paul E. McKenney
2016-07-21 17:34                           ` Oleg Nesterov
2016-07-20 17:16                       ` Oleg Nesterov
2016-07-20 21:31                         ` Paul E. McKenney
2016-07-21 17:34                           ` Oleg Nesterov
2016-07-22  3:26                             ` Paul E. McKenney
2016-07-25 17:01                               ` Oleg Nesterov
2016-07-25 17:05                                 ` John Stultz
2016-07-25 17:26                                   ` Oleg Nesterov
2016-08-09  8:48                                     ` Peter Zijlstra
2016-07-25 17:49                                 ` Paul E. McKenney
2016-07-15 13:42       ` [PATCH 2/2] locking/percpu-rwsem: Introduce bias knob Oleg Nesterov
